In [1]:
import rdkit
import rdkit.Chem as Chem
import rdkit.Chem.rdMolDescriptors
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt, CalcNumLipinskiHBA, CalcNumLipinskiHBD, CalcCrippenDescriptors

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt


import numpy as np

import sklearn
import sklearn.model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score




In [2]:
drugs = []

with open("drugs.smi") as fp:
    for l in fp:
        smiles = l.strip()
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            drugs.append(mol)

In [3]:
#딕셔너리 만들기
property = {"MW": [],"HBA":[], "HBD":[], "logP":[],"is_drug":[]}

for mol in drugs:
    mw = CalcExactMolWt(mol)
    property["MW"].append(mw)
    
    hba = CalcNumLipinskiHBA(mol)
    property["HBA"].append(hba)
    
    hbd = CalcNumLipinskiHBD(mol)
    property["HBD"].append(hbd)
    
    logp, mr = CalcCrippenDescriptors(mol)
    property["logP"].append(logp)
    
    property["is_drug"].append(1)
    

In [4]:
drug_df = pd.DataFrame(property)

In [5]:
non_drugs = []

with open("non_drugs.smi") as fp:
    for l in fp:
        smiles = l.strip()
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            non_drugs.append(mol)

In [6]:
#딕셔너리 만들기
property = {"MW": [],"HBA":[], "HBD":[], "logP":[],"is_drug":[]}

for mol in non_drugs:
    mw = CalcExactMolWt(mol)
    property["MW"].append(mw)
    
    hba = CalcNumLipinskiHBA(mol)
    property["HBA"].append(hba)
    
    hbd = CalcNumLipinskiHBD(mol)
    property["HBD"].append(hbd)
    
    logp, mr = CalcCrippenDescriptors(mol)
    property["logP"].append(logp)
    
    property["is_drug"].append(0)
    

In [7]:
non_drug_df = pd.DataFrame(property)

In [8]:
new_df = pd.concat([drug_df, non_drug_df])

In [9]:
X = new_df.iloc[:,:-1]

In [10]:
y = new_df.iloc[:, -1]

In [11]:
X_train,X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
my_model = RandomForestClassifier()

In [13]:
my_model.fit(X_train, y_train)

RandomForestClassifier()

In [14]:
y_pred = my_model.predict(X_test)

In [15]:
precision_score(y_test, y_pred)

0.5494505494505495

In [16]:
recall_score(y_test, y_pred)

0.3875968992248062

In [17]:
f1_score(y_test, y_pred)

0.4545454545454546

In [18]:
# ============ 여기까지 저번주꺼 ==========

In [19]:
#테스트 해볼 파라미터

# 1. n_estimator
# 2. max_depth
# 3. min_sample_split
# 4. min_sample_leaf

In [38]:
my_model_v2 = RandomForestClassifier(n_estimators =200) # n_esti기본값은100

In [39]:
my_model_v2.fit(X_train, y_train) #학습

RandomForestClassifier(n_estimators=200)

In [40]:
y_pred = my_model_v2.predict(X_test) #문제풀이

In [41]:
precision_score(y_test, y_pred)

0.5913978494623656

In [42]:
recall_score(y_test, y_pred)

0.4263565891472868

In [43]:
f1_score(y_test, y_pred)

0.4954954954954955

In [44]:
my_model_v3 = RandomForestClassifier(max_depth = 5) #교수님꺼 해보기

In [45]:
my_model_v3.fit(X_train, y_train) 

RandomForestClassifier(max_depth=5)

In [46]:
y_pred = my_model_v3.predict(X_test) #문제풀이

In [47]:
precision_score(y_test, y_pred)

0.7692307692307693

In [48]:
recall_score(y_test, y_pred)

0.15503875968992248

In [49]:
f1_score(y_test, y_pred)

0.25806451612903225

In [50]:
my_model_v4 = RandomForestClassifier(n_estimators = 400)

In [51]:
my_model_v4.fit(X_train, y_train) 

RandomForestClassifier(n_estimators=400)

In [52]:
y_pred = my_model_v4.predict(X_test) #문제풀이

In [53]:
precision_score(y_test, y_pred)

0.574468085106383

In [54]:
recall_score(y_test, y_pred)

0.4186046511627907

In [56]:
f1_score(y_test, y_pred)

0.484304932735426

In [57]:
## Feature importance

In [58]:
# 현재 이 노트북에서 사용하고 있는 Feature는 4가지:
# 1. mw
# 2. h-b-a
# 3. h-b-d
# 4. logP
