In [1]:
import rdkit
import rdkit.Chem as Chem
import rdkit.Chem.rdMolDescriptors
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt, CalcCrippenDescriptors, CalcNumLipinskiHBA, CalcNumLipinskiHBD, CalcFractionCSP3, CalcNumRotatableBonds, CalcNumRings, CalcTPSA, CalcNumAromaticRings


import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt

import numpy as np

import sklearn
import sklearn.model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error
from sklearn import svm 
from sklearn import preprocessing


In [2]:
drugs = pd.read_csv("drugs.csv")
non_drugs = pd.read_csv("non_drugs.csv")

In [3]:
mw = []
hba = []
hbd = []
logp = []
 
for smi in drugs["smiles"]:
    m = Chem.MolFromSmiles(smi)
    mw.append(CalcExactMolWt(m))
    logp.append(CalcCrippenDescriptors(m)[0]) # because calccrippendescriptors returns two values: logp, mr
    hba.append(CalcNumLipinskiHBA(m))
    hbd.append(CalcNumLipinskiHBD(m))

In [4]:
drugs["MW"] = mw 
drugs["HBA"] = hba
drugs["HBD"] = hbd
drugs["logp"] = logp

In [5]:
mw = []
hba = []
hbd = []
logp = []
 
for smi in non_drugs["smiles"]:
    m = Chem.MolFromSmiles(smi)
    mw.append(CalcExactMolWt(m))
    logp.append(CalcCrippenDescriptors(m)[0])
    hba.append(CalcNumLipinskiHBA(m))
    hbd.append(CalcNumLipinskiHBD(m))

In [6]:
non_drugs["MW"] = mw
non_drugs["HBA"] = hba
non_drugs["HBD"] = hbd
non_drugs["logp"] = logp

In [7]:
all_data = pd.concat([drugs, non_drugs], ignore_index=True)

In [8]:
# 마크다운 사용법: https://gist.github.com/ihoneymon/652be052a0727ad59601

# SVM

In [9]:
my_model = svm.SVC()

In [10]:
X = all_data.iloc[:,2:]

In [11]:
# y = all_data.iloc[:,1]

In [12]:
y = all_data["is_drug"]

In [13]:
# X를 표준화 하여야한다. 절대값의 차이가 너무 크면 잘 작동하지않는다
# 표준점수  = 원점수-평균 /표준편차

min_max_scaler = preprocessing.StandardScaler() #정규분표로 바꾸는거
X_scaled = min_max_scaler.fit_transform(X)

In [14]:
X_scaled_df = pd.DataFrame(X_scaled)

In [15]:
# X의 값을 평균0, 표준편차 1로 가지는 값으로 표준화 되었다.

In [16]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X_scaled_df, y, test_size = 0.2)

In [17]:
my_model.fit(X_train, y_train)

SVC()

In [18]:
y_pred = my_model.predict(X_test)

In [19]:
precision_score(y_test, y_pred) 

# 정확도
#약이라고 한것중에 밑에만큼이 맞았다

0.625

In [20]:
recall_score(y_test, y_pred)

# 실제 약중에서 약이라고 예측한것은 밑에 만큼이다.

0.14018691588785046

In [21]:
f1_score(y_test, y_pred)

# pre와 fi의 조화평균

0.22900763358778625

## ======1203======

In [22]:
# 파라미터 바꾸기

# 대부분 C값과 감마값을 바꾼다 

# default C = 1.0


In [23]:
# C값을 키우면 에러를 최소화하고 간격을 좁힌다.(교수님꺼 참고)

In [24]:
my_model_v2 = svm.SVC(C=10)

In [25]:
my_model_v2.fit(X_train, y_train)

SVC(C=10)

In [26]:
y_pred_v2 = my_model_v2.predict(X_test)

In [27]:
f1_score(y_test, y_pred_v2)

0.2647058823529412

In [28]:
precision_score(y_test, y_pred_v2)

0.6206896551724138

In [29]:
recall_score(y_test, y_pred_v2)

0.16822429906542055

In [30]:
my_model_v3 = svm.SVC(C=0.1)

In [31]:
my_model_v3.fit(X_train, y_train)

SVC(C=0.1)

In [32]:
y_pred_v3 = my_model_v3.predict(X_test)

In [33]:
y_pred_v3

# 학습이 되지 않았다!

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [34]:
#f1_score(y_test, y_pred_v3)

In [35]:
#precision_score(y_test, y_pred_v3)

In [36]:
#recall_score(y_test, y_pred_v3)

In [37]:
my_model_v4 = svm.SVC(C=100)

In [38]:
my_model_v4.fit(X_train, y_train)

SVC(C=100)

In [39]:
y_pred_v4 = my_model_v4.predict(X_test)

In [40]:
f1_score(y_test, y_pred_v4)

0.4155844155844156

In [41]:
precision_score(y_test, y_pred_v4)

0.6808510638297872

In [42]:
recall_score(y_test, y_pred_v4)

0.29906542056074764

In [43]:
# C값을 1000, 10000으로 하면 어떻게될까? 한번 해보세요 ㅋㅋㅋ

In [44]:
# gamma 값바꾸기

# 데이터가 주변에 얼마나 영향을 미치는지 결정

# 감마값이 크면 각 데이터는 그 주변에만 영향미침.
# 감마값이 작으면 넓은 영역에서 영향을 미치게된다.

In [45]:
my_model5 = svm.SVC(gamma = 10.0)

In [46]:
my_model5.fit(X_train, y_train)
y_pred = my_model5.predict(X_test)

In [47]:
f1_score(y_test, y_pred)

0.39263803680981585

In [48]:
precision_score(y_test, y_pred)

0.5714285714285714

In [49]:
recall_score(y_test, y_pred)

0.29906542056074764

In [50]:
my_model6 = svm.SVC(C=1, gamma = 0.1)
my_model6.fit(X_train, y_train)
y_pred = my_model6.predict(X_test)
f1_score(y_test, y_pred)

0.13793103448275862

In [51]:
my_model6 = svm.SVC(C=100, gamma=0.1)
my_model6.fit(X_train, y_train)
y_pred = my_model6.predict(X_test)
f1_score(y_test, y_pred)

0.2627737226277372

In [52]:
my_model6 = svm.SVC(C=100, gamma=10)
my_model6.fit(X_train, y_train)
y_pred = my_model6.predict(X_test)
f1_score(y_test, y_pred)

0.4200913242009132

In [53]:
# 최적의 모델을 찾기위해서는 다양한 하이퍼 파라미터의 조합을 테스트 해볼필요가있다.

In [54]:
#다른 커널 사용

# my_model_linear = svm.SVC(kernel='linear')
# my_model_linear.fit(X_train, y_train)
# y_pred_linear = my_model_linear.predict(X_test)

# 잘안나옴

In [59]:
max_f1 = 0.0 
for c in [0.1, 1, 2, 5, 10, 100, 400, 500, 600, 1000]: # test할 다양한 C 값
    for g in [0.001, 0.01, 0.1, 0.5, 1.0, 2, 10, 50, 100]:
        model = svm.SVC(C=c,gamma=g)
        model.fit(X_train, y_train) # 학습
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        if f1 > max_f1: # 기존의 최고의 F1 값을 넘었을 때. 
            max_f1 = f1
            max_prec = prec
            max_recall = recall
            max_c = c
            max_g = g
        print("now C = ",c,",","gamma = ",g)
print("--End of Calculation!--")        
print("C = ",max_c,",","gamma = ",max_g)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


now C =  0.1 , gamma =  0.001
now C =  0.1 , gamma =  0.01
now C =  0.1 , gamma =  0.1
now C =  0.1 , gamma =  0.5
now C =  0.1 , gamma =  1.0
now C =  0.1 , gamma =  2


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


now C =  0.1 , gamma =  10
now C =  0.1 , gamma =  50
now C =  0.1 , gamma =  100
now C =  1 , gamma =  0.001
now C =  1 , gamma =  0.01
now C =  1 , gamma =  0.1
now C =  1 , gamma =  0.5
now C =  1 , gamma =  1.0
now C =  1 , gamma =  2
now C =  1 , gamma =  10
now C =  1 , gamma =  50
now C =  1 , gamma =  100
now C =  2 , gamma =  0.001
now C =  2 , gamma =  0.01
now C =  2 , gamma =  0.1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


now C =  2 , gamma =  0.5
now C =  2 , gamma =  1.0
now C =  2 , gamma =  2
now C =  2 , gamma =  10
now C =  2 , gamma =  50
now C =  2 , gamma =  100
now C =  5 , gamma =  0.001
now C =  5 , gamma =  0.01
now C =  5 , gamma =  0.1


  _warn_prf(average, modifier, msg_start, len(result))


now C =  5 , gamma =  0.5
now C =  5 , gamma =  1.0
now C =  5 , gamma =  2
now C =  5 , gamma =  10
now C =  5 , gamma =  50
now C =  5 , gamma =  100
now C =  10 , gamma =  0.001
now C =  10 , gamma =  0.01
now C =  10 , gamma =  0.1
now C =  10 , gamma =  0.5


  _warn_prf(average, modifier, msg_start, len(result))


now C =  10 , gamma =  1.0
now C =  10 , gamma =  2
now C =  10 , gamma =  10
now C =  10 , gamma =  50
now C =  10 , gamma =  100
now C =  100 , gamma =  0.001
now C =  100 , gamma =  0.01
now C =  100 , gamma =  0.1
now C =  100 , gamma =  0.5
now C =  100 , gamma =  1.0
now C =  100 , gamma =  2
now C =  100 , gamma =  10
now C =  100 , gamma =  50
now C =  100 , gamma =  100
now C =  400 , gamma =  0.001
now C =  400 , gamma =  0.01
now C =  400 , gamma =  0.1
now C =  400 , gamma =  0.5
now C =  400 , gamma =  1.0
now C =  400 , gamma =  2
now C =  400 , gamma =  10
now C =  400 , gamma =  50
now C =  400 , gamma =  100
now C =  500 , gamma =  0.001
now C =  500 , gamma =  0.01
now C =  500 , gamma =  0.1
now C =  500 , gamma =  0.5
now C =  500 , gamma =  1.0
now C =  500 , gamma =  2
now C =  500 , gamma =  10
now C =  500 , gamma =  50
now C =  500 , gamma =  100
now C =  600 , gamma =  0.001
now C =  600 , gamma =  0.01
now C =  600 , gamma =  0.1
now C =  600 , gamma =  0.5
n