<a href="https://colab.research.google.com/github/DonghakPark/DataAnalysis/blob/new_drug/my_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Colab Setting


In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

import os
os.chdir('/content/gdrive/MyDrive/Colab Notebooks/2nd_New_Drug_Development_AI_Competition')

Mounted at /content/gdrive


In [2]:
!pip install rdkit
!pip install chemprop

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


### Import LB

In [3]:
import pandas as pd
import numpy as np
import os, random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import chemprop

### Function def and Config

In [4]:

CFG = {
    'NBITS':4096,
    'SEED':42,
}
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED']) # Seed 고정

# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))

### Load Data

In [5]:
#Load Data

# 학습 ChEMBL 데이터 로드
chembl_data = pd.read_csv('./Data/train.csv')  # 예시 파일 이름
chembl_data.head()

Unnamed: 0,Molecule ChEMBL ID,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Assay ChEMBL ID,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,IC50_nM,pIC50,Smiles
0,CHEMBL4443947,IC50,'=',0.022,nM,10.66,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.022,10.66,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...
1,CHEMBL4556091,IC50,'=',0.026,nM,10.59,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.026,10.59,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
2,CHEMBL4566431,IC50,'=',0.078,nM,10.11,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.078,10.11,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
3,CHEMBL4545898,IC50,'=',0.081,nM,10.09,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.081,10.09,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
4,CHEMBL4448950,IC50,'=',0.099,nM,10.0,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.099,10.0,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...


In [6]:
#Model Train

train = chembl_data[['Smiles','pIC50']]
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)

more_data = pd.read_csv("./Data/new_data_pic50.csv")
more = more_data[['Smiles','pIC50']]
more['Fingerprint'] = more['Smiles'].apply(smiles_to_fingerprint)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  more['Fingerprint'] = more['Smiles'].apply(smiles_to_fingerprint)


In [7]:
train_all = pd.concat([train, more],  axis=0, ignore_index=True)
train_all

Unnamed: 0,Smiles,pIC50,Fingerprint
0,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...,10.66,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,10.59,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
2,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,10.11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,10.09,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
4,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...,10.00,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
3811,Cn1cc(NC(=O)c2cnn3ccc(N[C@@H]4CCCC[C@@H]4N)nc2...,9.70,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3812,Cn1cc(NC(=O)c2cnn3ccc(N[C@@H]4CCCCNC4)nc23)c(C...,9.70,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3813,CC[C@H]1[C@@H](COc2nccc3cc(C(N)=O)c(OC)cc23)NC...,10.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3814,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...,10.00,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
train_all.drop_duplicates(subset='Smiles')
train_all

Unnamed: 0,Smiles,pIC50,Fingerprint
0,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...,10.66,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,10.59,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
2,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,10.11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,10.09,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
4,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...,10.00,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
3811,Cn1cc(NC(=O)c2cnn3ccc(N[C@@H]4CCCC[C@@H]4N)nc2...,9.70,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3812,Cn1cc(NC(=O)c2cnn3ccc(N[C@@H]4CCCCNC4)nc23)c(C...,9.70,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3813,CC[C@H]1[C@@H](COc2nccc3cc(C(N)=O)c(OC)cc23)NC...,10.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3814,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...,10.00,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
train_x =  np.stack(train_all['Fingerprint'].values)
train_y = train_all['pIC50'].values
train_x

array([[1, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0]])

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 학습 및 검증 데이터 분리
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

# 랜덤 포레스트 모델 학습
model = RandomForestRegressor()
# 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [100],  # 트리의 개수
    'max_depth': [None],  # 트리의 최대 깊이
    'min_samples_split': [5],  # 노드를 분할하기 위한 최소 샘플 수
    # 'min_samples_leaf': [2, 4],    # 리프 노드가 되기 위한 최소 샘플 수
}
# GridSearchCV 객체 생성
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

# 그리드 서치 수행
grid_search.fit(train_x, train_y)
# 최적의 하이퍼파라미터 출력
print("Best Parameters:", grid_search.best_params_)

# 최적의 모델로 예측 및 평가
best_rf = grid_search.best_estimator_
val_y_pred = best_rf.predict(val_x)
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')

# Validation 데이터로부터의 학습 모델 평가
# model.fit(train_x, train_y)
# val_y_pred = model.predict(val_x)

# mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
# rmse = np.sqrt(mse)

# print(f'RMSE: {rmse}')


Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
RMSE: 1363.415616163378


In [None]:
checkpoint = "./check"
arguments = [
    '--data_path', '/content/gdrive/MyDrive/Colab Notebooks/data/train.csv',
    '--dataset_type', 'regression',
    '--save_dir', checkpoint,
    '--epochs', '10',
    '--num_folds', '2',
    '--ensemble_size', '1',
    '--smiles_columns', 'SMILES',
    '--target_columns', 'MLM', 'HLM'
]

args = chemprop.args.TrainArgs().parse_args(arguments)
mean_score, std_score = chemprop.train.cross_validate(args=args, train_func=chemprop.train.run_training)

arguments = [
    '--test_path', '/content/gdrive/MyDrive/Colab Notebooks/data/test.csv',
    '--preds_path', '/content/gdrive/MyDrive/Colab Notebooks/result/myresult.csv',
    '--checkpoint_dir', checkpoint,
    '--smiles_columns', 'SMILES'
]

args = chemprop.args.PredictArgs().parse_args(arguments)
preds = chemprop.train.make_predictions(args=args)


In [None]:

test = pd.read_csv('./Data/test.csv')
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)

fingerprint_vectors  = test['Fingerprint'].apply(lambda x: np.array([int(bit) for bit in x]))
fingerprint_df = pd.DataFrame(fingerprint_vectors.tolist())
test_x = fingerprint_df
test_x.columns = test_x.columns.astype(str)

test_y_pred = model.predict(test_x)

In [None]:
from datetime import datetime
now = datetime.now()
formatted_now = str(now.strftime("%Y-%m-%d-%H-%M"))


submit = pd.read_csv('./Data/sample_submission.csv')
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.head()

submit.to_csv('./Data/submit'+formatted_now+'.csv', index=False)