## data preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the training data; the bare name on the last line renders the frame as the cell output.
df_train = pd.read_csv('./data/train.csv')
df_train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.259,117.37
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.172,73.47
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.475,92.60
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.337,42.43
...,...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl,1.556,3.079,3.409,396.195,3,1,5,3.409,64.74
3494,TRAIN_3494,CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...,35.560,47.630,1.912,359.381,4,1,3,1.844,77.37
3495,TRAIN_3495,CCOC(=O)CCCc1nc2cc(N)ccc2n1C,56.150,1.790,1.941,261.320,3,1,6,2.124,70.14
3496,TRAIN_3496,Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl,0.030,2.770,0.989,284.696,5,1,5,0.989,91.51


In [3]:
# Load the test data; the bare name on the last line renders the frame as the cell output.
df_test = pd.read_csv('./data/test.csv')
df_test

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.290,92.86
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15
...,...,...,...,...,...,...,...,...,...
478,TEST_478,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,4.207,306.443,2,1,7,4.207,55.13
479,TEST_479,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,-0.608,335.398,5,0,1,-1.736,70.16
480,TEST_480,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,1.792,349.383,3,1,3,1.792,69.72
481,TEST_481,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,0.790,341.132,3,2,2,0.423,69.64


In [4]:
# Load the sample submission; its MLM/HLM columns are overwritten with predictions in the export cell.
df_sample = pd.read_csv('./data/sample_submission.csv')
df_sample

Unnamed: 0,id,MLM,HLM
0,TEST_000,0,0
1,TEST_001,0,0
2,TEST_002,0,0
3,TEST_003,0,0
4,TEST_004,0,0
...,...,...,...
478,TEST_478,0,0
479,TEST_479,0,0
480,TEST_480,0,0
481,TEST_481,0,0


In [5]:
# Put the two targets first, followed by the features, and discard 'id' from both splits.
new_column_order = [
    'id', 'MLM', 'HLM', 'SMILES', 'AlogP', 'Molecular_Weight',
    'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
    'Molecular_PolarSurfaceArea',
]

df_train = df_train.loc[:, new_column_order].drop(columns='id')
df_test = df_test.drop(columns=['id'])

print(f'train size : {df_train.shape}')
df_train.head()

train size : (3498, 10)


Unnamed: 0,MLM,HLM,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,26.01,50.68,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,3.259,400.495,5,2,8,3.259,117.37
1,29.27,50.59,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,2.169,301.407,2,1,2,2.172,73.47
2,5.586,80.892,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,1.593,297.358,5,0,3,1.585,62.45
3,5.71,2.0,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,4.771,494.652,6,0,5,3.475,92.6
4,93.27,99.99,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,2.335,268.31,3,0,1,2.337,42.43


In [6]:
# Training rows whose inputs (every column from SMILES onward) are identical may still
# carry different targets; collapse each such group to one row per SMILES by averaging.
feature_columns = list(df_train.columns[2:])
duplicated_rows = df_train[df_train.duplicated(subset=feature_columns, keep=False)]
sorted_duplicates = duplicated_rows.sort_values(by='SMILES', ascending=False)
average_df = sorted_duplicates.groupby('SMILES', as_index=False).mean()

# The groupby puts SMILES first; restore the targets-first column order.
new_column_order = [
    'MLM', 'HLM', 'SMILES', 'AlogP', 'Molecular_Weight',
    'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
    'Molecular_PolarSurfaceArea',
]

average_df = average_df.loc[:, new_column_order]
average_df

Unnamed: 0,MLM,HLM,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,0.9225,28.0615,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,3.556,262.309,3.0,0.0,4.0,3.556,43.6
1,62.2175,77.911,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,2.172,337.372,4.0,2.0,3.0,2.169,82.0
2,68.631,64.669,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,2.293,367.428,5.0,2.0,3.0,2.307,139.85
3,32.1435,85.685,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,1.684,381.45,7.0,1.0,4.0,1.684,126.52
4,3.687,43.3385,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,2.843,360.49,3.0,0.0,4.0,2.843,49.85
5,62.1085,68.1015,CC(C)NC(=O)c1c(Cl)nn(C)c1NC(=O)c1cc(Br)nn1-c1n...,3.75,501.165,5.0,2.0,5.0,3.75,106.72
6,43.17,31.13,CC1CC(=O)N(c2ccc(-c3cccc(C#N)c3)cc2)N=C1c1ccc(...,4.449,381.427,4.0,1.0,3.0,4.446,76.69
7,2.3395,36.8145,CCCCC/N=c1\n(C)c(=O)nc2sccn12,2.484,252.336,4.0,0.0,4.0,2.474,73.56
8,3.442,3.6015,CCOC(=O)CC1(NC(=O)N2Cc3c(sc4c3CCCC4)-n3cccc3C2...,6.727,537.736,3.0,1.0,6.0,6.727,120.05
9,73.545,1.1345,CCc1nc2cc(Br)c(C(=O)OC)nc2n1CC(=O)c1ccccc1,3.815,402.242,5.0,0.0,6.0,3.819,74.08


In [7]:
# Training rows whose input values occur exactly once.
is_duplicated_input = df_train.iloc[:, 2:].duplicated(keep=False)
unique_df = df_train.loc[~is_duplicated_input]

# Stack the unique rows on top of the averaged duplicates and renumber the index from 0.
df_train = pd.concat([unique_df, average_df], ignore_index=True)
df_train

Unnamed: 0,MLM,HLM,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,26.0100,50.680,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,3.259,400.495,5.0,2.0,8.0,3.259,117.37
1,29.2700,50.590,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,2.169,301.407,2.0,1.0,2.0,2.172,73.47
2,5.5860,80.892,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,1.593,297.358,5.0,0.0,3.0,1.585,62.45
3,5.7100,2.000,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,4.771,494.652,6.0,0.0,5.0,3.475,92.60
4,93.2700,99.990,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,2.335,268.310,3.0,0.0,1.0,2.337,42.43
...,...,...,...,...,...,...,...,...,...,...
3466,56.0435,66.126,Cc1ccc(-c2ccc(C(CN3CCCC3)N(C)C(=O)CN3C(=O)COc4...,4.282,553.480,5.0,0.0,7.0,4.009,65.98
3467,56.3130,63.732,Cc1ccccc1-c1nc2ccccc2cc1C(C)n1c(=O)[nH]c2c(S(C...,4.304,459.520,6.0,1.0,4.0,4.304,113.53
3468,70.2150,93.840,N#Cc1nccnc1OC1CCN(C(=O)C2CC(=O)N(C3CCOCC3)C2)C1,-1.133,385.417,7.0,0.0,4.0,-1.133,108.65
3469,56.6085,49.519,N#Cc1nccnc1OC1CCN(C(=O)N2CCNC2=O)C1,-0.533,302.289,6.0,1.0,2.0,-0.533,111.44


In [8]:
# Shape and first rows of the test frame.
print(f'test size : {df_test.shape}')
df_test.head()

test size : (483, 8)


Unnamed: 0,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76
1,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31
2,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.29,92.86
3,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21
4,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15


### merge train and test data for feature engineering

In [9]:
# Keep an independent copy of the two regression targets (MLM, HLM) in their own frame.
df_train_target = df_train[['MLM','HLM']].copy()
df_train_target

Unnamed: 0,MLM,HLM
0,26.0100,50.680
1,29.2700,50.590
2,5.5860,80.892
3,5.7100,2.000
4,93.2700,99.990
...,...,...
3466,56.0435,66.126
3467,56.3130,63.732
3468,70.2150,93.840
3469,56.6085,49.519


In [10]:
# Remove the targets from the training frame, then stack train on top of test (index
# renumbered from 0) so that feature engineering is applied once to both splits.
df_train = df_train.drop(columns=['MLM', 'HLM'])
df_total = pd.concat([df_train, df_test], ignore_index=True)
df_total.head()

Unnamed: 0,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,3.259,400.495,5.0,2.0,8.0,3.259,117.37
1,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,2.169,301.407,2.0,1.0,2.0,2.172,73.47
2,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,1.593,297.358,5.0,0.0,3.0,1.585,62.45
3,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,4.771,494.652,6.0,0.0,5.0,3.475,92.6
4,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,2.335,268.31,3.0,0.0,1.0,2.337,42.43


In [11]:
# Handling missing AlogP values: first inspect the rows where AlogP is NaN.
df_total[df_total.AlogP.isna()]

Unnamed: 0,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
2752,[H][C@]1(CC[C@@]2([H])[C@@H](C)C=CC3=C[C@H](C)...,,418.566,5.0,1.0,7.0,4.634,72.83
3334,COc1cc2c(cc1OC)/C(=N\c1ccccc1)N(Cc1ccccc1F)CC2,,390.45,3.0,0.0,5.0,4.911,34.06
3481,[H]C1(C(O)c2ccc3c(c2)OCO3)C(=O)Oc2cc(OC)ccc2C1...,,404.412,6.0,1.0,4.0,3.942,74.22


In [12]:
# Replace missing AlogP values with the same row's LogD (original author's note: corr = 0.96).

# Record which rows are missing *before* filling, so the affected rows can be displayed
# afterwards without hard-coding their index labels.
missing_alogp = df_total['AlogP'].isna()

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# df_total['AlogP'] acts on an intermediate Series, which does not update df_total
# when pandas copy-on-write is active.
df_total['AlogP'] = df_total['AlogP'].fillna(df_total['LogD'])

# Rows that were just filled (empty if this cell is re-run after the fill has happened).
missing_df = df_total.loc[missing_alogp]
missing_df

Unnamed: 0,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
2752,[H][C@]1(CC[C@@]2([H])[C@@H](C)C=CC3=C[C@H](C)...,4.634,418.566,5.0,1.0,7.0,4.634,72.83
3334,COc1cc2c(cc1OC)/C(=N\c1ccccc1)N(Cc1ccccc1F)CC2,4.911,390.45,3.0,0.0,5.0,4.911,34.06
3481,[H]C1(C(O)c2ccc3c(c2)OCO3)C(=O)Oc2cc(OC)ccc2C1...,3.942,404.412,6.0,1.0,4.0,3.942,74.22


In [13]:
df_total.isna().sum()

SMILES                        0
AlogP                         0
Molecular_Weight              0
Num_H_Acceptors               0
Num_H_Donors                  0
Num_RotatableBonds            0
LogD                          0
Molecular_PolarSurfaceArea    0
dtype: int64

In [14]:
def resumetable(df):
    """Print the frame's shape and return a one-row-per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        Columns: 'feature' (column name), 'data type', 'num of missing value',
        'num of unique value', and 'First value' / 'Second value' / 'Third value'
        holding the values of the first three rows of ``df``.
    """
    print(f'dataset size: {df.shape}')
    summary = pd.DataFrame(df.dtypes, columns=['data type'])
    summary = summary.reset_index()
    summary = summary.rename(columns={'index': 'feature'})
    summary['num of missing value'] = df.isnull().sum().values
    summary['num of unique value'] = df.nunique().values

    # Read the sample rows by position, not by label: label lookups (df.loc[0]) fail
    # when the index does not contain 0, 1 and 2, e.g. after filtering without a reset.
    sample_labels = ['First value', 'Second value', 'Third value']
    for position, label in enumerate(sample_labels):
        if position < len(df):
            summary[label] = df.iloc[position].values
        else:
            # Frame has fewer than three rows: keep the column, leave it empty.
            summary[label] = np.nan
    return summary

resumetable(df_total)

dataset size: (3954, 8)


Unnamed: 0,feature,data type,num of missing value,num of unique value,First value,Second value,Third value
0,SMILES,object,0,3954,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1
1,AlogP,float64,0,2837,3.259,2.169,1.593
2,Molecular_Weight,float64,0,3425,400.495,301.407,297.358
3,Num_H_Acceptors,float64,0,15,5.0,2.0,5.0
4,Num_H_Donors,float64,0,10,2.0,1.0,0.0
5,Num_RotatableBonds,float64,0,21,8.0,2.0,3.0
6,LogD,float64,0,2910,3.259,2.172,1.585
7,Molecular_PolarSurfaceArea,float64,0,2378,117.37,73.47,62.45


In [15]:
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, Descriptors, AllChem
from rdkit.Chem.Draw import IPythonConsole
from collections import Counter

def count_atoms(smiles):
    """Return a ``{element symbol: count}`` dict for the atoms of a SMILES string.

    An empty dict is returned when ``Chem.MolFromSmiles`` cannot parse the
    string (i.e. returns ``None``).
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return {}
    symbol_tally = Counter(atom.GetSymbol() for atom in molecule.GetAtoms())
    return dict(symbol_tally)

# Count atoms per molecule and expand the per-row dicts into one integer column per
# element symbol; an element absent from a molecule gets 0.
atom_counts = df_total['SMILES'].apply(count_atoms)
atom_counts_df = (
    # Reuse df_total's index so the column-wise concat below aligns row for row
    # instead of relying on df_total having a default 0..n-1 index.
    pd.DataFrame(atom_counts.tolist(), index=df_total.index)
    .fillna(0)
    .astype(int)
)

# Discard element columns left over from an earlier run of this cell; otherwise a re-run
# would append a second set of identically named columns.
# NOTE(review): this assumes no original feature is named like an element symbol.
df_total = df_total.drop(columns=atom_counts_df.columns, errors='ignore')
df_total = pd.concat([df_total, atom_counts_df], axis=1)

df_total

Unnamed: 0,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,C,O,N,S,F,Cl,Br,Se,I,P
0,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,3.259,400.495,5.0,2.0,8.0,3.259,117.37,20,3,4,1,0,0,0,0,0,0
1,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,2.169,301.407,2.0,1.0,2.0,2.172,73.47,16,1,3,1,0,0,0,0,0,0
2,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,1.593,297.358,5.0,0.0,3.0,1.585,62.45,15,0,7,0,0,0,0,0,0,0
3,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,4.771,494.652,6.0,0.0,5.0,3.475,92.60,26,2,6,1,0,0,0,0,0,0
4,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,2.335,268.310,3.0,0.0,1.0,2.337,42.43,16,2,2,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3949,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,4.207,306.443,2.0,1.0,7.0,4.207,55.13,18,2,2,0,0,0,0,0,0,0
3950,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,-0.608,335.398,5.0,0.0,1.0,-1.736,70.16,17,4,3,0,0,0,0,0,0,0
3951,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,1.792,349.383,3.0,1.0,3.0,1.792,69.72,20,3,3,0,0,0,0,0,0,0
3952,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,0.790,341.132,3.0,2.0,2.0,0.423,69.64,13,3,2,0,1,0,1,0,0,0


In [16]:
def bond_frequency(smiles):
    """Return a ``{bond type as double: count}`` dict for the bonds of a SMILES string.

    Keys are the values of ``bond.GetBondTypeAsDouble()``. An empty dict is
    returned when ``Chem.MolFromSmiles`` cannot parse the string.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return {}
    order_tally = Counter(bond.GetBondTypeAsDouble() for bond in molecule.GetBonds())
    return dict(order_tally)

# Count bonds per molecule (keyed by the value of GetBondTypeAsDouble) and expand the
# per-row dicts into integer columns; a bond type absent from a molecule gets 0.
bond_frequencies = df_total['SMILES'].apply(bond_frequency)
bond_freq_df = (
    # Reuse df_total's index so the column-wise concat below aligns row for row.
    pd.DataFrame(bond_frequencies.tolist(), index=df_total.index)
    .fillna(0)
    .astype(int)
)

# Rename every bond column to 'Bond_<value>' (1.0 -> 'Bond_1.0', 1.5 -> 'Bond_1.5', ...).
# Building the map from the columns actually present means a bond type outside
# {1.0, 1.5, 2.0, 3.0} also gets a string name instead of staying a float label.
new_column_names = {order: f'Bond_{order}' for order in bond_freq_df.columns}
bond_freq_df = bond_freq_df.rename(columns=new_column_names)

# Discard bond columns left over from an earlier run of this cell so a re-run does not
# append duplicate column names.
df_total = df_total.drop(columns=bond_freq_df.columns, errors='ignore')
df_total = pd.concat([df_total, bond_freq_df], axis=1)

df_total

Unnamed: 0,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,C,O,...,F,Cl,Br,Se,I,P,Bond_1.0,Bond_1.5,Bond_2.0,Bond_3.0
0,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,3.259,400.495,5.0,2.0,8.0,3.259,117.37,20,3,...,0,0,0,0,0,0,13,16,1,0
1,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,2.169,301.407,2.0,1.0,2.0,2.172,73.47,16,1,...,0,0,0,0,0,0,11,11,1,0
2,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,1.593,297.358,5.0,0.0,3.0,1.585,62.45,15,0,...,0,0,0,0,0,0,10,15,0,0
3,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,4.771,494.652,6.0,0.0,5.0,3.475,92.60,26,2,...,0,0,0,0,0,0,20,17,2,0
4,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,2.335,268.310,3.0,0.0,1.0,2.337,42.43,16,2,...,0,0,0,0,0,0,9,12,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3949,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,4.207,306.443,2.0,1.0,7.0,4.207,55.13,18,2,...,0,0,0,0,0,0,17,5,1,0
3950,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,-0.608,335.398,5.0,0.0,1.0,-1.736,70.16,17,4,...,0,0,0,0,0,0,22,0,4,0
3951,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,1.792,349.383,3.0,1.0,3.0,1.792,69.72,20,3,...,0,0,0,0,0,0,14,12,3,0
3952,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,0.790,341.132,3.0,2.0,2.0,0.423,69.64,13,3,...,1,0,1,0,0,0,7,12,2,0


In [17]:
# The first len(df_train) rows of the merged frame are the training rows; keep them
# without the raw SMILES column and with an index renumbered from 0.
df_train = (
    df_total.iloc[:len(df_train)]
    .drop(columns=['SMILES'])
    .reset_index(drop=True)
)
df_train

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,C,O,N,...,F,Cl,Br,Se,I,P,Bond_1.0,Bond_1.5,Bond_2.0,Bond_3.0
0,3.259,400.495,5.0,2.0,8.0,3.259,117.37,20,3,4,...,0,0,0,0,0,0,13,16,1,0
1,2.169,301.407,2.0,1.0,2.0,2.172,73.47,16,1,3,...,0,0,0,0,0,0,11,11,1,0
2,1.593,297.358,5.0,0.0,3.0,1.585,62.45,15,0,7,...,0,0,0,0,0,0,10,15,0,0
3,4.771,494.652,6.0,0.0,5.0,3.475,92.60,26,2,6,...,0,0,0,0,0,0,20,17,2,0
4,2.335,268.310,3.0,0.0,1.0,2.337,42.43,16,2,2,...,0,0,0,0,0,0,9,12,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3466,4.282,553.480,5.0,0.0,7.0,4.009,65.98,29,3,4,...,0,2,0,0,0,0,22,18,2,0
3467,4.304,459.520,6.0,1.0,4.0,4.304,113.53,24,3,5,...,0,0,0,0,0,0,7,27,3,0
3468,-1.133,385.417,7.0,0.0,4.0,-1.133,108.65,19,4,5,...,0,0,0,0,0,0,22,6,2,1
3469,-0.533,302.289,6.0,1.0,2.0,-0.533,111.44,13,3,6,...,0,0,0,0,0,0,15,6,2,1


In [18]:
# Every row after the first len(df_train) rows of the merged frame is a test row; keep
# them without the raw SMILES column and with an index renumbered from 0.
df_test = (
    df_total.iloc[len(df_train):]
    .drop(columns=['SMILES'])
    .reset_index(drop=True)
)
df_test

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,C,O,N,...,F,Cl,Br,Se,I,P,Bond_1.0,Bond_1.5,Bond_2.0,Bond_3.0
0,2.641,361.505,4.0,2.0,7.0,2.635,92.76,18,1,5,...,0,0,0,0,0,0,16,11,0,0
1,0.585,370.399,5.0,0.0,3.0,0.585,68.31,20,5,2,...,0,0,0,0,0,0,16,12,2,0
2,4.276,347.414,4.0,4.0,5.0,4.290,92.86,20,1,5,...,0,0,0,0,0,0,8,18,2,0
3,1.795,345.358,5.0,0.0,2.0,1.795,81.21,18,1,7,...,0,0,0,0,0,0,8,21,1,0
4,1.219,353.418,4.0,0.0,2.0,0.169,61.15,19,2,5,...,0,0,0,0,0,0,12,15,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,4.207,306.443,2.0,1.0,7.0,4.207,55.13,18,2,2,...,0,0,0,0,0,0,17,5,1,0
479,-0.608,335.398,5.0,0.0,1.0,-1.736,70.16,17,4,3,...,0,0,0,0,0,0,22,0,4,0
480,1.792,349.383,3.0,1.0,3.0,1.792,69.72,20,3,3,...,0,0,0,0,0,0,14,12,3,0
481,0.790,341.132,3.0,2.0,2.0,0.423,69.64,13,3,2,...,1,0,1,0,0,0,7,12,2,0


In [19]:
# Number of Optuna trials to run in each hyperparameter search (passed as n_trials).
num_of_study = 2000

### target MLM

In [20]:
# MLM target column, used as y for the MLM search and cross-validation below.
df_train_target_MLM = df_train_target['MLM']
df_train_target_MLM

0       26.0100
1       29.2700
2        5.5860
3        5.7100
4       93.2700
         ...   
3466    56.0435
3467    56.3130
3468    70.2150
3469    56.6085
3470     6.1730
Name: MLM, Length: 3471, dtype: float64

In [21]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import optuna
from sklearn.model_selection import train_test_split

def objective(trial):
    """Optuna objective for the MLM target: hold-out RMSE of one XGBoost configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE on the 20% validation split. Note that the same split is also the
        early-stopping ``eval_set``, so the score is not from fully unseen data.
    """
    # Fixed seed: every trial is scored on the same train/validation split.
    X_train, X_valid, y_train, y_valid = train_test_split(
        df_train, df_train_target_MLM, test_size=0.2, random_state=42
    )

    param = {
        # constant
        'eval_metric': 'rmse',
        'random_state': 10,
        'n_estimators': 3000,
        'early_stopping_rounds': 300,
        'n_jobs': -1,
        # log-uniform (suggest_float(log=True) replaces the deprecated suggest_loguniform)
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 100.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 100.0, log=True),
        # discrete uniform (suggest_float(step=...) replaces the deprecated suggest_discrete_uniform)
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.05),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.05),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.003, step=0.001),
        # integer (default step of 1; passing the step positionally is deprecated)
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }
    model = XGBRegressor(**param)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
    preds = model.predict(X_valid)

    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and not accepted by newer scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_valid, preds)))

    return rmse

In [22]:
# Pass an explicitly seeded TPE sampler (TPE is also Optuna's default) so that the
# sequence of sampled hyperparameters does not change from one run to the next.
study = optuna.create_study(
    direction='minimize',
    study_name=None,
    sampler=optuna.samplers.TPESampler(seed=42),
)
with tqdm(total=num_of_study) as pbar:
    def callback(study, trial):
        # Called by Optuna after each finished trial; advances the progress bar by one.
        pbar.update(1)

    study.optimize(objective, n_trials=num_of_study, callbacks=[callback])

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2023-09-08 17:03:01,280] A new study created in memory with name: no-name-9bdd2b7e-6ac9-474a-aabf-3fddf3597098
  0%|          | 0/2000 [00:00<?, ?it/s][I 2023-09-08 17:03:03,591] Trial 0 finished with value: 30.959240309761988 and parameters: {'reg_alpha': 45.76712527025111, 'reg_lambda': 0.42688851063562777, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.003, 'max_depth': 6, 'min_child_weight': 8}. Best is trial 0 with value: 30.959240309761988.
  0%|          | 1/2000 [00:02<1:16:55,  2.31s/it][I 2023-09-08 17:03:06,110] Trial 1 finished with value: 31.15247260931395 and parameters: {'reg_alpha': 0.019307748784433112, 'reg_lambda': 0.537523900087957, 'colsample_bytree': 0.95, 'subsample': 0.8500000000000001, 'learning_rate': 0.003, 'max_depth': 8, 'min_child_weight': 5}. Best is trial 0 with value: 30.959240309761988.
  0%|          | 2/2000 [00:04<1:21:01,  2.43s/it][I 2023-09-08 17:03:08,902] Trial 2 finished with value: 30.971508364005796 and parameters: {'reg_al

Number of finished trials: 2000
Best trial: {'reg_alpha': 0.048927910215553386, 'reg_lambda': 0.058241609679905534, 'colsample_bytree': 0.8500000000000001, 'subsample': 0.55, 'learning_rate': 0.003, 'max_depth': 4, 'min_child_weight': 6}


In [23]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_min_child_weight,params_reg_alpha,params_reg_lambda,params_subsample,state
0,0,30.959240,2023-09-08 17:03:01.283339,2023-09-08 17:03:03.590367,0 days 00:00:02.307028,0.70,0.003,6,8,45.767125,0.426889,0.50,COMPLETE
1,1,31.152473,2023-09-08 17:03:03.592368,2023-09-08 17:03:06.110444,0 days 00:00:02.518076,0.95,0.003,8,5,0.019308,0.537524,0.85,COMPLETE
2,2,30.971508,2023-09-08 17:03:06.111444,2023-09-08 17:03:08.902084,0 days 00:00:02.790640,0.60,0.002,6,10,14.760156,0.341005,0.55,COMPLETE
3,3,30.999034,2023-09-08 17:03:08.903084,2023-09-08 17:03:12.336858,0 days 00:00:03.433774,1.00,0.002,6,1,0.042330,1.243688,0.80,COMPLETE
4,4,31.010322,2023-09-08 17:03:12.338859,2023-09-08 17:03:14.143272,0 days 00:00:01.804413,0.75,0.003,7,1,5.399988,0.015044,0.60,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,30.956523,2023-09-08 18:23:33.617578,2023-09-08 18:23:35.941605,0 days 00:00:02.324027,0.85,0.003,3,5,3.876662,11.181023,0.50,COMPLETE
1996,1996,30.702975,2023-09-08 18:23:35.943605,2023-09-08 18:23:38.392158,0 days 00:00:02.448553,0.85,0.003,4,5,0.018551,0.016307,0.55,COMPLETE
1997,1997,30.743767,2023-09-08 18:23:38.393158,2023-09-08 18:23:40.861720,0 days 00:00:02.468562,0.85,0.003,4,4,0.020190,0.188744,0.55,COMPLETE
1998,1998,30.825783,2023-09-08 18:23:40.864721,2023-09-08 18:23:43.299269,0 days 00:00:02.434548,0.85,0.003,4,5,0.014778,0.648968,0.80,COMPLETE


In [24]:
optuna.visualization.plot_optimization_history(study)

In [25]:
optuna.visualization.plot_param_importances(study)

In [26]:
# Best sampled hyperparameters, completed with the constant settings used during the search.
params = {
    **study.best_params,
    'eval_metric': 'rmse',
    'early_stopping_rounds': 300,
    'random_state': 10,
    'n_estimators': 3000,
    'n_jobs': -1,
}

params

{'reg_alpha': 0.048927910215553386,
 'reg_lambda': 0.058241609679905534,
 'colsample_bytree': 0.8500000000000001,
 'subsample': 0.55,
 'learning_rate': 0.003,
 'max_depth': 4,
 'min_child_weight': 6,
 'eval_metric': 'rmse',
 'early_stopping_rounds': 300,
 'random_state': 10,
 'n_estimators': 3000,
 'n_jobs': -1}

In [27]:
from sklearn.model_selection import KFold

# 20-fold cross-validation for MLM: one model per fold, test predictions averaged over folds.
kf = KFold(n_splits=20, random_state=42, shuffle=True)
preds_MLM = np.zeros(df_test.shape[0])
rmse = []

for n, (trn_idx, val_idx) in enumerate(kf.split(df_train, df_train_target_MLM)):
    X_tr, X_val = df_train.iloc[trn_idx], df_train.iloc[val_idx]
    y_tr, y_val = df_train_target_MLM.iloc[trn_idx], df_train_target_MLM.iloc[val_idx]

    # The fold's validation rows serve as the eval_set (early stopping) and as the
    # rows the fold RMSE is computed on.
    model = XGBRegressor(**params)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    # Each fold contributes 1/n_splits of the final test prediction.
    preds_MLM += model.predict(df_test) / kf.n_splits

    # Explicit square root: the `squared=False` argument of mean_squared_error is
    # deprecated and not accepted by newer scikit-learn releases.
    rmse.append(float(np.sqrt(mean_squared_error(y_val, model.predict(X_val)))))
    print(f'trial {n+1}:', round(rmse[n], 5))

print(f'\naverage: {round(np.mean(rmse), 5)}')

trial 1: 33.48509
trial 2: 30.47477
trial 3: 29.59723
trial 4: 28.73014
trial 5: 34.33606
trial 6: 33.26685
trial 7: 29.0479
trial 8: 28.71523
trial 9: 32.29424
trial 10: 31.87763
trial 11: 31.15511
trial 12: 28.71792
trial 13: 32.43777
trial 14: 29.98492
trial 15: 30.97795
trial 16: 30.46549
trial 17: 31.22981
trial 18: 33.11681
trial 19: 30.32198
trial 20: 30.96991

average: 31.06014


### target HLM

In [28]:
# HLM target column, used as y for the HLM search and cross-validation below.
df_train_target_HLM = df_train_target['HLM']
df_train_target_HLM

0       50.680
1       50.590
2       80.892
3        2.000
4       99.990
         ...  
3466    66.126
3467    63.732
3468    93.840
3469    49.519
3470    10.867
Name: HLM, Length: 3471, dtype: float64

In [29]:
def objective(trial):
    """Optuna objective for the HLM target: hold-out RMSE of one XGBoost configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE on the 20% validation split. Note that the same split is also the
        early-stopping ``eval_set``, so the score is not from fully unseen data.
    """
    # Fixed seed: every trial is scored on the same train/validation split.
    X_train, X_valid, y_train, y_valid = train_test_split(
        df_train, df_train_target_HLM, test_size=0.2, random_state=42
    )

    param = {
        # constant
        'eval_metric': 'rmse',
        'random_state': 10,
        'n_estimators': 3000,
        'early_stopping_rounds': 300,
        'n_jobs': -1,
        # log-uniform (suggest_float(log=True) replaces the deprecated suggest_loguniform)
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 100.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 100.0, log=True),
        # discrete uniform (suggest_float(step=...) replaces the deprecated suggest_discrete_uniform)
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.05),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.05),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.003, step=0.001),
        # integer (default step of 1; passing the step positionally is deprecated)
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }
    model = XGBRegressor(**param)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
    preds = model.predict(X_valid)

    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and not accepted by newer scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_valid, preds)))

    return rmse

In [30]:
# Pass an explicitly seeded TPE sampler (TPE is also Optuna's default) so that the
# sequence of sampled hyperparameters does not change from one run to the next.
study = optuna.create_study(
    direction='minimize',
    study_name=None,
    sampler=optuna.samplers.TPESampler(seed=42),
)
with tqdm(total=num_of_study) as pbar:
    def callback(study, trial):
        # Called by Optuna after each finished trial; advances the progress bar by one.
        pbar.update(1)

    study.optimize(objective, n_trials=num_of_study, callbacks=[callback])

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2023-09-08 18:28:22,552] A new study created in memory with name: no-name-255fd3ae-2010-4c34-837a-f4082a44b3c5
  0%|          | 0/2000 [00:00<?, ?it/s][I 2023-09-08 18:28:25,549] Trial 0 finished with value: 32.054924791157234 and parameters: {'reg_alpha': 1.1989357094712396, 'reg_lambda': 30.966982652260445, 'colsample_bytree': 0.95, 'subsample': 0.75, 'learning_rate': 0.001, 'max_depth': 5, 'min_child_weight': 10}. Best is trial 0 with value: 32.054924791157234.
  0%|          | 1/2000 [00:02<1:39:40,  2.99s/it][I 2023-09-08 18:28:28,100] Trial 1 finished with value: 31.855764617150264 and parameters: {'reg_alpha': 55.10436186344119, 'reg_lambda': 0.5521222052786733, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.003, 'max_depth': 10, 'min_child_weight': 4}. Best is trial 1 with value: 31.855764617150264.
  0%|          | 2/2000 [00:05<1:30:58,  2.73s/it][I 2023-09-08 18:28:31,982] Trial 2 finished with value: 31.897926345927363 and parameters: {'reg_alpha': 0.01855

Number of finished trials: 2000
Best trial: {'reg_alpha': 1.4882737253586376, 'reg_lambda': 0.046814846307223545, 'colsample_bytree': 0.8, 'subsample': 0.55, 'learning_rate': 0.002, 'max_depth': 5, 'min_child_weight': 3}


In [31]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_min_child_weight,params_reg_alpha,params_reg_lambda,params_subsample,state
0,0,32.054925,2023-09-08 18:28:22.559321,2023-09-08 18:28:25.549996,0 days 00:00:02.990675,0.95,0.001,5,10,1.198936,30.966983,0.75,COMPLETE
1,1,31.855765,2023-09-08 18:28:25.550996,2023-09-08 18:28:28.100088,0 days 00:00:02.549092,0.70,0.003,10,4,55.104362,0.552122,0.60,COMPLETE
2,2,31.897926,2023-09-08 18:28:28.101088,2023-09-08 18:28:31.982466,0 days 00:00:03.881378,0.55,0.001,7,4,0.018560,4.773373,0.70,COMPLETE
3,3,31.990875,2023-09-08 18:28:31.984466,2023-09-08 18:28:34.539545,0 days 00:00:02.555079,1.00,0.001,3,10,0.072796,0.063366,0.80,COMPLETE
4,4,31.638029,2023-09-08 18:28:34.540545,2023-09-08 18:28:36.962105,0 days 00:00:02.421560,0.50,0.003,8,1,3.835101,0.056997,0.50,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,31.463543,2023-09-08 20:05:56.501327,2023-09-08 20:05:59.608028,0 days 00:00:03.106701,0.80,0.002,5,3,1.222854,0.022663,0.55,COMPLETE
1996,1996,31.529403,2023-09-08 20:05:59.611028,2023-09-08 20:06:02.674720,0 days 00:00:03.063692,0.85,0.002,5,4,2.167058,0.066198,0.55,COMPLETE
1997,1997,31.504698,2023-09-08 20:06:02.676720,2023-09-08 20:06:05.717414,0 days 00:00:03.040694,0.80,0.002,5,3,6.692476,0.053513,0.55,COMPLETE
1998,1998,31.469147,2023-09-08 20:06:05.719414,2023-09-08 20:06:08.459032,0 days 00:00:02.739618,0.75,0.002,6,3,1.430566,0.033499,0.55,COMPLETE


In [32]:
optuna.visualization.plot_optimization_history(study)

In [33]:
optuna.visualization.plot_param_importances(study)

In [34]:
# Best sampled hyperparameters, completed with the constant settings used during the search.
params = {
    **study.best_params,
    'eval_metric': 'rmse',
    'early_stopping_rounds': 300,
    'random_state': 10,
    'n_estimators': 3000,
    'n_jobs': -1,
}

params

{'reg_alpha': 1.4882737253586376,
 'reg_lambda': 0.046814846307223545,
 'colsample_bytree': 0.8,
 'subsample': 0.55,
 'learning_rate': 0.002,
 'max_depth': 5,
 'min_child_weight': 3,
 'eval_metric': 'rmse',
 'early_stopping_rounds': 300,
 'random_state': 10,
 'n_estimators': 3000,
 'n_jobs': -1}

In [35]:
from sklearn.model_selection import KFold

# 20-fold cross-validation for HLM: one model per fold, test predictions averaged over folds.
kf = KFold(n_splits=20, random_state=42, shuffle=True)
preds_HLM = np.zeros(df_test.shape[0])
rmse = []

for n, (trn_idx, val_idx) in enumerate(kf.split(df_train, df_train_target_HLM)):
    X_tr, X_val = df_train.iloc[trn_idx], df_train.iloc[val_idx]
    y_tr, y_val = df_train_target_HLM.iloc[trn_idx], df_train_target_HLM.iloc[val_idx]

    # The fold's validation rows serve as the eval_set (early stopping) and as the
    # rows the fold RMSE is computed on.
    model = XGBRegressor(**params)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    # Each fold contributes 1/n_splits of the final test prediction.
    preds_HLM += model.predict(df_test) / kf.n_splits

    # Explicit square root: the `squared=False` argument of mean_squared_error is
    # deprecated and not accepted by newer scikit-learn releases.
    rmse.append(float(np.sqrt(mean_squared_error(y_val, model.predict(X_val)))))
    print(f'trial {n+1}:', round(rmse[n], 5))

print(f'\naverage: {round(np.mean(rmse), 5)}')

trial 1: 33.4818
trial 2: 30.47022
trial 3: 31.79986
trial 4: 29.56791
trial 5: 32.40442
trial 6: 30.91349
trial 7: 31.70711
trial 8: 30.62197
trial 9: 31.37212
trial 10: 31.8483
trial 11: 31.97428
trial 12: 30.45308
trial 13: 34.44262
trial 14: 32.54199
trial 15: 32.31536
trial 16: 32.53306
trial 17: 31.58602
trial 18: 34.37121
trial 19: 30.94835
trial 20: 32.79642

average: 31.90748


## export csv

In [36]:
import os

# Overwrite the sample submission's MLM/HLM columns with the fold-averaged predictions.
# The assignment is positional, so df_sample rows must be in the same order as df_test.
df_sample['MLM'] = preds_MLM
df_sample['HLM'] = preds_HLM

# to_csv does not create missing directories; make sure the output folder exists first.
os.makedirs("./submission", exist_ok=True)
df_sample.to_csv("./submission/submission_02_study2000.csv", index = False, encoding = "utf-8-sig")

In [37]:
df_sample

Unnamed: 0,id,MLM,HLM
0,TEST_000,24.642236,46.663122
1,TEST_001,59.943156,71.603817
2,TEST_002,36.561156,52.326027
3,TEST_003,49.971835,72.245228
4,TEST_004,65.738522,76.872831
...,...,...,...
478,TEST_478,8.917876,28.626252
479,TEST_479,76.300281,80.029239
480,TEST_480,43.682520,67.167583
481,TEST_481,67.181660,69.642870
