## data preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
df_train = pd.read_csv('./data/train.csv')
df_train

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.259,117.37
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.172,73.47
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.475,92.60
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.337,42.43
...,...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl,1.556,3.079,3.409,396.195,3,1,5,3.409,64.74
3494,TRAIN_3494,CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...,35.560,47.630,1.912,359.381,4,1,3,1.844,77.37
3495,TRAIN_3495,CCOC(=O)CCCc1nc2cc(N)ccc2n1C,56.150,1.790,1.941,261.320,3,1,6,2.124,70.14
3496,TRAIN_3496,Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl,0.030,2.770,0.989,284.696,5,1,5,0.989,91.51


In [3]:
df_test = pd.read_csv('./data/test.csv')
df_test

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.290,92.86
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15
...,...,...,...,...,...,...,...,...,...
478,TEST_478,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,4.207,306.443,2,1,7,4.207,55.13
479,TEST_479,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,-0.608,335.398,5,0,1,-1.736,70.16
480,TEST_480,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,1.792,349.383,3,1,3,1.792,69.72
481,TEST_481,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,0.790,341.132,3,2,2,0.423,69.64


In [4]:
df_sample = pd.read_csv('./data/sample_submission.csv')
df_sample

Unnamed: 0,id,MLM,HLM
0,TEST_000,0,0
1,TEST_001,0,0
2,TEST_002,0,0
3,TEST_003,0,0
4,TEST_004,0,0
...,...,...,...
478,TEST_478,0,0
479,TEST_479,0,0
480,TEST_480,0,0
481,TEST_481,0,0


In [5]:
# column 순서 조절 및 'id' drop
new_column_order = ['id', 'MLM', 'HLM', 'SMILES', 'AlogP', 'Molecular_Weight',
                    'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
                    'Molecular_PolarSurfaceArea']

df_train = df_train[new_column_order]
df_train = df_train.drop('id', axis=1)
df_test = df_test.drop('id', axis=1)

print(f'train size : {df_train.shape}')
df_train.head()

train size : (3498, 10)


Unnamed: 0,MLM,HLM,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,26.01,50.68,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,3.259,400.495,5,2,8,3.259,117.37
1,29.27,50.59,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,2.169,301.407,2,1,2,2.172,73.47
2,5.586,80.892,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,1.593,297.358,5,0,3,1.585,62.45
3,5.71,2.0,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,4.771,494.652,6,0,5,3.475,92.6
4,93.27,99.99,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,2.335,268.31,3,0,1,2.337,42.43


In [6]:
# train data 같은 인풋 값 대비 다른 아웃풋 average로 변환
duplicated_rows = df_train[df_train.iloc[:, 2:].duplicated(keep=False)]
sorted_duplicates = duplicated_rows.sort_values(by='SMILES', ascending=False)
average_df = sorted_duplicates.groupby('SMILES').mean().reset_index()

# column 순서 조절
new_column_order = ['MLM', 'HLM', 'SMILES', 'AlogP', 'Molecular_Weight',
                    'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
                    'Molecular_PolarSurfaceArea']

average_df = average_df[new_column_order]
average_df

Unnamed: 0,MLM,HLM,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,0.9225,28.0615,C(=C/c1nnn(Cc2ccccc2)n1)\c1ccccc1,3.556,262.309,3.0,0.0,4.0,3.556,43.6
1,62.2175,77.911,CC(=O)Nc1ccc(N2N=C(c3ccc(O)cc3)C(C)CC2=O)cc1,2.172,337.372,4.0,2.0,3.0,2.169,82.0
2,68.631,64.669,CC(=O)Nc1nc2ccc(-c3nn(C(C)C)c4nc(N)ncc34)cc2s1,2.293,367.428,5.0,2.0,3.0,2.307,139.85
3,32.1435,85.685,CC(C)(C)OC(=O)N1CCC(c2n[nH]c3nc(S(C)(=O)=O)ncc...,1.684,381.45,7.0,1.0,4.0,1.684,126.52
4,3.687,43.3385,CC(C)COc1cc(=O)n2c(c1C(=O)N1CCC(C)CC1)CCCCC2,2.843,360.49,3.0,0.0,4.0,2.843,49.85
5,62.1085,68.1015,CC(C)NC(=O)c1c(Cl)nn(C)c1NC(=O)c1cc(Br)nn1-c1n...,3.75,501.165,5.0,2.0,5.0,3.75,106.72
6,43.17,31.13,CC1CC(=O)N(c2ccc(-c3cccc(C#N)c3)cc2)N=C1c1ccc(...,4.449,381.427,4.0,1.0,3.0,4.446,76.69
7,2.3395,36.8145,CCCCC/N=c1\n(C)c(=O)nc2sccn12,2.484,252.336,4.0,0.0,4.0,2.474,73.56
8,3.442,3.6015,CCOC(=O)CC1(NC(=O)N2Cc3c(sc4c3CCCC4)-n3cccc3C2...,6.727,537.736,3.0,1.0,6.0,6.727,120.05
9,73.545,1.1345,CCc1nc2cc(Br)c(C(=O)OC)nc2n1CC(=O)c1ccccc1,3.815,402.242,5.0,0.0,6.0,3.819,74.08


In [7]:
# 중복값 제거한 train unique value
unique_df = df_train[~df_train.iloc[:, 2:].duplicated(keep=False)]

# unique value 와 average value concat
df_train = pd.concat([unique_df, average_df])
df_train.reset_index(drop = True, inplace = True)
df_train

Unnamed: 0,MLM,HLM,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,26.0100,50.680,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,3.259,400.495,5.0,2.0,8.0,3.259,117.37
1,29.2700,50.590,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,2.169,301.407,2.0,1.0,2.0,2.172,73.47
2,5.5860,80.892,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,1.593,297.358,5.0,0.0,3.0,1.585,62.45
3,5.7100,2.000,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,4.771,494.652,6.0,0.0,5.0,3.475,92.60
4,93.2700,99.990,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,2.335,268.310,3.0,0.0,1.0,2.337,42.43
...,...,...,...,...,...,...,...,...,...,...
3466,56.0435,66.126,Cc1ccc(-c2ccc(C(CN3CCCC3)N(C)C(=O)CN3C(=O)COc4...,4.282,553.480,5.0,0.0,7.0,4.009,65.98
3467,56.3130,63.732,Cc1ccccc1-c1nc2ccccc2cc1C(C)n1c(=O)[nH]c2c(S(C...,4.304,459.520,6.0,1.0,4.0,4.304,113.53
3468,70.2150,93.840,N#Cc1nccnc1OC1CCN(C(=O)C2CC(=O)N(C3CCOCC3)C2)C1,-1.133,385.417,7.0,0.0,4.0,-1.133,108.65
3469,56.6085,49.519,N#Cc1nccnc1OC1CCN(C(=O)N2CCNC2=O)C1,-0.533,302.289,6.0,1.0,2.0,-0.533,111.44


In [8]:
print(f'test size : {df_test.shape}')
df_test.head()

test size : (483, 8)


Unnamed: 0,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76
1,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31
2,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.29,92.86
3,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21
4,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15


### merge train and test data for feature engineering

In [9]:
# make target dataframe
df_train_target = df_train[['MLM','HLM']].copy()
df_train_target

Unnamed: 0,MLM,HLM
0,26.0100,50.680
1,29.2700,50.590
2,5.5860,80.892
3,5.7100,2.000
4,93.2700,99.990
...,...,...
3466,56.0435,66.126
3467,56.3130,63.732
3468,70.2150,93.840
3469,56.6085,49.519


In [10]:
df_train = df_train.drop(['MLM','HLM'], axis = 1)
df_total = pd.concat([df_train, df_test])
df_total = df_total.reset_index(drop = True)
df_total.head()

Unnamed: 0,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,3.259,400.495,5.0,2.0,8.0,3.259,117.37
1,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,2.169,301.407,2.0,1.0,2.0,2.172,73.47
2,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,1.593,297.358,5.0,0.0,3.0,1.585,62.45
3,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,4.771,494.652,6.0,0.0,5.0,3.475,92.6
4,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,2.335,268.31,3.0,0.0,1.0,2.337,42.43


In [11]:
# Alop 결측값 처리
df_total[df_total.AlogP.isna()]

Unnamed: 0,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
2752,[H][C@]1(CC[C@@]2([H])[C@@H](C)C=CC3=C[C@H](C)...,,418.566,5.0,1.0,7.0,4.634,72.83
3334,COc1cc2c(cc1OC)/C(=N\c1ccccc1)N(Cc1ccccc1F)CC2,,390.45,3.0,0.0,5.0,4.911,34.06
3481,[H]C1(C(O)c2ccc3c(c2)OCO3)C(=O)Oc2cc(OC)ccc2C1...,,404.412,6.0,1.0,4.0,3.942,74.22


In [12]:
# 결측치 LogD 값으로 대체 (corr = 0.96)
df_total['AlogP'].fillna(df_total['LogD'], inplace=True)
row_2752 = df_total.loc[[2752]]
row_3334 = df_total.loc[[3334]]
row_3481 = df_total.loc[[3481]]

missing_df = pd.concat([row_2752, row_3334, row_3481])
missing_df

Unnamed: 0,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
2752,[H][C@]1(CC[C@@]2([H])[C@@H](C)C=CC3=C[C@H](C)...,4.634,418.566,5.0,1.0,7.0,4.634,72.83
3334,COc1cc2c(cc1OC)/C(=N\c1ccccc1)N(Cc1ccccc1F)CC2,4.911,390.45,3.0,0.0,5.0,4.911,34.06
3481,[H]C1(C(O)c2ccc3c(c2)OCO3)C(=O)Oc2cc(OC)ccc2C1...,3.942,404.412,6.0,1.0,4.0,3.942,74.22


In [13]:
df_total.isna().sum()

SMILES                        0
AlogP                         0
Molecular_Weight              0
Num_H_Acceptors               0
Num_H_Donors                  0
Num_RotatableBonds            0
LogD                          0
Molecular_PolarSurfaceArea    0
dtype: int64

In [14]:
def resumetable(df):
    print(f'dataset size: {df.shape}')
    summary = pd.DataFrame(df.dtypes, columns=['data type'])
    summary = summary.reset_index()
    summary = summary.rename(columns={'index': 'feature'})
    summary['num of missing value'] = df.isnull().sum().values
    summary['num of unique value'] = df.nunique().values
    summary['First value'] = df.loc[0].values
    summary['Second value'] = df.loc[1].values
    summary['Third value'] = df.loc[2].values
    return summary

resumetable(df_total)

dataset size: (3954, 8)


Unnamed: 0,feature,data type,num of missing value,num of unique value,First value,Second value,Third value
0,SMILES,object,0,3954,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1
1,AlogP,float64,0,2837,3.259,2.169,1.593
2,Molecular_Weight,float64,0,3425,400.495,301.407,297.358
3,Num_H_Acceptors,float64,0,15,5.0,2.0,5.0
4,Num_H_Donors,float64,0,10,2.0,1.0,0.0
5,Num_RotatableBonds,float64,0,21,8.0,2.0,3.0
6,LogD,float64,0,2910,3.259,2.172,1.585
7,Molecular_PolarSurfaceArea,float64,0,2378,117.37,73.47,62.45


In [15]:
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, Descriptors, AllChem
from rdkit.Chem.Draw import IPythonConsole
from collections import Counter

def count_atoms(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        atom_symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]
        atom_counts = dict(Counter(atom_symbols))
        return atom_counts
    else:
        return {}

df_total['Atom_Counts'] = df_total['SMILES'].apply(count_atoms)
atom_counts_df = pd.DataFrame(df_total['Atom_Counts'].tolist()).fillna(0).astype(int)
df_total = pd.concat([df_total, atom_counts_df], axis=1)
df_total = df_total.drop(columns=['Atom_Counts'])

df_total

Unnamed: 0,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,C,O,N,S,F,Cl,Br,Se,I,P
0,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,3.259,400.495,5.0,2.0,8.0,3.259,117.37,20,3,4,1,0,0,0,0,0,0
1,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,2.169,301.407,2.0,1.0,2.0,2.172,73.47,16,1,3,1,0,0,0,0,0,0
2,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,1.593,297.358,5.0,0.0,3.0,1.585,62.45,15,0,7,0,0,0,0,0,0,0
3,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,4.771,494.652,6.0,0.0,5.0,3.475,92.60,26,2,6,1,0,0,0,0,0,0
4,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,2.335,268.310,3.0,0.0,1.0,2.337,42.43,16,2,2,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3949,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,4.207,306.443,2.0,1.0,7.0,4.207,55.13,18,2,2,0,0,0,0,0,0,0
3950,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,-0.608,335.398,5.0,0.0,1.0,-1.736,70.16,17,4,3,0,0,0,0,0,0,0
3951,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,1.792,349.383,3.0,1.0,3.0,1.792,69.72,20,3,3,0,0,0,0,0,0,0
3952,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,0.790,341.132,3.0,2.0,2.0,0.423,69.64,13,3,2,0,1,0,1,0,0,0


In [16]:
def bond_frequency(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        bonds = mol.GetBonds()
        bond_counts = [bond.GetBondTypeAsDouble() for bond in bonds]
        bond_freq = dict(Counter(bond_counts))
        return bond_freq
    else:
        return {}

# dictionary 형태로 저장 및 pandas column 변환
df_total['Bond_Frequencies'] = df_total['SMILES'].apply(bond_frequency)
bond_freq_df = pd.DataFrame(df_total['Bond_Frequencies'].tolist()).fillna(0).astype(int)
df_total = pd.concat([df_total, bond_freq_df], axis=1)

# 추출한 특성 drop
df_total = df_total.drop(columns=['Bond_Frequencies'])

# 결합 column name 변경
new_column_names = {
    1.0: 'Bond_1.0',
    1.5: 'Bond_1.5',
    2.0: 'Bond_2.0',
    3.0: 'Bond_3.0'
}
df_total = df_total.rename(columns=new_column_names)

df_total

Unnamed: 0,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,C,O,...,F,Cl,Br,Se,I,P,Bond_1.0,Bond_1.5,Bond_2.0,Bond_3.0
0,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,3.259,400.495,5.0,2.0,8.0,3.259,117.37,20,3,...,0,0,0,0,0,0,13,16,1,0
1,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,2.169,301.407,2.0,1.0,2.0,2.172,73.47,16,1,...,0,0,0,0,0,0,11,11,1,0
2,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,1.593,297.358,5.0,0.0,3.0,1.585,62.45,15,0,...,0,0,0,0,0,0,10,15,0,0
3,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,4.771,494.652,6.0,0.0,5.0,3.475,92.60,26,2,...,0,0,0,0,0,0,20,17,2,0
4,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,2.335,268.310,3.0,0.0,1.0,2.337,42.43,16,2,...,0,0,0,0,0,0,9,12,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3949,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,4.207,306.443,2.0,1.0,7.0,4.207,55.13,18,2,...,0,0,0,0,0,0,17,5,1,0
3950,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,-0.608,335.398,5.0,0.0,1.0,-1.736,70.16,17,4,...,0,0,0,0,0,0,22,0,4,0
3951,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,1.792,349.383,3.0,1.0,3.0,1.792,69.72,20,3,...,0,0,0,0,0,0,14,12,3,0
3952,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,0.790,341.132,3.0,2.0,2.0,0.423,69.64,13,3,...,1,0,1,0,0,0,7,12,2,0


In [17]:
df_train = df_total.iloc[:len(df_train), :].copy()
df_train.reset_index(drop = True, inplace = True)
df_train.drop(['SMILES'], axis=1, inplace = True)
df_train

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,C,O,N,...,F,Cl,Br,Se,I,P,Bond_1.0,Bond_1.5,Bond_2.0,Bond_3.0
0,3.259,400.495,5.0,2.0,8.0,3.259,117.37,20,3,4,...,0,0,0,0,0,0,13,16,1,0
1,2.169,301.407,2.0,1.0,2.0,2.172,73.47,16,1,3,...,0,0,0,0,0,0,11,11,1,0
2,1.593,297.358,5.0,0.0,3.0,1.585,62.45,15,0,7,...,0,0,0,0,0,0,10,15,0,0
3,4.771,494.652,6.0,0.0,5.0,3.475,92.60,26,2,6,...,0,0,0,0,0,0,20,17,2,0
4,2.335,268.310,3.0,0.0,1.0,2.337,42.43,16,2,2,...,0,0,0,0,0,0,9,12,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3466,4.282,553.480,5.0,0.0,7.0,4.009,65.98,29,3,4,...,0,2,0,0,0,0,22,18,2,0
3467,4.304,459.520,6.0,1.0,4.0,4.304,113.53,24,3,5,...,0,0,0,0,0,0,7,27,3,0
3468,-1.133,385.417,7.0,0.0,4.0,-1.133,108.65,19,4,5,...,0,0,0,0,0,0,22,6,2,1
3469,-0.533,302.289,6.0,1.0,2.0,-0.533,111.44,13,3,6,...,0,0,0,0,0,0,15,6,2,1


In [18]:
df_test = df_total.iloc[len(df_train):, :].copy()
df_test.reset_index(drop = True, inplace = True)
df_test.drop(['SMILES'], axis=1, inplace = True)
df_test

Unnamed: 0,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,C,O,N,...,F,Cl,Br,Se,I,P,Bond_1.0,Bond_1.5,Bond_2.0,Bond_3.0
0,2.641,361.505,4.0,2.0,7.0,2.635,92.76,18,1,5,...,0,0,0,0,0,0,16,11,0,0
1,0.585,370.399,5.0,0.0,3.0,0.585,68.31,20,5,2,...,0,0,0,0,0,0,16,12,2,0
2,4.276,347.414,4.0,4.0,5.0,4.290,92.86,20,1,5,...,0,0,0,0,0,0,8,18,2,0
3,1.795,345.358,5.0,0.0,2.0,1.795,81.21,18,1,7,...,0,0,0,0,0,0,8,21,1,0
4,1.219,353.418,4.0,0.0,2.0,0.169,61.15,19,2,5,...,0,0,0,0,0,0,12,15,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,4.207,306.443,2.0,1.0,7.0,4.207,55.13,18,2,2,...,0,0,0,0,0,0,17,5,1,0
479,-0.608,335.398,5.0,0.0,1.0,-1.736,70.16,17,4,3,...,0,0,0,0,0,0,22,0,4,0
480,1.792,349.383,3.0,1.0,3.0,1.792,69.72,20,3,3,...,0,0,0,0,0,0,14,12,3,0
481,0.790,341.132,3.0,2.0,2.0,0.423,69.64,13,3,2,...,1,0,1,0,0,0,7,12,2,0


In [19]:
# optuna study
num_of_study = 1000

### target MLM

In [20]:
df_train_target_MLM = df_train_target['MLM']
df_train_target_MLM

0       26.0100
1       29.2700
2        5.5860
3        5.7100
4       93.2700
         ...   
3466    56.0435
3467    56.3130
3468    70.2150
3469    56.6085
3470     6.1730
Name: MLM, Length: 3471, dtype: float64

In [21]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import optuna
from sklearn.model_selection import train_test_split

def objective(trial):

    X_train, X_valid, y_train, y_valid = train_test_split(df_train, df_train_target_MLM, test_size = 0.2, random_state=42)

    param = {
        ### constant
        'boosting_type': 'gbdt',
        'metric': 'rmse', 
        'random_state': 10,
        'n_estimators': 10000,
        'early_stopping_round ' : 100,
        'verbose' : -1,
        'n_jobs' : -1,
        ### log_uniform
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-2, 1000),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-2, 1000),
        ### suggest_discrete_uniform
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.9, 1.0, 0.05),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.7, 1.0, 0.05),
        'learning_rate': trial.suggest_discrete_uniform('learning_rate', 0.0005, 0.002, 0.0001),
        ### suggest_int
        'num_leaves' : trial.suggest_int('num_leaves', 2, 80),
        'min_child_samples': trial.suggest_int('min_child_samples', 30, 70),
        'max_depth': trial.suggest_int('max_depth', 1, 8, 1),
    }
    model = LGBMRegressor(**param)  
    model.fit(X_train, y_train, eval_set=[(X_valid,y_valid)])
    preds = model.predict(X_valid)
    rmse = mean_squared_error(y_valid, preds, squared=False)
    
    return rmse

In [22]:
study = optuna.create_study(direction='minimize', study_name=None)
with tqdm(total=num_of_study) as pbar:  
    def callback(study, trial):
        pbar.update(1)  
    
    study.optimize(objective, n_trials=num_of_study, callbacks=[callback])

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2023-09-06 07:54:26,368] A new study created in memory with name: no-name-14ca6ecf-6a30-444d-8564-d3a92b5fb4fb
  0%|          | 0/1000 [00:00<?, ?it/s][I 2023-09-06 07:54:29,584] Trial 0 finished with value: 30.8942348150044 and parameters: {'reg_alpha': 22.572233359943205, 'reg_lambda': 1.8320139790504586, 'colsample_bytree': 0.9500000000000001, 'subsample': 0.85, 'learning_rate': 0.0006000000000000001, 'num_leaves': 16, 'min_child_samples': 59, 'max_depth': 6}. Best is trial 0 with value: 30.8942348150044.
  0%|          | 1/1000 [00:03<53:31,  3.21s/it][I 2023-09-06 07:54:34,227] Trial 1 finished with value: 31.05175734030434 and parameters: {'reg_alpha': 0.04375536012311945, 'reg_lambda': 0.25073728465108697, 'colsample_bytree': 0.9, 'subsample': 0.7, 'learning_rate': 0.0006000000000000001, 'num_leaves': 28, 'min_child_samples': 59, 'max_depth': 8}. Best is trial 0 with value: 30.8942348150044.
  0%|          | 2/1000 [00:07<1:07:26,  4.05s/it][I 2023-09-06 07:54:35,856] Trial 2

Number of finished trials: 1000
Best trial: {'reg_alpha': 0.13478529457183241, 'reg_lambda': 39.435686018672214, 'colsample_bytree': 1.0, 'subsample': 0.85, 'learning_rate': 0.002, 'num_leaves': 64, 'min_child_samples': 43, 'max_depth': 3}





In [23]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_min_child_samples,params_num_leaves,params_reg_alpha,params_reg_lambda,params_subsample,state
0,0,30.894235,2023-09-06 07:54:26.370724,2023-09-06 07:54:29.583595,0 days 00:00:03.212871,0.95,0.0006,6,59,16,22.572233,1.832014,0.85,COMPLETE
1,1,31.051757,2023-09-06 07:54:29.585596,2023-09-06 07:54:34.226643,0 days 00:00:04.641047,0.90,0.0006,8,59,28,0.043755,0.250737,0.70,COMPLETE
2,2,30.933225,2023-09-06 07:54:34.228643,2023-09-06 07:54:35.855010,0 days 00:00:01.626367,0.95,0.0018,2,39,4,39.248526,0.188694,0.90,COMPLETE
3,3,31.191829,2023-09-06 07:54:35.858011,2023-09-06 07:54:37.440368,0 days 00:00:01.582357,0.95,0.0020,2,52,77,0.030961,127.848643,0.80,COMPLETE
4,4,31.013121,2023-09-06 07:54:37.441368,2023-09-06 07:54:39.628861,0 days 00:00:02.187493,1.00,0.0011,5,63,16,1.086336,0.068902,0.95,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,30.571697,2023-09-06 08:22:10.516079,2023-09-06 08:22:12.046424,0 days 00:00:01.530345,1.00,0.0019,3,44,77,0.074996,39.622902,0.85,COMPLETE
996,996,30.561845,2023-09-06 08:22:12.048425,2023-09-06 08:22:13.589772,0 days 00:00:01.541347,1.00,0.0017,3,43,68,1.310420,28.032949,0.80,COMPLETE
997,997,31.049295,2023-09-06 08:22:13.591773,2023-09-06 08:22:15.175130,0 days 00:00:01.583357,1.00,0.0020,3,47,65,0.017164,238.372510,0.85,COMPLETE
998,998,30.681358,2023-09-06 08:22:15.177130,2023-09-06 08:22:16.826503,0 days 00:00:01.649373,1.00,0.0019,3,40,73,0.010073,56.724614,0.85,COMPLETE


In [24]:
study.trials_dataframe().params_max_depth.value_counts().head(10)

3    817
4     64
2     63
5     17
1     12
6     11
7      9
8      7
Name: params_max_depth, dtype: int64

In [25]:
study.trials_dataframe().params_min_child_samples.value_counts().head(10)

43    247
44    181
42    171
45     93
41     89
46     35
40     32
47     13
53     12
52     10
Name: params_min_child_samples, dtype: int64

In [26]:
study.trials_dataframe().params_learning_rate.value_counts().head(10)

0.0020    416
0.0019    324
0.0018     88
0.0017     22
0.0015     22
0.0012     22
0.0014     18
0.0013     18
0.0016     17
0.0011     14
Name: params_learning_rate, dtype: int64

In [27]:
study.trials_dataframe().params_subsample.value_counts().head(10)

0.85    696
0.80     78
0.90     77
0.75     53
0.70     48
0.95     32
1.00     16
Name: params_subsample, dtype: int64

In [28]:
study.trials_dataframe().params_colsample_bytree.value_counts().head(10)

1.00    945
0.95     35
0.90     20
Name: params_colsample_bytree, dtype: int64

In [29]:
optuna.visualization.plot_optimization_history(study)

In [30]:
optuna.visualization.plot_param_importances(study)

In [31]:
params=study.best_params
params['metric'] = 'rmse'
params['random_state'] = 10
params['n_estimators'] = 10000 
params['early_stopping_round'] = 500
params['verbose'] = -1
params['n_jobs'] = -1

params

{'reg_alpha': 0.13478529457183241,
 'reg_lambda': 39.435686018672214,
 'colsample_bytree': 1.0,
 'subsample': 0.85,
 'learning_rate': 0.002,
 'num_leaves': 64,
 'min_child_samples': 43,
 'max_depth': 3,
 'metric': 'rmse',
 'random_state': 10,
 'n_estimators': 10000,
 'early_stopping_round': 500,
 'verbose': -1,
 'n_jobs': -1}

In [32]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=30, random_state=48, shuffle=True)
preds_MLM = np.zeros(df_test.shape[0])
rmse=[]
n=0

for trn_idx, test_idx in kf.split(df_train, df_train_target_MLM):
    X_tr, X_val=df_train.iloc[trn_idx], df_train.iloc[test_idx]
    y_tr, y_val=df_train_target_MLM.iloc[trn_idx], df_train_target_MLM.iloc[test_idx]

    model = LGBMRegressor(**params)
    model.fit(X_tr, y_tr, eval_set=[(X_val,y_val)])

    preds_MLM+=model.predict(df_test)/kf.n_splits
    rmse.append(mean_squared_error(y_val, model.predict(X_val), squared=False))
    print(f'trial {n+1}:', round(rmse[n], 5))
    n+=1

print(f'\naverage: {round(np.mean(rmse), 5)}')

trial 1: 30.0741
trial 2: 27.00646
trial 3: 29.89635
trial 4: 32.73215
trial 5: 33.86883
trial 6: 33.22771
trial 7: 33.17156
trial 8: 32.14509
trial 9: 30.14426
trial 10: 32.64252
trial 11: 29.77459
trial 12: 31.43363
trial 13: 29.28622
trial 14: 31.18033
trial 15: 30.12738
trial 16: 31.06633
trial 17: 34.94514
trial 18: 30.88632
trial 19: 32.29707
trial 20: 32.30611
trial 21: 33.99992
trial 22: 30.24898
trial 23: 29.76802
trial 24: 26.82009
trial 25: 30.08929
trial 26: 30.58366
trial 27: 32.50601
trial 28: 33.02336
trial 29: 33.56776
trial 30: 31.82285

average: 31.35474


### target HLM

In [33]:
df_train_target_HLM = df_train_target['HLM']
df_train_target_HLM

0       50.680
1       50.590
2       80.892
3        2.000
4       99.990
         ...  
3466    66.126
3467    63.732
3468    93.840
3469    49.519
3470    10.867
Name: HLM, Length: 3471, dtype: float64

In [34]:
def objective(trial):

    X_train, X_valid, y_train, y_valid = train_test_split(df_train, df_train_target_HLM, test_size = 0.2, random_state=42)

    param = {
        ### constant
        'boosting_type': 'gbdt',
        'metric': 'rmse', 
        'random_state': 10,
        'n_estimators': 10000,
        'early_stopping_round ' : 100,
        'verbose' : -1,
        'n_jobs' : -1,
        ### log_uniform
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-2, 100.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-2, 100.0),
        ### suggest_discrete_uniform
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.05),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.05),
        'learning_rate': trial.suggest_discrete_uniform('learning_rate', 0.001, 0.003, 0.001),
        ### suggest_int
        'num_leaves' : trial.suggest_int('num_leaves', 3, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'max_depth': trial.suggest_int('max_depth', 3, 10, 1),
    }
    model = LGBMRegressor(**param)  
    model.fit(X_train, y_train, eval_set=[(X_valid,y_valid)])
    preds = model.predict(X_valid)
    rmse = mean_squared_error(y_valid, preds, squared=False)
    
    return rmse

In [35]:
study = optuna.create_study(direction='minimize', study_name=None)
with tqdm(total=num_of_study) as pbar:  
    def callback(study, trial):
        pbar.update(1)  
    
    study.optimize(objective, n_trials=num_of_study, callbacks=[callback])

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2023-09-06 08:23:36,941] A new study created in memory with name: no-name-c867af18-1218-49ea-b5e6-d624c29b02cd
  0%|          | 0/1000 [00:00<?, ?it/s][I 2023-09-06 08:23:38,466] Trial 0 finished with value: 31.867990837450375 and parameters: {'reg_alpha': 9.127065017087677, 'reg_lambda': 50.871585635016544, 'colsample_bytree': 0.65, 'subsample': 0.5, 'learning_rate': 0.002, 'num_leaves': 12, 'min_child_samples': 26, 'max_depth': 3}. Best is trial 0 with value: 31.867990837450375.
  0%|          | 1/1000 [00:01<25:19,  1.52s/it][I 2023-09-06 08:23:40,756] Trial 1 finished with value: 31.832716522209676 and parameters: {'reg_alpha': 3.031464533618575, 'reg_lambda': 0.13595618173771495, 'colsample_bytree': 0.65, 'subsample': 0.8, 'learning_rate': 0.001, 'num_leaves': 80, 'min_child_samples': 47, 'max_depth': 5}. Best is trial 1 with value: 31.832716522209676.
  0%|          | 2/1000 [00:03<32:49,  1.97s/it][I 2023-09-06 08:23:43,007] Trial 2 finished with value: 32.5577748483863 and p

Number of finished trials: 1000
Best trial: {'reg_alpha': 68.43069754871684, 'reg_lambda': 0.03719086628283284, 'colsample_bytree': 1.0, 'subsample': 0.8500000000000001, 'learning_rate': 0.001, 'num_leaves': 70, 'min_child_samples': 43, 'max_depth': 3}





In [36]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_min_child_samples,params_num_leaves,params_reg_alpha,params_reg_lambda,params_subsample,state
0,0,31.867991,2023-09-06 08:23:36.946624,2023-09-06 08:23:38.466966,0 days 00:00:01.520342,0.65,0.002,3,26,12,9.127065,50.871586,0.50,COMPLETE
1,1,31.832717,2023-09-06 08:23:38.467966,2023-09-06 08:23:40.756483,0 days 00:00:02.288517,0.65,0.001,5,47,80,3.031465,0.135956,0.80,COMPLETE
2,2,32.557775,2023-09-06 08:23:40.757483,2023-09-06 08:23:43.006991,0 days 00:00:02.249508,0.90,0.003,9,50,12,75.174260,0.511505,0.75,COMPLETE
3,3,31.997121,2023-09-06 08:23:43.008991,2023-09-06 08:23:45.880640,0 days 00:00:02.871649,0.50,0.002,10,21,11,0.010847,15.377199,0.65,COMPLETE
4,4,32.303720,2023-09-06 08:23:45.882640,2023-09-06 08:23:48.844308,0 days 00:00:02.961668,0.50,0.002,6,37,91,0.041648,0.039402,0.75,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,31.653801,2023-09-06 08:52:12.983568,2023-09-06 08:52:14.604933,0 days 00:00:01.621365,1.00,0.001,3,44,73,0.400035,0.078035,0.80,COMPLETE
996,996,31.646083,2023-09-06 08:52:14.606934,2023-09-06 08:52:16.158284,0 days 00:00:01.551350,1.00,0.001,3,42,49,54.253994,0.153488,0.85,COMPLETE
997,997,31.724742,2023-09-06 08:52:16.160285,2023-09-06 08:52:17.708633,0 days 00:00:01.548348,1.00,0.001,3,46,91,0.021847,10.352033,0.75,COMPLETE
998,998,32.195230,2023-09-06 08:52:17.710634,2023-09-06 08:52:21.746544,0 days 00:00:04.035910,1.00,0.001,9,44,53,72.669045,0.016044,0.80,COMPLETE


In [37]:
optuna.visualization.plot_optimization_history(study)

In [38]:
optuna.visualization.plot_param_importances(study)

In [39]:
params=study.best_params
params['metric'] = 'rmse'
params['random_state'] = 10
params['n_estimators'] = 10000 
params['early_stopping_round'] = 500
params['verbose'] = -1
params['n_jobs'] = -1

params

{'reg_alpha': 68.43069754871684,
 'reg_lambda': 0.03719086628283284,
 'colsample_bytree': 1.0,
 'subsample': 0.8500000000000001,
 'learning_rate': 0.001,
 'num_leaves': 70,
 'min_child_samples': 43,
 'max_depth': 3,
 'metric': 'rmse',
 'random_state': 10,
 'n_estimators': 10000,
 'early_stopping_round': 500,
 'verbose': -1,
 'n_jobs': -1}

In [40]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=30, random_state=48, shuffle=True)
preds_HLM = np.zeros(df_test.shape[0])
rmse=[]
n=0

for trn_idx, test_idx in kf.split(df_train, df_train_target_HLM):
    X_tr, X_val=df_train.iloc[trn_idx], df_train.iloc[test_idx]
    y_tr, y_val=df_train_target_HLM.iloc[trn_idx], df_train_target_HLM.iloc[test_idx]

    model = LGBMRegressor(**params)
    model.fit(X_tr, y_tr, eval_set=[(X_val,y_val)])

    preds_HLM+=model.predict(df_test)/kf.n_splits
    rmse.append(mean_squared_error(y_val, model.predict(X_val), squared=False))
    print(f'trial {n+1}:', round(rmse[n], 5))
    n+=1

print(f'\naverage: {round(np.mean(rmse), 5)}')

trial 1: 30.33677
trial 2: 32.73839
trial 3: 31.92891
trial 4: 34.24441
trial 5: 31.47752
trial 6: 34.67137
trial 7: 34.00506
trial 8: 30.02102
trial 9: 31.84672
trial 10: 29.97639
trial 11: 31.60835
trial 12: 31.17246
trial 13: 30.26428
trial 14: 30.79227
trial 15: 33.36801
trial 16: 33.127
trial 17: 35.66635
trial 18: 32.70842
trial 19: 32.85151
trial 20: 32.79655
trial 21: 29.77825
trial 22: 31.22371
trial 23: 30.13178
trial 24: 29.77777
trial 25: 31.15184
trial 26: 32.54016
trial 27: 33.78799
trial 28: 32.3875
trial 29: 30.83139
trial 30: 33.43227

average: 32.02148


## export csv

In [41]:
df_sample['MLM'] = preds_MLM
df_sample['HLM'] = preds_HLM
df_sample.to_csv("submission.csv", index = False, encoding = "utf-8-sig")

In [42]:
df_sample

Unnamed: 0,id,MLM,HLM
0,TEST_000,27.986085,52.786078
1,TEST_001,58.449114,73.962461
2,TEST_002,35.097823,47.239391
3,TEST_003,46.584554,67.848027
4,TEST_004,65.471971,73.483455
...,...,...,...
478,TEST_478,10.995399,30.660779
479,TEST_479,70.634273,78.272293
480,TEST_480,45.715057,66.963018
481,TEST_481,70.725188,78.022323
