In [1]:
import numpy as np
import pandas as pd
import random
import torch
import dgl
import os
import sys

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Make the parent of the current working directory importable (presumably the
# project root that holds local packages such as descriptor_selection).
# The membership check keeps sys.path free of duplicates when this cell is re-run.
_PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if _PROJECT_ROOT not in sys.path:
    sys.path.append(_PROJECT_ROOT)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Font configuration: use malgun.ttf (Malgun Gothic) for figure text,
# presumably so that Korean labels render correctly.
import matplotlib as mpl
import matplotlib.font_manager as fm

font_path = r"C:\Windows\Fonts\malgun.ttf"
if os.path.isfile(font_path):
    font_prop = fm.FontProperties(fname=font_path)
    mpl.rcParams['font.family'] = font_prop.get_name()
else:
    # The font file is only present on Windows installations. Resolving the
    # name of a missing file raises, so fall back to matplotlib's default
    # font instead of aborting the whole notebook run.
    font_prop = fm.FontProperties()
    print(f"[font] '{font_path}' not found; keeping matplotlib's default font family.")
# Render minus signs with the ASCII hyphen instead of the Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False

In [3]:
# Fix random seeds for reproducibility
def SET_SEED(seed=None):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed to apply. When omitted, the module-level ``SEED``
            constant is used, so existing ``SET_SEED()`` calls keep working.

    Side effects:
        * Seeds Python's ``random``, NumPy, PyTorch and DGL.
        * Sets the ``PYTHONHASHSEED``, ``TF_DETERMINISTIC_OPS`` and
          ``KMP_DUPLICATE_LIB_OK`` environment variables.
        * Switches cuDNN to deterministic mode and disables its autotuner.
    """
    if seed is None:
        seed = SEED

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    dgl.random.seed(seed)

    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # only influences processes launched after this point, not this kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    # Not a seeding option; presumably a workaround for the duplicate OpenMP
    # runtime error -- TODO confirm it is still needed.
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

SEED = 100
SET_SEED()

In [4]:
# Dataset to analyse; one of: freesolv / esol / lipo / scgas / solubility_only3
dataset = "solubility_only3"

In [5]:
# Read the CSV that belongs to the dataset chosen above.
path = f'../datasets/{dataset}.csv'
df = pd.read_csv(path)

# The last column of the CSV is taken as the target.
target = df.iloc[:, -1]
smiles_list = df.loc[:, 'smiles'].to_list()

# Quick sanity check: first five SMILES strings, then first five target values.
for preview in (smiles_list[:5], target[:5]):
    print(preview)

['CCCCCCCCCCCCCCCCCC(=O)Nc1ccc(cc1)NC(=O)CCCCCCCCCCCCCCCCC', 'C/C(=C\\CC/C=C(/CC/C=C(/CCC=C(C)C)\\C)\\C)/CC/C=C(/CCC=C(C)C)\\C', 'ClCCN(c1ccc(cc1)CC(=O)O[C@H]1CC[C@]2(C(=CC[C@@H]3[C@@H]2CC[C@]2([C@H]3CC[C@@H]2[C@@H](CCCC(C)C)C)C)C1)C)CCCl', 'C[C@@H](CCC[C@]1(C)CCc2c(O1)c(C)c(c(c2C)OC(=O)c1cccnc1)C)CCC[C@@H](CCCC(C)C)C', 'CCCCCCCOc1ccc(c2c1cccc2)C(=N)[NH+](CCCCCCCC)CCCCCCCC.[Cl-]']
0   -17.468457
1   -14.592367
2   -14.523872
3   -13.798574
4   -13.619359
Name: solubility, dtype: float64


In [6]:
# Project-local helper; presumably resolved through the sys.path entry added in the first cell.
from descriptor_selection.utils import MolecularFeatureExtractor
# Extract molecular features for the SMILES list and build the feature DataFrame
extractor = MolecularFeatureExtractor()
df_all_features = extractor.extract_molecular_features(smiles_list)
# Attach the target as an extra column.
# NOTE(review): column assignment aligns on the index -- this assumes the extractor
# returns one row per input SMILES with the same index as `target`; confirm.
df_all_features['target'] = target
df_all_features

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,target
0,12.348360,0.083005,12.348360,0.083005,0.073635,641.082,564.474,640.590680,266,0,...,0,0,0,0,0,0,0,26,0,-17.468457
1,2.433950,1.170647,2.433950,1.170647,0.185870,410.730,360.330,410.391252,170,0,...,0,0,0,0,0,0,0,1,0,-14.592367
2,13.030324,-0.104670,13.030324,0.011100,0.121593,644.812,585.340,643.392285,246,0,...,0,0,0,0,0,0,0,0,0,-14.523872
3,12.743711,-0.370444,12.743711,0.143661,0.178364,535.813,482.389,535.402545,216,0,...,0,0,0,0,0,0,0,0,0,-13.798574
4,9.286665,0.000000,9.286665,0.000000,0.104454,545.296,487.840,544.415942,216,0,...,0,0,0,0,0,0,0,12,0,-13.619359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8784,3.222222,1.250000,3.222222,1.250000,0.434794,71.123,62.051,71.073499,30,0,...,0,0,0,0,0,0,0,0,0,1.150000
8785,9.000000,-0.833333,9.000000,0.833333,0.429883,60.052,56.020,60.021129,24,0,...,0,0,0,0,0,0,0,0,0,1.220000
8786,4.597222,1.652778,4.597222,1.652778,0.273315,46.073,40.025,46.053098,20,0,...,0,0,0,0,0,0,0,0,0,1.340000
8787,7.000000,1.000000,7.000000,1.000000,0.385284,32.042,28.010,32.026215,14,0,...,0,0,0,0,0,0,0,0,0,1.570000


In [9]:
# Descriptive statistics: the target row first, then one row per feature column
re1 = df_all_features.iloc[:, -1].describe().to_frame().T
re2 = df_all_features.drop(columns=['target']).describe().T

re3 = pd.concat([re1, re2], axis=0)
print(re3)

# Print each statistics row as a LaTeX table line (Overleaf format).
# Underscores are stripped from the row label before printing.
stat_columns = ["mean", "std", "min", "25%", "50%", "75%", "max"]
for feature_name, stats_row in re3.iterrows():
    latex_cells = " & ".join(f"{stats_row[col]:.2f}" for col in stat_columns)
    print(f'{feature_name.replace("_", "")} & {latex_cells} \\\\')
    

                    count      mean       std        min       25%        50%  \
target             8789.0 -2.873561  2.061027 -17.468457 -4.143210  -2.752655   
MaxEStateIndex     8789.0  9.505784  3.264105   0.000000  6.255995  10.608230   
MinEStateIndex     8789.0 -0.643083  1.266189  -6.876314 -0.920139  -0.351157   
MaxAbsEStateIndex  8789.0  9.505784  3.264105   0.000000  6.255995  10.608230   
MinAbsEStateIndex  8789.0  0.226141  0.335807   0.000000  0.000000   0.104074   
...                   ...       ...       ...        ...       ...        ...   
fr_thiazole        8789.0  0.009216  0.101341   0.000000  0.000000   0.000000   
fr_thiocyan        8789.0  0.001479  0.038433   0.000000  0.000000   0.000000   
fr_thiophene       8789.0  0.009330  0.100768   0.000000  0.000000   0.000000   
fr_unbrch_alkane   8789.0  0.482194  1.744373   0.000000  0.000000   0.000000   
fr_urea            8789.0  0.034247  0.189834   0.000000  0.000000   0.000000   

                         75