In [1]:
import numpy as np
import pandas as pd
import random
import torch
import dgl
import os
import sys

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Make the parent of the current working directory importable (used for the
# project-local `utils` package). The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Font setup: use the Malgun Gothic TrueType file so Korean text renders in
# matplotlib figures.
import warnings

import matplotlib as mpl
import matplotlib.font_manager as fm

font_path = r"C:\Windows\Fonts\malgun.ttf"
if os.path.exists(font_path):
    # Register the file explicitly so the family name set below can be
    # resolved even when matplotlib's font cache does not list it yet.
    fm.fontManager.addfont(font_path)
    font_prop = fm.FontProperties(fname=font_path)
    mpl.rcParams['font.family'] = font_prop.get_name()
else:
    # The path only exists on Windows; on other hosts keep matplotlib's
    # default family instead of failing while reading a missing font file.
    warnings.warn(f"Font file not found: {font_path}; using matplotlib's default font.")
    font_prop = fm.FontProperties()

# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False

In [3]:
# Fix random seeds for reproducibility.
SEED = 100


def SET_SEED(seed=None):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random``, NumPy, PyTorch and DGL, sets a few
    environment variables, and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed to apply. When omitted, the module-level ``SEED``
            is used, so existing ``SET_SEED()`` calls behave as before.
    """
    if seed is None:
        seed = SEED

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    dgl.random.seed(seed)

    # NOTE: PYTHONHASHSEED is read when the interpreter starts, so assigning
    # it here does not change hashing in the running kernel; it is only
    # inherited by processes spawned afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # NOTE(review): the next two variables are not seed-related
    # (TF_DETERMINISTIC_OPS targets TensorFlow; KMP_DUPLICATE_LIB_OK is
    # presumably a workaround for duplicate OpenMP runtimes) - confirm they
    # are still needed.
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


SET_SEED()

In [4]:
# Dataset name used to build the CSV path below.
# Options noted by the author: freesolv / esol / lipo / VP / logvp_cp_0927
dataset = "logvp_cp_0927"

In [5]:
# Read the selected dataset's CSV from the sibling `datasets` directory.
path = f'../datasets/{dataset}.csv'
df = pd.read_csv(path)

# Define the target: taken positionally as the last column of the CSV.
target = df.iloc[:, -1]
smiles_list = df['smiles'].to_list()

# Preview the first five SMILES strings and target values.
print(smiles_list[:5], target.iloc[:5], sep='\n')

['COC(F)(F)C(F)(F)C(F)(F)F', 'COC(F)(F)C(F)(F)F', 'Brc1cc(Br)c(cc1)Oc1ccc(Br)c(Br)c1Br', 'Clc1c(Oc2ccccc2)c(Cl)ccc1Cl', 'Clc1cc(Oc2ccccc2)c(Cl)c(Cl)c1']
0    2.75
1    3.21
2   -8.14
3   -3.55
4   -3.66
Name: logvp, dtype: float64


In [6]:
from utils.utils import MolecularFeatureExtractor

# Extract molecular features for every SMILES string and build the working
# DataFrame, with the target values attached as a `target` column
# (aligned on index, as with a plain column assignment).
extractor = MolecularFeatureExtractor()
df_all_features = extractor.extract_molecular_features(smiles_list).assign(target=target)
df_all_features

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,target
0,11.734375,-6.326875,11.734375,0.055208,0.621657,200.053,197.029,200.007212,74,0,...,0,0,0,0,0,0,0,0,0,2.750000
1,11.312500,-5.609375,11.312500,0.285208,0.517465,150.046,147.022,150.010406,56,0,...,0,0,0,0,0,0,0,0,0,3.210000
2,5.865324,0.743609,5.865324,0.743609,0.348269,564.691,559.651,559.625725,94,0,...,0,0,0,0,0,0,0,0,0,-8.140000
3,6.007009,0.317611,6.007009,0.317611,0.662096,273.546,266.490,271.956248,82,0,...,0,0,0,0,0,0,0,0,0,-3.550000
4,5.999509,0.356407,5.999509,0.356407,0.662096,273.546,266.490,271.956248,82,0,...,0,0,0,0,0,0,0,0,0,-3.660000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3127,2.448171,0.882574,2.448171,0.882574,0.354496,268.529,228.209,268.313001,116,0,...,0,0,0,0,0,0,0,0,0,-2.356547
3128,12.494448,0.134167,12.494448,0.134167,0.445095,230.266,220.186,230.073165,84,0,...,0,0,0,0,0,0,0,0,0,-6.657577
3129,11.425278,-1.121250,11.425278,0.872593,0.706401,260.336,244.208,260.120115,98,0,...,0,0,0,0,0,0,0,0,0,-7.522879
3130,2.274907,1.278241,2.274907,1.278241,0.384372,254.332,240.220,254.109550,94,0,...,0,0,0,0,0,0,0,0,0,-7.045757


In [7]:
# Summary statistics: the last column (target) first, then every feature.
re1 = df_all_features.iloc[:, -1].describe().to_frame().T
re2 = df_all_features.drop(columns=['target']).describe().T

re3 = pd.concat([re1, re2])
print(re3)

# Print the statistics as LaTeX table rows (Overleaf format), one row per
# variable; underscores are stripped from the variable names.
stat_keys = ("mean", "std", "min", "25%", "50%", "75%", "max")
for feature_name, stats in re3.iterrows():
    label = feature_name.replace("_", "")
    values = " & ".join(f"{stats[key]:.2f}" for key in stat_keys)
    print(f"{label} & {values} \\\\")
    

                    count      mean       std        min       25%       50%  \
target             3132.0 -1.532371  3.085369 -10.450000 -3.688685 -0.700000   
MaxEStateIndex     3132.0  7.093560  3.609337   0.000000  3.658321  6.553684   
MinEStateIndex     3132.0 -0.141644  1.537893  -9.810281 -0.508355  0.287614   
MaxAbsEStateIndex  3132.0  7.093560  3.609337   0.000000  3.658321  6.553684   
MinAbsEStateIndex  3132.0  0.651687  0.661040   0.000000  0.192576  0.481505   
...                   ...       ...       ...        ...       ...       ...   
fr_thiazole        3132.0  0.003193  0.056424   0.000000  0.000000  0.000000   
fr_thiocyan        3132.0  0.000319  0.017869   0.000000  0.000000  0.000000   
fr_thiophene       3132.0  0.007982  0.089000   0.000000  0.000000  0.000000   
fr_unbrch_alkane   3132.0  0.672733  2.276450   0.000000  0.000000  0.000000   
fr_urea            3132.0  0.008301  0.097533   0.000000  0.000000  0.000000   

                         75%        max