In [1]:
import numpy as np
import pandas as pd
import random
import torch
import dgl
import os
import sys

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Make the parent of the current working directory importable
# (the path is derived from os.getcwd(), not from this notebook's file location).
sys.path.append(os.path.abspath(os.pardir))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Font setup: point matplotlib at the Windows "malgun.ttf" font file
# (presumably so Korean glyphs render in figures — confirm with the plots below).
import matplotlib as mpl
import matplotlib.font_manager as fm

font_path = r"C:\Windows\Fonts\malgun.ttf"

if os.path.isfile(font_path):
    # Register the file explicitly so the family name set below can be
    # resolved even if matplotlib's cached font list does not contain it.
    fm.fontManager.addfont(font_path)
    font_prop = fm.FontProperties(fname=font_path)
    mpl.rcParams['font.family'] = font_prop.get_name()
else:
    # The font file is missing (e.g. not running on Windows). Keep
    # matplotlib's default family instead of aborting the notebook, and keep
    # `font_prop` defined so any later use of the name still works.
    font_prop = fm.FontProperties()
    print(f"[font] '{font_path}' not found; keeping matplotlib's default font family.")

# Render minus signs with the ASCII hyphen rather than the Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False

In [3]:
# Fix random seeds for reproducibility
SEED = 100


def SET_SEED(seed=None):
    """Seed Python, NumPy, PyTorch and DGL, and set the cuDNN determinism flags.

    Parameters
    ----------
    seed : int, optional
        Seed value to apply. When omitted, the notebook-level ``SEED`` is read
        at call time, so existing zero-argument ``SET_SEED()`` calls behave
        exactly as before.
    """
    if seed is None:
        seed = SEED

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    dgl.random.seed(seed)

    # Environment flags are set exactly as in the original cell.
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so setting it here likely only affects child processes — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    # Ask cuDNN for deterministic kernels and turn off its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


SET_SEED()

In [4]:
# Dataset to analyse; options listed by the author: freesolv / esol / lipo / VP
dataset = 'VP'
# Output folder for the PCA figures. Built with os.path.join so the separator
# matches the host OS (on Windows this is the same string as before); a literal
# backslash path is treated as a single file name on POSIX systems.
save_dir = os.path.join('..', 'results_figure', 'pca')

In [5]:
# Read the CSV that belongs to the selected dataset
path = '../datasets/{}.csv'.format(dataset)
df = pd.read_csv(path)

# SMILES strings as a plain Python list
smiles_list = df['smiles'].to_list()

# The target is taken positionally: it is whatever the last column of the CSV is
target = df.iloc[:, -1]

# Quick look at the first five entries of each
print(smiles_list[:5])
print(target.head(5))

['COC(F)(F)C(F)(F)C(F)(F)F', 'COC(F)(F)C(F)(F)F', 'Brc1cc(Br)c(cc1)Oc1ccc(Br)c(Br)c1Br', 'Clc1c(Oc2ccccc2)c(Cl)ccc1Cl', 'Clc1cc(Oc2ccccc2)c(Cl)c(Cl)c1']
0    2.75
1    3.21
2   -8.14
3   -3.55
4   -3.66
Name: logvp, dtype: float64


In [6]:
# Project-local helper; presumably resolved through the parent directory that
# was appended to sys.path in the first cell — confirm the package location.
from utils.utils import MolecularFeatureExtractor
# Extract molecular features for the SMILES list and keep them as a DataFrame
extractor = MolecularFeatureExtractor()
df_all_features = extractor.extract_molecular_features(smiles_list)
# Add the target as a new column of the returned frame (modified in place).
# NOTE(review): assigning a Series aligns on index labels, not on position —
# this assumes the extractor returns rows indexed like `df`; verify, otherwise
# targets could be silently misaligned or become NaN.
df_all_features['target'] = target
# Bare expression so the notebook renders the resulting frame
df_all_features

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,target
0,11.734375,-6.326875,11.734375,0.055208,0.621657,200.053,197.029,200.007212,74,0,...,0,0,0,0,0,0,0,0,0,2.750000
1,11.312500,-5.609375,11.312500,0.285208,0.517465,150.046,147.022,150.010406,56,0,...,0,0,0,0,0,0,0,0,0,3.210000
2,5.865324,0.743609,5.865324,0.743609,0.348269,564.691,559.651,559.625725,94,0,...,0,0,0,0,0,0,0,0,0,-8.140000
3,6.007009,0.317611,6.007009,0.317611,0.662096,273.546,266.490,271.956248,82,0,...,0,0,0,0,0,0,0,0,0,-3.550000
4,5.999509,0.356407,5.999509,0.356407,0.662096,273.546,266.490,271.956248,82,0,...,0,0,0,0,0,0,0,0,0,-3.660000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3568,12.369448,-0.052914,12.369448,0.045833,0.650472,236.270,224.174,236.083730,88,0,...,0,0,0,0,0,0,0,0,0,-5.991400
3569,12.494448,0.134167,12.494448,0.134167,0.445095,230.266,220.186,230.073165,84,0,...,0,0,0,0,0,0,0,0,0,-6.657577
3570,11.425278,-1.121250,11.425278,0.872593,0.706401,260.336,244.208,260.120115,98,0,...,0,0,0,0,0,0,0,0,0,-7.522879
3571,2.274907,1.278241,2.274907,1.278241,0.384372,254.332,240.220,254.109550,94,0,...,0,0,0,0,0,0,0,0,0,-7.045757


In [7]:
# Descriptive statistics: the target (last column) first, then every other column
re1 = df_all_features.iloc[:, -1].describe().to_frame().T
re2 = df_all_features.drop(columns=['target']).describe().T

re3 = pd.concat([re1, re2], axis=0)
print(re3)

# Print the statistics as LaTeX table rows (Overleaf format): one row per
# variable, underscores removed from the name, values rounded to 2 decimals.
for var_name, stats in re3.iterrows():
    cells = [var_name.replace("_", "")]
    cells.extend(f'{stats[key]:.2f}' for key in ("mean", "std", "min", "25%", "50%", "75%", "max"))
    print(" & ".join(cells) + ' \\\\')
    

                    count      mean       std        min       25%       50%  \
target             3573.0 -1.745140  3.325475 -13.680000 -4.020000 -0.886057   
MaxEStateIndex     3573.0  7.191745  3.657327   0.000000  3.672037  7.902778   
MinEStateIndex     3573.0 -0.147459  1.542461  -9.810281 -0.534395  0.260000   
MaxAbsEStateIndex  3573.0  7.191745  3.657327   0.000000  3.672037  7.902778   
MinAbsEStateIndex  3573.0  0.645418  0.660606   0.000000  0.182685  0.455123   
...                   ...       ...       ...        ...       ...       ...   
fr_thiazole        3573.0  0.003638  0.060218   0.000000  0.000000  0.000000   
fr_thiocyan        3573.0  0.000280  0.016730   0.000000  0.000000  0.000000   
fr_thiophene       3573.0  0.007277  0.085005   0.000000  0.000000  0.000000   
fr_unbrch_alkane   3573.0  0.698013  2.493393   0.000000  0.000000  0.000000   
fr_urea            3573.0  0.012035  0.114075   0.000000  0.000000  0.000000   

                         75%        max

In [8]:
# from utils.utils import MolecularFeatureExtractor
# # 분자 특성 추출 및 데이터프레임 정의
# extractor = MolecularFeatureExtractor()
# df_all_features = extractor.extract_molecular_features(smiles_list)

# df_all_features['target'] = target
# df_all_features

# num_all_features = df_all_features.shape[1] - 1 
# print("초기 변수 개수:", num_all_features)

# # na handling
# # NA 확인
# df_all_features[df_all_features.isna().any(axis = 1)] # 행방향

# # 결측치가 포함된 feature 개수
# print('결측치가 포함된 열 개수:', df_all_features.isna().any(axis = 0).sum(), '\n')
# print(df_all_features.isna().any(axis = 0))

# print('결측치가 포함된 행 개수:', df_all_features.isna().any(axis = 1).sum(), '\n')
# print(df_all_features.isna().any(axis = 1))

# df_removed_features = df_all_features.dropna()

# # 결측치가 포함된 feature 제거
# # df_removed_features = df_all_features.dropna(axis = 1)
# num_removed_features = df_removed_features.shape[1] - 1  # logvp 열 제외

# print("제거 후 남은 feature 개수:", num_removed_features)

# # 결측치가 제거된 data frame
# df_removed_features

# # 결측치가 포함된 feature 개수
# print('결측치가 포함된 열 개수:', df_removed_features.isna().any(axis = 0).sum(), '\n')
# print(df_removed_features.isna().any(axis = 0))

# print('결측치가 포함된 행 개수:', df_removed_features.isna().any(axis = 1).sum(), '\n')
# print(df_removed_features.isna().any(axis = 1))



# # nunique == 1 인 경우는 제거
# unique_columns = list(df_removed_features.loc[:, df_removed_features.nunique() == 1].columns)
# print('nunique == 1인 feature : \n', unique_columns, '\n')

# # nunique == 1인 feature 제거
# #df_removed_features.drop(columns = unique_columns, inplace = True)
# df_removed_features = df_removed_features.drop(columns = unique_columns).copy()

# num_removed_features = df_removed_features.shape[1] - 1  # logvp 열 제외

# print("제거 후 남은 feature 개수:", num_removed_features, '\n')
# print(df_removed_features.shape)


# # 너무 낮은 variance를 가지는 경우
# low_variances = sorted(df_removed_features.var())
# low_variances[:10]

# columns_low_variances = []

# for i in low_variances:
#     if i < 0.001:
#         column = df_removed_features.loc[:, df_removed_features.var() == i].columns
#         columns_low_variances.append(column)
# columns_low_variances = [item for index in columns_low_variances for item in index]

# # 2. 중복 제거 및 유니크 값 추출
# columns_low_variances = list(set(columns_low_variances))
# print(columns_low_variances)

# # 낮은 분산의 변수 제거
# df_removed_features = df_removed_features.drop(columns = columns_low_variances).reset_index(drop=True).copy()
# num_removed_features = df_removed_features.shape[1] - 1  # logvp 열 제외

# print("제거 후 남은 feature 개수:", num_removed_features, '\n')
# print(df_removed_features.shape)

# df_removed_features