In [63]:
import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [37]:
# Constants
# Seed used as the default `random_state` of sample_rows, so the same rows
# are drawn on every run.
RANDOM_STATE = 15

# Homework 1

In [38]:
def sample_rows(dataset: pd.DataFrame, n_rows: int, random_state=RANDOM_STATE) -> pd.DataFrame:
    """Draw `n_rows` random rows from `dataset` and renumber them from zero.

    Args:
        dataset: Table to sample from.
        n_rows: Number of rows to draw (without replacement).
        random_state: Seed forwarded to `DataFrame.sample`; defaults to
            the notebook-wide RANDOM_STATE.

    Returns:
        A new DataFrame holding the sampled rows with a fresh 0..n_rows-1 index.
    """
    picked = dataset.sample(n=n_rows, random_state=random_state)
    # The original row labels are discarded rather than kept as a column.
    return picked.reset_index(drop=True)

In [39]:
# Read the full QM9 table, then work on a fixed-seed 20 000-row sample of it.
init_dataset = pd.read_csv(
    'C:/Users/User/Desktop/Магистр Мира/Утёба/Algorithms and Big D---/Homework/datasets/qm9.csv'
)

dataset = sample_rows(init_dataset, n_rows=20000)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mol_id  20000 non-null  object 
 1   smiles  20000 non-null  object 
 2   A       20000 non-null  float64
 3   B       20000 non-null  float64
 4   C       20000 non-null  float64
 5   mu      20000 non-null  float64
 6   alpha   20000 non-null  float64
 7   homo    20000 non-null  float64
 8   lumo    20000 non-null  float64
 9   gap     20000 non-null  float64
 10  r2      20000 non-null  float64
 11  zpve    20000 non-null  float64
 12  u0      20000 non-null  float64
 13  u298    20000 non-null  float64
 14  h298    20000 non-null  float64
 15  g298    20000 non-null  float64
 16  cv      20000 non-null  float64
dtypes: float64(15), object(2)
memory usage: 2.6+ MB


In [40]:
# Preview the first rows of the sampled dataset.
dataset.head()

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,r2,zpve,u0,u298,h298,g298,cv
0,gdb_79961,CN1C2C=CC(C)(O)C12,2.56049,1.63058,1.4043,2.3964,79.95,-0.2193,0.0006,0.22,1037.2613,0.169876,-403.119257,-403.110436,-403.109492,-403.15187,34.693
1,gdb_119265,CCCC1(COC)CN1,1.77297,1.11189,0.76745,0.8327,86.61,-0.2401,0.0751,0.3152,1621.0929,0.215816,-405.5168,-405.505848,-405.504904,-405.554043,39.035
2,gdb_34718,N#CC1C2C3C=CC2C13,5.22871,1.23907,1.13556,4.1975,77.6,-0.2402,-0.0135,0.2266,1050.1241,0.126456,-363.569591,-363.56308,-363.562135,-363.600437,26.475
3,gdb_6359,COC(=O)NC(=O)N,5.70446,1.13827,0.96107,5.1276,56.63,-0.2606,0.0127,0.2734,1152.93,0.106529,-453.070527,-453.061776,-453.060832,-453.104809,29.574
4,gdb_65681,OC1(CC=O)C2CCC12,3.76072,0.98897,0.94831,2.0087,75.88,-0.2439,-0.0239,0.22,1301.467,0.157352,-422.999497,-422.990475,-422.989531,-423.033909,33.331


# Homework 2

In [41]:
def get_mordred_descriptors(dataset: pd.DataFrame) -> pd.DataFrame:
    """Append 2D mordred descriptors, computed from the `smiles` column, to `dataset`.

    Args:
        dataset: Table with a `smiles` column of SMILES strings.

    Returns:
        A new DataFrame: the input columns followed by one column per
        mordred descriptor, row-aligned with the input.

    Raises:
        ValueError: If RDKit cannot parse one or more SMILES strings.
    """
    smiles_series = dataset['smiles']
    molecules = [Chem.MolFromSmiles(smiles) for smiles in smiles_series]

    # MolFromSmiles signals a parse failure by returning None; fail early
    # with the offending strings instead of an opaque error inside mordred.
    unparsed = [smiles for smiles, mol in zip(smiles_series, molecules) if mol is None]
    if unparsed:
        raise ValueError(
            f'RDKit could not parse {len(unparsed)} SMILES string(s), e.g. {unparsed[:5]}'
        )

    mordred_descriptors = Calculator(descriptors, ignore_3D=True).pandas(molecules)
    # concat(axis=1) aligns on index labels; reuse the input index so the
    # descriptors stay attached to their molecules even when `dataset`
    # does not carry a default 0..n-1 index.
    mordred_descriptors.index = dataset.index
    return pd.concat([dataset, mordred_descriptors], axis=1)

def get_rdkit_descriptors(dataset: pd.DataFrame) -> pd.DataFrame:
    """Append RDKit descriptors, computed from the `smiles` column, to `dataset`.

    Args:
        dataset: Table with a `smiles` column of SMILES strings.

    Returns:
        A new DataFrame: the input columns followed by one column per
        RDKit descriptor, row-aligned with the input.

    Raises:
        ValueError: If RDKit cannot parse one or more SMILES strings.
    """
    smiles_series = dataset['smiles']
    molecules = [Chem.MolFromSmiles(smiles) for smiles in smiles_series]

    # MolFromSmiles signals a parse failure by returning None; report the
    # offending strings instead of failing inside the descriptor code.
    unparsed = [smiles for smiles, mol in zip(smiles_series, molecules) if mol is None]
    if unparsed:
        raise ValueError(
            f'RDKit could not parse {len(unparsed)} SMILES string(s), e.g. {unparsed[:5]}'
        )

    # `Descriptors` is imported explicitly at the top of the notebook:
    # `Chem.Descriptors` is only reachable as an attribute if the submodule
    # happens to have been imported elsewhere.
    # Build the frame on the input index so concat(axis=1), which aligns on
    # labels, keeps every descriptor row next to its molecule.
    rdkit_descriptors = pd.DataFrame(
        [Descriptors.CalcMolDescriptors(mol) for mol in molecules],
        index=dataset.index,
    )
    return pd.concat([dataset, rdkit_descriptors], axis=1)

def smiles_to_labels(dataset: pd.DataFrame) -> tuple[LabelEncoder, pd.DataFrame]:
    """Replace the `smiles` strings with integer labels.

    Args:
        dataset: Table with a `smiles` column.

    Returns:
        A `(encoder, frame)` pair. `encoder` is the fitted LabelEncoder
        (use `inverse_transform` to recover the strings); `frame` is a new
        DataFrame whose `smiles` column holds int64 labels. The input
        frame is left unchanged.
    """
    smiles_encoder = LabelEncoder()
    # fit_transform fits and encodes in one pass; a separate fit() beforehand
    # would only repeat the same work.
    labels = smiles_encoder.fit_transform(dataset['smiles']).astype('int64')
    # assign() returns a new frame with the column replaced in place.
    return smiles_encoder, dataset.assign(smiles=labels)

def drop_bool_and_object_columns(dataset: pd.DataFrame) -> pd.DataFrame:
    """Remove every column whose dtype is `bool` or `object`.

    Args:
        dataset: Table to filter.

    Returns:
        A new DataFrame without the bool/object columns. The input frame
        is left unchanged.
    """
    unwanted_cols = dataset.select_dtypes(include=['bool', 'object']).columns
    # Non-inplace drop: the caller's frame must not change behind its back.
    return dataset.drop(columns=unwanted_cols)

# Drop outliers by IQR-method (currently disabled; kept commented out for reference)
# def drop_outliers_IQR(dataset: pd.DataFrame) -> pd.DataFrame:
#     q1=dataset.quantile(0.25)
#     q3=dataset.quantile(0.75)
#     IQR=q3-q1

#     non_outliers = dataset[~((dataset<(q1-1.5*IQR)) | (dataset>(q3+1.5*IQR)))]
#     outliers_dropped = non_outliers.dropna().reset_index(drop=True)
#     return outliers_dropped

def drop_nan_and_duplicates(dataset: pd.DataFrame) -> pd.DataFrame:
    """Remove rows containing NaN and fully duplicated rows, then renumber.

    Args:
        dataset: Table to clean.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. The input frame is
        left unchanged.
    """
    # Chained, non-inplace calls: the caller's frame must not be modified.
    return dataset.dropna().drop_duplicates().reset_index(drop=True)

def drop_high_corr_features(
    dataset: pd.DataFrame,
    corr_limit: float,
    protected_columns: tuple[str, ...] = ('smiles', 'gap'),
) -> pd.DataFrame:
    """Drop every feature that is highly correlated with an earlier feature.

    A column is removed when the absolute value of its correlation with
    any column to its left exceeds `corr_limit`. The comparison is made
    against all earlier columns, including ones that are removed themselves.

    Args:
        dataset: Table of numeric features plus the protected columns.
        corr_limit: Absolute-correlation threshold (strictly greater-than).
        protected_columns: Columns excluded from the correlation filter and
            always kept; they are placed first in the result, in this order.

    Returns:
        A new DataFrame: the protected columns followed by the surviving
        features. The input frame is left unchanged.
    """
    protected = list(protected_columns)
    # Set the protected columns aside so the filter can never drop them.
    kept_aside = dataset[protected]
    features = dataset.drop(columns=protected)

    corr_matrix = features.corr()
    # Strict lower triangle = correlation of each column with the columns
    # before it. NaN correlations compare as False and never trigger a drop.
    lower_abs_corr = np.tril(np.abs(corr_matrix.to_numpy()), k=-1)
    redundant = (lower_abs_corr > corr_limit).any(axis=1)

    if corr_matrix.shape[1] == features.shape[1]:
        # Select by position: dropping by label would also remove an earlier
        # column that merely shares its name with the redundant one.
        filtered = features.loc[:, ~redundant]
    else:
        # corr() skipped some columns (older pandas ignores non-numeric
        # ones), so positions no longer line up; fall back to labels.
        filtered = features.drop(columns=corr_matrix.columns[redundant].unique())

    return pd.concat([kept_aside, filtered], axis=1)

def drop_zero_std_features(dataset: pd.DataFrame) -> pd.DataFrame:
    """Keep only the columns whose standard deviation is strictly positive.

    Args:
        dataset: Table of numeric columns.

    Returns:
        A DataFrame restricted to the non-constant columns.
    """
    has_spread = dataset.std().gt(0)
    return dataset.loc[:, has_spread]

def drop_columns_with_zeros(dataset: pd.DataFrame, threshold: float) -> pd.DataFrame:
    """Drop columns whose share of zero values is at least `threshold`.

    Args:
        dataset: Table to filter.
        threshold: Fraction in [0, 1]; a column is removed when
            (number of zeros / number of rows) >= threshold.

    Returns:
        A new DataFrame without the zero-dominated columns. The input
        frame is left unchanged.
    """
    total_rows = len(dataset)
    # One vectorised pass over the whole frame instead of a per-column loop.
    zero_share = (dataset == 0).sum() / total_rows
    # For an empty frame the share is NaN, which compares False and therefore
    # keeps the column.
    mostly_zero = (zero_share >= threshold).to_numpy()
    # Positional boolean mask; the caller's frame is not modified.
    return dataset.loc[:, ~mostly_zero]

In [42]:
# Add descriptors: mordred first, then RDKit, each appended as extra columns.
dataset = dataset.pipe(get_mordred_descriptors).pipe(get_rdkit_descriptors)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Columns: 1840 entries, mol_id to fr_urea
dtypes: bool(2), float64(968), int64(428), object(442)
memory usage: 280.5+ MB


In [23]:
# Clean data and feature selection

# Drop lumo and homo because the target feature "gap" is calculated from them.
# errors='ignore' lets the cell be re-run after the columns are already gone.
dataset = dataset.drop(columns=['homo', 'lumo'], errors='ignore')

smiles_encoder, dataset = smiles_to_labels(dataset)  # keep the encoder for the inverse transform below

dataset = drop_bool_and_object_columns(dataset)

dataset = drop_nan_and_duplicates(dataset)

dataset = drop_high_corr_features(dataset, 0.75)  # drop features whose |correlation| with an earlier feature is above 0.75

dataset = drop_zero_std_features(dataset)

dataset = drop_columns_with_zeros(dataset, 0.7)  # drop features where at least 70% of the values are zero

# The outlier-removal step was taken out: there are too many descriptors, and
# for some of them removing outliers is not meaningful (for example, counts of
# functional groups). After applying that function only 1-2% of the data remains.
# dataset = drop_outliers_IQR(dataset)

# The info() listing of the loaded sample shows no 'Unnamed: 0' column, so a
# missing column is tolerated here instead of raising KeyError.
dataset = dataset.drop(columns='Unnamed: 0', errors='ignore')

# Restore the readable SMILES strings in place of the integer labels.
dataset['smiles'] = smiles_encoder.inverse_transform(dataset['smiles'])

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Columns: 109 entries, smiles to fr_bicyclic
dtypes: float64(89), int64(19), object(1)
memory usage: 16.6+ MB


In [81]:
# Preview the first rows of the cleaned dataset.
dataset.head()

Unnamed: 0,smiles,gap,A,B,mu,alpha,zpve,u0,cv,SpAbs_A,...,GGI5,JGI4,Radius,TopoShapeIndex,MinAbsEStateIndex,qed,MinPartialCharge,BCUT2D_MRHI,BCUT2D_MRLOW,fr_bicyclic
0,CN1C2C=CC(C)(O)C12,0.22,2.56049,1.63058,2.3964,79.95,0.169876,-403.119257,34.693,11.053989,...,0.0,0.037037,2,1.0,0.372685,0.363338,-0.384366,5.310269,0.10001,1
1,CCCC1(COC)CN1,0.3152,1.77297,1.11189,0.8327,86.61,0.215816,-405.5168,39.035,10.946346,...,0.055556,0.047143,3,1.0,0.378472,0.568468,-0.382721,5.026358,0.165193,0
2,N#CC1C2C3C=CC2C13,0.2266,5.22871,1.23907,4.1975,77.6,0.126456,-363.569591,26.475,11.409307,...,0.08,0.086806,3,0.666667,0.439815,0.437034,-0.198011,5.344492,-0.137629,0
3,COC(=O)NC(=O)N,0.2734,5.70446,1.13827,5.1276,56.63,0.106529,-453.070527,29.574,8.565187,...,0.0,0.032,3,0.666667,0.849537,0.446141,-0.452538,5.88926,0.171419,0
4,OC1(CC=O)C2CCC12,0.22,3.76072,0.98897,2.0087,75.88,0.157352,-422.999497,33.331,10.428639,...,0.08,0.05,3,0.666667,0.363426,0.541797,-0.388839,5.536476,-0.109746,1


In [80]:
# Save the cleaned table; index=False keeps the row index out of the file.
dataset.to_csv(
    'C:/Users/User/Desktop/Магистр Мира/Утёба/Algorithms and Big D---/Homework/datasets/clear_dataset.csv',
    index=False,
)