In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import numpy as np
from sklearn.preprocessing import StandardScaler
import math

In [57]:
# Path to the input CSV file
file_path = 'sample_data.csv'

df = pd.read_csv(file_path)

# DataFrame.info() writes its report directly to stdout and returns None, so
# interpolating it into an f-string would print the literal text "None" after
# the report. Print the header first, then let info() emit the feature list.
print('Feature list:')
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10019 entries, 0 to 10018
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         10019 non-null  int64  
 1   SMILE              10019 non-null  object 
 2   ID                 10019 non-null  object 
 3   MOL                10019 non-null  object 
 4   Activity           10019 non-null  int64  
 5   MolLogP            10019 non-null  float64
 6   MolWt              10019 non-null  float64
 7   HBD                10019 non-null  int64  
 8   HBA                10019 non-null  int64  
 9   NumAromaticRings   10019 non-null  int64  
 10  NumRotatableBonds  10019 non-null  int64  
 11  TPSA               10019 non-null  float64
 12  MolMR              10019 non-null  float64
 13  BertzCT            10019 non-null  float64
 14  BalabanJ           10019 non-null  float64
 15  FpDensityMorgan1   10019 non-null  float64
 16  FpDensityMorgan2   100

In [52]:
# Columns that take part in the PCA; edit this list to change the feature set.
feature_list = [
    'MolLogP',
    'MolWt',
    'HBD',
    'HBA',
    'NumAromaticRings',
    'NumRotatableBonds',
    'BalabanJ',
]

# Keep only the selected columns, in the order listed above.
df_prep = df.loc[:, feature_list]

In [53]:
def variety_selection_pca_based(df_prep, n_components=2, n_obs_each_corner=1, n_center_point=1, df_full=None):
    '''Split a dataset into a PCA-diverse training subset and the remaining rows.

    The features in ``df_prep`` are standardized and projected onto
    ``n_components`` principal components. Every observation is then described by:

    * its squared Euclidean distance to the origin of the PCA coordinate system
      (stored in the ``Distance`` column; squaring does not change the ranking), and
    * the combination of signs of its principal-component scores.

    Observations are grouped by sign combination. Within each group the
    ``n_obs_each_corner`` observations with the largest distance are chosen as
    "corner" points (as in a factorial design). In addition, the
    ``n_center_point`` observations with the smallest distance overall are chosen
    as "center" points. Corner and center points together form the training set;
    all other rows form the test set.

    Note: ``np.sign`` returns 0 for a score that is exactly zero, so such an
    observation forms a sign group of its own instead of falling into a -1/+1 group.

    Arguments:
    1) df_prep: preprocessed dataset containing only the (numeric) features used
       for the PCA decomposition.
    2) n_components: number of principal components.
    3) n_obs_each_corner: number of observations taken from each corner
       (sign combination) of the factorial design.
    4) n_center_point: number of center points.
    5) df_full: optional frame the returned rows are taken from. It must be
       row-aligned with ``df_prep`` (same number of rows, same order). When
       omitted, the notebook-level ``df`` is used, which keeps the original
       behaviour of this function.

    Returns:
    (df_train, df_test): the selected rows and the remaining rows of the
    source frame. A row that qualifies as both corner and center point appears
    only once in ``df_train``.

    Raises:
    ValueError: if the source frame and ``df_prep`` differ in number of rows.
    '''
    if df_full is None:
        # Backward-compatible fallback: earlier versions always sliced the
        # notebook-level ``df``.
        df_full = df

    if len(df_full) != len(df_prep):
        raise ValueError(
            'df_prep and the source frame must have the same number of rows '
            f'(got {len(df_prep)} and {len(df_full)})'
        )

    # Standardize data
    features_standard = StandardScaler().fit_transform(df_prep)

    # Do PCA; the resulting frame gets a fresh RangeIndex, so its index labels
    # are row positions in df_prep / df_full.
    pc_columns = [f'PC{i + 1}' for i in range(n_components)]
    pca = PCA(n_components=n_components)
    scores = pd.DataFrame(pca.fit_transform(features_standard), columns=pc_columns)

    # Squared distance of each observation to the PCA origin
    scores['Distance'] = (scores[pc_columns] ** 2).sum(axis=1)

    # One sign column per principal component
    sign_columns = [f'sign_{column}' for column in pc_columns]
    for pc_column, sign_column in zip(pc_columns, sign_columns):
        scores[sign_column] = np.sign(scores[pc_column])

    # Group by sign combination and take the farthest points of each corner.
    # The last index level of the grouped result is the original row position.
    scores = scores.sort_values(by=sign_columns + ['Distance'])
    corner_positions = (
        scores.groupby(sign_columns)['Distance']
        .nlargest(n_obs_each_corner)
        .index.get_level_values(-1)
    )

    # Points closest to the origin are the center points
    center_positions = scores['Distance'].nsmallest(n_center_point).index

    # Combine, dropping repeats while keeping first-seen order, so a row that
    # is both a corner and a center point is not duplicated in the training set.
    selected_positions = np.asarray(
        pd.unique(np.concatenate([corner_positions, center_positions])),
        dtype=np.intp,
    )

    # Split purely by position so the result does not depend on the index
    # labels of the source frame.
    df_train = df_full.iloc[selected_positions]
    is_test_row = np.ones(len(df_full), dtype=bool)
    is_test_row[selected_positions] = False
    df_test = df_full.iloc[is_test_row]

    return df_train, df_test

In [67]:
# Training set: the 20 farthest points of every PCA sign group plus the 20
# points closest to the PCA origin; all remaining rows form the test set.
df_train, df_test = variety_selection_pca_based(
    df_prep,
    n_components=2,
    n_obs_each_corner=20,
    n_center_point=20,
)

In [68]:
# Display the training subset returned by variety_selection_pca_based
df_train

Unnamed: 0.1,Unnamed: 0,SMILE,ID,MOL,Activity,MolLogP,MolWt,HBD,HBA,NumAromaticRings,...,MolMR,BertzCT,BalabanJ,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,FractionCSP3,HeavyAtomCount,HeavyAtomWt,RingCount
1717,1717,c1cccc(c12)oc(n2)SCc3c(OC)cc(c(c3)OC)CSc(n4)oc...,3281-0451nan,<rdkit.Chem.rdchem.Mol object at 0x143a6fc30>,0,6.57080,464.568,0,8,5,...,126.6860,1207.478270,1.333000,0.687500,1.125000,1.531250,0.166667,32,444.408,5
1799,1799,c1cccc(c12)cccc2CSc3sc(nn3)SCc4cccc(c45)cccc5,R839019nan,<rdkit.Chem.rdchem.Mol object at 0x143a7e0a0>,0,7.42910,430.623,0,5,5,...,126.9270,1171.116412,1.260138,0.551724,1.000000,1.517241,0.083333,29,412.479,5
1640,1640,c1cc(C)ccc1-c(cs2)c(c23)c(ncn3)SCC(=O)Nc4scc(n...,ASN4462110.0,<rdkit.Chem.rdchem.Mol object at 0x143a6da80>,0,6.52102,474.636,1,7,5,...,134.4917,1377.688511,1.311080,0.968750,1.781250,2.593750,0.083333,32,456.492,5
3589,3589,n1c(C)cc(C)c(c12)c3c(s2)c(Sc4ccccc4)nc(n3)SCc5...,5R-1315nan,<rdkit.Chem.rdchem.Mol object at 0x143a1b680>,0,7.60826,459.665,0,6,5,...,133.4950,1376.855780,1.609550,0.870968,1.645161,2.419355,0.160000,31,438.497,5
2736,2736,Clc1cccc(c12)c(cc(n2)C(F)(F)F)N3CCC(CC3)C(c4cc...,KM9879.0,<rdkit.Chem.rdchem.Mol object at 0x143a2bdf0>,0,7.95550,480.961,0,2,4,...,131.4190,1228.076954,1.465831,0.823529,1.411765,1.970588,0.250000,34,456.769,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3077,3077,Fc1ccc(cc1)CS/C(N)=N/N=C/c2ccc(cc2)OC,S12409.0,<rdkit.Chem.rdchem.Mol object at 0x143a214d0>,0,3.41630,317.389,1,4,2,...,89.9054,654.096066,1.854482,1.181818,1.863636,2.500000,0.125000,22,301.261,2
2122,2122,FC(F)(F)c1sc(nn1)NC(=O)c2c(SCC)cccc2,7841438nan,<rdkit.Chem.rdchem.Mol object at 0x143a36f10>,0,3.92120,333.360,1,5,2,...,75.7042,642.673043,2.166968,1.333333,2.047619,2.666667,0.250000,21,323.280,2
2530,2530,n12c3c(CC1)cccc3c(O)c(c2=O)C(=O)Nc4c(C)cc(C)cc4,L199893nan,<rdkit.Chem.rdchem.Mol object at 0x143a2e340>,0,3.13234,334.375,2,4,3,...,97.4460,1096.272069,1.850178,1.200000,2.000000,2.760000,0.200000,25,316.231,4
6648,6648,c1ccccc1CCNC(=O)CCc2ccc(cc2)OC,R333336nan,<rdkit.Chem.rdchem.Mol object at 0x1439dbca0>,0,2.98670,283.371,1,2,2,...,84.4707,549.190826,1.669821,0.952381,1.619048,2.285714,0.277778,21,262.203,2


In [69]:
# Display the remaining rows (test subset) returned by variety_selection_pca_based
df_test

Unnamed: 0.1,Unnamed: 0,SMILE,ID,MOL,Activity,MolLogP,MolWt,HBD,HBA,NumAromaticRings,...,MolMR,BertzCT,BalabanJ,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,FractionCSP3,HeavyAtomCount,HeavyAtomWt,RingCount
0,0,Cc1nnsc1C(=O)Nc2ccc(cc2)[N+]3=NC(C)=C(C3C)C(=O...,SEW4170.0,<rdkit.Chem.rdchem.Mol object at 0x1439906d0>,0,3.43422,386.457,1,7,2,...,100.1846,946.161631,1.730604,1.259259,1.962963,2.555556,0.333333,27,366.297,3
1,1,O=[*](O)c1cc(ccc1)NC(=O)c2snnc2C,CD5498.0,<rdkit.Chem.rdchem.Mol object at 0x143990740>,0,0.59762,251.267,2,5,2,...,61.5805,613.920302,2.195491,1.588235,2.411765,3.058824,0.100000,17,242.195,2
2,2,Clc1cc(ccc1)NC(=O)c2snnc2C,CD5493.0,<rdkit.Chem.rdchem.Mol object at 0x1439907b0>,0,2.75222,253.714,1,4,2,...,64.0512,526.287623,2.171011,1.500000,2.312500,2.937500,0.100000,16,245.650,2
3,3,Cc1nnsc1C(=O)NCc2nc(sc2)-c3sccc3,SCR1304.0,<rdkit.Chem.rdchem.Mol object at 0x143990820>,0,2.96152,322.440,1,7,3,...,81.2432,716.735581,1.646334,1.400000,2.300000,3.000000,0.166667,20,312.360,3
4,4,n1ncsc1NC(=O)c2ccc(cc2)N(C3=O)C(=O)[C@@H]([C@H...,4682-0061nan,<rdkit.Chem.rdchem.Mol object at 0x143990890>,0,2.10190,366.402,1,6,2,...,94.5142,907.855630,1.372770,1.076923,1.730769,2.307692,0.277778,26,352.290,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10014,10014,Brc1c(C)ccc(c12)NC(=O)C\2=N/N(C3=O)C(=O)c(c34)...,AG-690/08986005nan,<rdkit.Chem.rdchem.Mol object at 0x143b54ac0>,0,2.70992,384.189,1,4,2,...,90.9877,946.041215,1.846783,1.000000,1.625000,2.250000,0.058824,24,374.109,4
10015,10015,Brc1sc(cc1Br)S(=O)(=O)N2CCCCC2,SEW3416.0,<rdkit.Chem.rdchem.Mol object at 0x143b54b30>,0,3.44770,389.134,0,3,1,...,72.3288,458.198468,2.302123,1.187500,1.812500,2.312500,0.555556,16,378.046,2
10016,10016,BrC1=C[N+](CC)=NC1C(=O)Nc2c(F)cc(F)cc2,7493387nan,<rdkit.Chem.rdchem.Mol object at 0x143b54ba0>,0,3.00640,331.140,1,2,1,...,69.0151,586.337270,2.141505,1.526316,2.263158,2.842105,0.250000,19,320.052,2
10017,10017,BrC1=C[N+](C)=NC1C(=O)N(CC2)Cc(c23)cccc3,AK-968/11737007nan,<rdkit.Chem.rdchem.Mol object at 0x143b54c10>,0,2.28430,321.198,0,2,1,...,74.9734,594.108803,1.787945,1.421053,2.157895,2.842105,0.357143,19,306.078,3
