# Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.model_selection import PredefinedSplit


In [2]:
import tabulate

# Functions

In [3]:
# creates the feature name with the mz and rt

def feature_name_creation(xcms_file_path):
    table = pd.read_csv(xcms_file_path, index_col=[0]) 
    
    # no need for decimal on m/z (low resolution) and only one decimal for rt
    table.mz = table.mz.round(0).astype(int)
    table.rt = table.rt.round(1)

    # creating the feature name: mz_rt
    features = table["mz"].astype(str) + "_" + table["rt"].astype(str)
    table.insert(0, 'features', features) # first column
    
    # drop as we don't know how many columns the table will have. Drop the known ones. 
    # There should only be the 'features' column and the samples
    
    table_clean = table.drop(['isotopes', 'adduct','pcgroup'], axis=1) #'npeaks','NEG_GROUP', 'POS_GROUP',
    
    return table_clean

In [4]:
# rounds the mz and rt columns along with its min and max

def rounder(dataframe):
    table = dataframe 
    
    table.mz = table.mz.round(0).astype(int)
    table.mzmin = table.mzmin.round(0).astype(int)
    table.mzmax = table.mzmax.round(0).astype(int)
    
    table.rt = table.rt.div(60).round(1)
    table.rtmin = table.rtmin.div(60).round(1)
    table.rtmax = table.rtmax.div(60).round(1)

    
    return table

# Data Prep Pipeline

`Train` and `Val` sets were processed separately on `xcms` - excludes the possibility of data leakage 
But, when processing is separated, the features can be slightly different. The compounds are almost the same, but due to processing steps, there can be shifts on the decimals of `mz` or `rt`. 
For this reason, creating the feature name concatenating `mz_rt` on train and val might not produce the same features, and machine learning training is not possible with that. 

Errors observed in this case are related to the fact that features observed in train were not present in validation and vice versa or the order of the features were different in both datasets. This 'pipeline' fixes this issue.

**Steps:**

1. Creates the name for the features on `Train` set - this is the set used as reference. Whatever features where observed here, should appear on `Val`. The name is created concatenating `mz` and `rt` columns (`mz_rt`)
2. Creates a correspondance between the feature on `Train` and `Val` set, giving val set the same column names as the train, when the feature is present 
    1. round `mz` and `rt` from `Val` and `Train` 
    2. for each `mz` in `Val`, search for a range on `mzmin` and `mzmax` on train that fits. The `mz_val` need to be between `mzmin_train` and `mzmax_train` 
    3. If a match is found, for each `rt`,`rtmin` and `rtmax` on `Val` search for a range on `rtmin` and `rtmax` on `Train` that fits. The `rt` values need to be between `rtmin_train` and `rtmax_train`. The `rtmin` and `rtmax` from `Val` are used in this case because ocasionally, the range on `Val` or train is too big (big difference in `rt` between samples)
    4. if a match, take the feature name from `Train` and apply on the match
    
**With the features names created:**

3. Features on `Train` and `Val` are ordered 
4. Duplicates are deleted based on the `npeaks` columnn
5. Features that were observed in `Val` but no correspondence was found in `Train` have names filled with `nan`. These are deleted.
4. Features that are on `Train`and were not found in `Val` are added to `Val` and filled with zero (no presence of that feature)
 
 
**To fix: **
 The code for the feature correspondence is not optimized. 
 - After the match with `mz`, the loop searches on the whole dataset for a match in `rt`. This takes more computation, unecessary. 
 - If there is a match of two features, the last one is kept. Could keep both, filter later? 
 


## Feature reference creation - train set

In [5]:
# train is loaded using the function to create the feature names - feature names are created using mz and rt.  
maytenus_train = feature_name_creation('7. Gridsearch/maytenus_train_processing_masters_balanced.csv').reset_index(drop=True)

In [6]:
maytenus_train.head()

Unnamed: 0,features,mz,mzmin,mzmax,rt,rtmin,rtmax,npeaks,NEG_GROUP,POS_GROUP,...,IL9_3,IL96_1,IL96_2,IL96_3,IL97_1,IL97_2,IL97_3,IL99_1,IL99_2,IL99_3
0,106_559.5,106,105.890462,106.619987,559.5,554.904,566.201,898,450,448,...,634995300.0,668886200.0,371563300.0,327618800.0,408899900.0,396868100.0,419346600.0,748293400.0,594141900.0,912260300.0
1,106_466.4,106,105.880025,106.460218,466.4,456.486,502.44,872,443,428,...,267447600.0,533976700.0,555431800.0,404434500.0,303750800.0,488412800.0,351315700.0,172781800.0,239983800.0,299802300.0
2,114_53.0,114,113.128105,114.118386,53.0,36.06,116.686,147,69,78,...,25470110.0,30496280.0,28930300.0,29945070.0,25938740.0,26048110.0,22151560.0,22687410.0,24383740.0,24372330.0
3,116_57.6,116,115.12145,116.121342,57.6,49.191,129.09,504,268,230,...,26400490.0,29866010.0,28970320.0,33085710.0,27867530.0,28336730.0,27477980.0,25180950.0,25730600.0,25855090.0
4,116_55.7,116,116.122339,117.106564,55.7,49.404,176.879,112,56,55,...,24945920.0,30313440.0,29048030.0,30676530.0,26317010.0,25837400.0,23162310.0,23346970.0,24010650.0,24636470.0


## Loading validation val set

In [7]:
# val will be loaded using regular read_csv - the names of the features will come based on comparison
maytenus_val = pd.read_csv('7. Gridsearch/maytenus_val_processing_masters_balanced.csv',index_col=[0]).reset_index(drop=True).drop(['isotopes', 'adduct','pcgroup'], axis=1) #'npeaks','NEG_GROUP', 'POS_GROUP',

## Rounding mz and rt

In [8]:
# rouding all mz and all rt
maytenus_val = rounder(maytenus_val)
maytenus_train = rounder(maytenus_train)

In [9]:
display(maytenus_val.iloc[:,0:7].head())
display(maytenus_train.iloc[:,0:7].head())

Unnamed: 0,mz,mzmin,mzmax,rt,rtmin,rtmax,npeaks
0,106,106,107,9.3,9.2,9.4,232
1,106,106,106,7.8,7.6,8.0,219
2,115,115,116,0.9,0.8,1.8,127
3,117,116,117,7.3,7.2,7.5,164
4,133,133,133,0.9,0.8,1.0,229


Unnamed: 0,features,mz,mzmin,mzmax,rt,rtmin,rtmax
0,106_559.5,106,106,107,9.3,9.2,9.4
1,106_466.4,106,106,106,7.8,7.6,8.4
2,114_53.0,114,113,114,0.9,0.6,1.9
3,116_57.6,116,115,116,1.0,0.8,2.2
4,116_55.7,116,116,117,0.9,0.8,2.9


In [10]:
display(maytenus_val)
display(maytenus_train)

Unnamed: 0,mz,mzmin,mzmax,rt,rtmin,rtmax,npeaks,NEG_GROUP,POS_GROUP,AQ15_1,...,IL88_3,IL89_1,IL89_2,IL89_3,IL90_1,IL90_2,IL90_3,IL93_1,IL93_2,IL93_3
0,106,106,107,9.3,9.2,9.4,232,116,116,461500700.0,...,314596700.0,461774900.0,720180500.0,369951100.0,407041800.0,453488000.0,467032100.0,429995700.0,465883400.0,412128800.0
1,106,106,106,7.8,7.6,8.0,219,110,109,395362000.0,...,591879700.0,544013800.0,314930200.0,534282000.0,519121400.0,445463900.0,489225500.0,406041400.0,335261900.0,432434500.0
2,115,115,116,0.9,0.8,1.8,127,63,61,24740230.0,...,37745570.0,28316450.0,27031310.0,28154680.0,27578050.0,27451410.0,26719130.0,24955080.0,25110500.0,24732490.0
3,117,116,117,7.3,7.2,7.5,164,87,77,43185740.0,...,76590820.0,61833500.0,65291680.0,63419330.0,78728280.0,77134400.0,77243430.0,46284430.0,43245590.0,47525960.0
4,133,133,133,0.9,0.8,1.0,229,113,116,456751500.0,...,485041000.0,321160800.0,315903200.0,310996000.0,305069300.0,299285500.0,299757800.0,321721100.0,319003200.0,325932100.0
5,145,144,145,9.3,9.2,9.4,101,51,50,70973080.0,...,227411900.0,68526220.0,78810600.0,134152200.0,90675780.0,62250510.0,97738180.0,96890010.0,72756010.0,73415780.0
6,162,162,163,0.7,0.6,0.7,118,56,62,38364510.0,...,49515750.0,44336390.0,47190790.0,48205980.0,42603560.0,42655660.0,43367680.0,44757140.0,46908120.0,46416580.0
7,181,180,181,0.7,0.6,0.8,118,41,77,67410460.0,...,90431280.0,126933700.0,119224000.0,116709200.0,119110200.0,119144600.0,121521800.0,78464610.0,80448620.0,75243610.0
8,191,191,191,1.4,0.7,1.7,191,96,90,190705700.0,...,272426600.0,256464800.0,252373800.0,250245300.0,233979200.0,230970800.0,230467300.0,240047600.0,238147400.0,238019500.0
9,195,194,195,0.7,0.6,0.8,54,50,4,30596240.0,...,21344050.0,20257730.0,20728940.0,19573260.0,21184740.0,19810310.0,22505580.0,18241270.0,20410670.0,19189550.0


Unnamed: 0,features,mz,mzmin,mzmax,rt,rtmin,rtmax,npeaks,NEG_GROUP,POS_GROUP,...,IL9_3,IL96_1,IL96_2,IL96_3,IL97_1,IL97_2,IL97_3,IL99_1,IL99_2,IL99_3
0,106_559.5,106,106,107,9.3,9.2,9.4,898,450,448,...,6.349953e+08,6.688862e+08,3.715633e+08,3.276188e+08,4.088999e+08,3.968681e+08,4.193466e+08,7.482934e+08,5.941419e+08,9.122603e+08
1,106_466.4,106,106,106,7.8,7.6,8.4,872,443,428,...,2.674476e+08,5.339767e+08,5.554318e+08,4.044345e+08,3.037508e+08,4.884128e+08,3.513157e+08,1.727818e+08,2.399838e+08,2.998023e+08
2,114_53.0,114,113,114,0.9,0.6,1.9,147,69,78,...,2.547011e+07,3.049628e+07,2.893030e+07,2.994507e+07,2.593874e+07,2.604811e+07,2.215156e+07,2.268741e+07,2.438374e+07,2.437233e+07
3,116_57.6,116,115,116,1.0,0.8,2.2,504,268,230,...,2.640049e+07,2.986601e+07,2.897032e+07,3.308571e+07,2.786753e+07,2.833673e+07,2.747798e+07,2.518095e+07,2.573060e+07,2.585509e+07
4,116_55.7,116,116,117,0.9,0.8,2.9,112,56,55,...,2.494592e+07,3.031344e+07,2.904803e+07,3.067653e+07,2.631701e+07,2.583740e+07,2.316231e+07,2.334697e+07,2.401065e+07,2.463647e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,893_42.7,893,892,893,0.7,0.7,0.8,93,39,54,...,1.976996e+07,1.878941e+07,1.878887e+07,1.813212e+07,2.007749e+07,2.032207e+07,1.903845e+07,1.769231e+07,1.827080e+07,1.766570e+07
302,894_42.9,894,893,894,0.7,0.6,0.8,169,84,85,...,1.950175e+07,2.071258e+07,1.901446e+07,2.009669e+07,2.017525e+07,2.072174e+07,1.866410e+07,2.073306e+07,1.802566e+07,1.818957e+07
303,894_576.6,894,893,894,9.6,9.5,9.7,131,64,67,...,1.378702e+07,1.193479e+07,1.367345e+07,1.249813e+07,1.228192e+07,1.090482e+07,1.260544e+07,1.780989e+07,1.133627e+07,1.065290e+07
304,902_221.9,902,901,902,3.7,3.7,4.1,77,72,5,...,7.522417e+06,8.047253e+06,9.279076e+06,9.610303e+06,8.076853e+06,1.065710e+07,9.148790e+06,3.711126e+06,6.212511e+06,4.968906e+06


## Feature creation and correspondance on val set - create a function of this

In [11]:
# creating the column
maytenus_val['features'] = np.nan

In [12]:
# loop over maytenus_val items. 
# Each mz will be tested against all mzmin and mzmax range from train. 
# if in range, test for rt.
# if in range, use the same feature name from train

maytenus_val = maytenus_val.sort_values('npeaks', ascending=False,ignore_index=True)
maytenus_train_ref = maytenus_train.sort_values('npeaks', ascending=False,ignore_index=True)

for i in range(len(maytenus_val)):
    for j in range(len(maytenus_train_ref)):


        if ((maytenus_val.loc[i,'mz'] <= maytenus_train_ref.loc[j,'mzmax']) 
              & (maytenus_val.loc[i,'mz'] >= maytenus_train_ref.loc[j,'mzmin'])):
            
            #maybe subset maytenus train and then perform things on the subset? 
            
            if (
                ((maytenus_val.loc[i,'rt'] <= maytenus_train_ref.loc[j,'rtmax']) 
                  & (maytenus_val.loc[i,'rt'] >= maytenus_train_ref.loc[j,'rtmin'])) or
            
               ((maytenus_val.loc[i,'rtmin'] <= maytenus_train_ref.loc[j,'rtmax']) 
                  & (maytenus_val.loc[i,'rtmin'] >= maytenus_train_ref.loc[j,'rtmin'])) or
                
               ((maytenus_val.loc[i,'rtmax'] <= maytenus_train_ref.loc[j,'rtmax']) 
                & (maytenus_val.loc[i,'rtmax'] >= maytenus_train_ref.loc[j,'rtmin']))
            ):
                
                maytenus_val.loc[i,'features'] = maytenus_train_ref.loc[j,'features']
            break

In [13]:
maytenus_val

Unnamed: 0,mz,mzmin,mzmax,rt,rtmin,rtmax,npeaks,NEG_GROUP,POS_GROUP,AQ15_1,...,IL89_1,IL89_2,IL89_3,IL90_1,IL90_2,IL90_3,IL93_1,IL93_2,IL93_3,features
0,561,561,562,3.9,2.3,6.0,244,26,116,95865150.0,...,102131700.0,109029100.0,104731600.0,142469600.0,141738300.0,137501000.0,135359600.0,133429000.0,143654400.0,561_236.1
1,106,106,107,9.3,9.2,9.4,232,116,116,461500700.0,...,461774900.0,720180500.0,369951100.0,407041800.0,453488000.0,467032100.0,429995700.0,465883400.0,412128800.0,106_559.5
2,133,133,133,0.9,0.8,1.0,229,113,116,456751500.0,...,321160800.0,315903200.0,310996000.0,305069300.0,299285500.0,299757800.0,321721100.0,319003200.0,325932100.0,133_55.4
3,289,289,290,3.8,2.7,4.6,221,52,116,99710910.0,...,192359800.0,195436900.0,192350400.0,215315900.0,211612100.0,209927300.0,137295600.0,139412200.0,142479800.0,289_226.5
4,106,106,106,7.8,7.6,8.0,219,110,109,395362000.0,...,544013800.0,314930200.0,534282000.0,519121400.0,445463900.0,489225500.0,406041400.0,335261900.0,432434500.0,
5,191,191,191,1.4,0.7,1.7,191,96,90,190705700.0,...,256464800.0,252373800.0,250245300.0,233979200.0,230970800.0,230467300.0,240047600.0,238147400.0,238019500.0,191_90.0
6,579,578,579,3.8,3.0,6.2,166,12,112,22279910.0,...,133159200.0,142102400.0,135580000.0,149572100.0,151167400.0,150123400.0,114791800.0,121911900.0,119934900.0,579_224.5
7,117,116,117,7.3,7.2,7.5,164,87,77,43185740.0,...,61833500.0,65291680.0,63419330.0,78728280.0,77134400.0,77243430.0,46284430.0,43245590.0,47525960.0,117_439.5
8,834,833,834,3.9,2.2,5.2,138,10,92,21352410.0,...,15105760.0,19186310.0,17096630.0,33738970.0,35225430.0,21563960.0,28348030.0,21640070.0,28071130.0,834_230.7
9,433,432,433,4.4,3.2,5.2,132,51,66,29870720.0,...,41602690.0,35587600.0,40974660.0,40975250.0,40484430.0,40257610.0,54279500.0,53519370.0,50817970.0,433_265.2


In [14]:
# the process can create duplicates, so removing them is necessary
# the removal is based on the npeaks column. The feature with more npeaks, is kept.
maytenus_val = maytenus_val.sort_values('npeaks', ascending=False).drop_duplicates('features').sort_index()

# dropping unnecessary columns
maytenus_val = maytenus_val.drop(['mz', 'mzmin', 'mzmax', 'rt', 
                                  'rtmin', 'rtmax', 'npeaks','NEG_GROUP', 'POS_GROUP'], axis=1)

# removing the duplicates that might arise with the train is also necessary
# drop possible duplicates for train as well
maytenus_train_ref = maytenus_train_ref.sort_values('npeaks', ascending=False).drop_duplicates('features').sort_index()

# dropping unnecessary  columns
maytenus_train_ref = maytenus_train_ref.drop(['mz', 'mzmin', 'mzmax', 'rt', 
                                      'rtmin', 'rtmax', 'npeaks','NEG_GROUP', 'POS_GROUP'], axis=1)

# val set might have some feature that don't fit in any range - their feature names will be nan, so need to remove
# train might have some features that wont appear in the val. So, create them in val and set them to zero. 
# first, set index on both to be the features, so its possible to do that.
maytenus_train_ref= maytenus_train_ref.set_index('features')
maytenus_val = maytenus_val.dropna().set_index('features') # dropping na and making feature as index

# set method to get the set of index values that are unique 
# subtracting the sets to get the different indexes. 
# concat method to concatenate train and val
# filling the missing values on the concatenation with 0 using the fillna method.

unique_indexes = list(set(maytenus_train_ref.index) - set(maytenus_val.index))
maytenus_val = pd.concat([maytenus_val, pd.DataFrame(index=unique_indexes, columns=maytenus_val.columns)], sort=True).fillna(0)

# order both val and train features equally
# sort the features - the model needs them at the same sequence
maytenus_train_grid = maytenus_train_ref.reset_index().sort_values(by='features')
maytenus_val = maytenus_val.reset_index().sort_values(by='index')



In [15]:
maytenus_val

Unnamed: 0,index,AQ15_1,AQ15_2,AQ15_3,AQ24_1,AQ24_2,AQ24_3,AQ29_1,AQ29_2,AQ29_3,...,il3dez.16_1,il3dez.16_2,il3jun.17_1,il3jun.17_2,il4set.17_1,il4set.17_2,il5jun.17_1,il5jun.17_2,il5mar.17_1,il5mar.17_2
184,106_466.4,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,106_559.5,4.615007e+08,3.921856e+08,3.651777e+08,4.973705e+08,4.113189e+08,5.847028e+08,5.886159e+08,5.089171e+08,5.064581e+08,...,3.094679e+08,4.548742e+08,4.643353e+08,3.289325e+08,3.454915e+08,4.407722e+08,2.950395e+08,4.926769e+08,3.204264e+08,3.833416e+08
177,114_53.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
246,116_55.7,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
10,116_57.6,2.474023e+07,2.607785e+07,1.997996e+07,2.559537e+07,2.561218e+07,2.215755e+07,2.300451e+07,2.136276e+07,2.345134e+07,...,6.425616e+07,6.655169e+07,4.277934e+07,4.169153e+07,4.568345e+07,4.622658e+07,1.012460e+08,1.034758e+08,1.035189e+08,9.939187e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293,893_576.2,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
231,894_42.9,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
82,894_576.6,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
183,902_221.9,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


In [16]:
maytenus_train

Unnamed: 0,features,mz,mzmin,mzmax,rt,rtmin,rtmax,npeaks,NEG_GROUP,POS_GROUP,...,IL9_3,IL96_1,IL96_2,IL96_3,IL97_1,IL97_2,IL97_3,IL99_1,IL99_2,IL99_3
0,106_559.5,106,106,107,9.3,9.2,9.4,898,450,448,...,6.349953e+08,6.688862e+08,3.715633e+08,3.276188e+08,4.088999e+08,3.968681e+08,4.193466e+08,7.482934e+08,5.941419e+08,9.122603e+08
1,106_466.4,106,106,106,7.8,7.6,8.4,872,443,428,...,2.674476e+08,5.339767e+08,5.554318e+08,4.044345e+08,3.037508e+08,4.884128e+08,3.513157e+08,1.727818e+08,2.399838e+08,2.998023e+08
2,114_53.0,114,113,114,0.9,0.6,1.9,147,69,78,...,2.547011e+07,3.049628e+07,2.893030e+07,2.994507e+07,2.593874e+07,2.604811e+07,2.215156e+07,2.268741e+07,2.438374e+07,2.437233e+07
3,116_57.6,116,115,116,1.0,0.8,2.2,504,268,230,...,2.640049e+07,2.986601e+07,2.897032e+07,3.308571e+07,2.786753e+07,2.833673e+07,2.747798e+07,2.518095e+07,2.573060e+07,2.585509e+07
4,116_55.7,116,116,117,0.9,0.8,2.9,112,56,55,...,2.494592e+07,3.031344e+07,2.904803e+07,3.067653e+07,2.631701e+07,2.583740e+07,2.316231e+07,2.334697e+07,2.401065e+07,2.463647e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,893_42.7,893,892,893,0.7,0.7,0.8,93,39,54,...,1.976996e+07,1.878941e+07,1.878887e+07,1.813212e+07,2.007749e+07,2.032207e+07,1.903845e+07,1.769231e+07,1.827080e+07,1.766570e+07
302,894_42.9,894,893,894,0.7,0.6,0.8,169,84,85,...,1.950175e+07,2.071258e+07,1.901446e+07,2.009669e+07,2.017525e+07,2.072174e+07,1.866410e+07,2.073306e+07,1.802566e+07,1.818957e+07
303,894_576.6,894,893,894,9.6,9.5,9.7,131,64,67,...,1.378702e+07,1.193479e+07,1.367345e+07,1.249813e+07,1.228192e+07,1.090482e+07,1.260544e+07,1.780989e+07,1.133627e+07,1.065290e+07
304,902_221.9,902,901,902,3.7,3.7,4.1,77,72,5,...,7.522417e+06,8.047253e+06,9.279076e+06,9.610303e+06,8.076853e+06,1.065710e+07,9.148790e+06,3.711126e+06,6.212511e+06,4.968906e+06


## Bring the class data column

In [17]:
# load
classes_train = pd.read_csv('7. Gridsearch/classes_train_masters_maytenus.csv', index_col=[0])
classes_val = pd.read_csv('7. Gridsearch/classes_val_masters_maytenus.csv', index_col=[0])

# unite
maytenus_train_grid = maytenus_train_grid.set_index('features').T.join(classes_train)
display(maytenus_train_grid.head())

maytenus_val = maytenus_val.set_index('index').T.join(classes_val)
display(maytenus_val.head())

Unnamed: 0,106_466.4,106_559.5,114_53.0,116_55.7,116_57.6,117_439.5,118_109.9,128_565.1,129_463.4,129_561.0,...,866_204.8,867_232.0,868_226.2,893_42.7,893_576.2,894_42.9,894_576.6,902_221.9,918_205.7,class
AQ1_1,429035700.0,437896500.0,28495210.0,28199900.0,30809250.0,56356630.0,21249370.0,66356780.0,62888600.0,81867880.0,...,16042070.0,31334590.0,23242460.0,20505280.0,9858042.0,18016290.0,10277430.0,5111201.0,1559059.0,0
AQ1_2,353514100.0,406366500.0,25546810.0,25414060.0,28422400.0,62979560.0,19959130.0,60956250.0,63084770.0,81729750.0,...,14431760.0,34699230.0,20705750.0,17986810.0,10950860.0,18675970.0,11302910.0,5911845.0,1455983.0,0
AQ1_3,420212700.0,412808800.0,26531000.0,26797590.0,28479850.0,54334930.0,22295030.0,69606880.0,65373410.0,86637660.0,...,13649790.0,32325090.0,19548980.0,19262930.0,10801920.0,19442960.0,10829530.0,4796057.0,1183601.0,0
AQ10_1,509247500.0,426910100.0,29606000.0,29505820.0,31066440.0,75290400.0,7446117.0,53576280.0,91855380.0,72223010.0,...,14023720.0,28927840.0,15982720.0,17199810.0,11941920.0,17143940.0,12046980.0,4636775.0,1092701.0,0
AQ10_2,316808200.0,540357000.0,28537450.0,29227230.0,29737930.0,74558410.0,6591689.0,54765950.0,65998760.0,86219260.0,...,10513840.0,15965340.0,15036480.0,21025170.0,13166520.0,18717040.0,13347610.0,4663278.0,600886.2,0


Unnamed: 0,106_466.4,106_559.5,114_53.0,116_55.7,116_57.6,117_439.5,118_109.9,128_565.1,129_463.4,129_561.0,...,866_204.8,867_232.0,868_226.2,893_42.7,893_576.2,894_42.9,894_576.6,902_221.9,918_205.7,class
AQ15_1,0.0,461500700.0,0.0,0.0,24740230.0,43185740.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
AQ15_2,0.0,392185600.0,0.0,0.0,26077850.0,47966160.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
AQ15_3,0.0,365177700.0,0.0,0.0,19979960.0,41201920.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
AQ24_1,0.0,497370500.0,0.0,0.0,25595370.0,52027500.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
AQ24_2,0.0,411318900.0,0.0,0.0,25612180.0,54134680.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [18]:
maytenus_train_grid

Unnamed: 0,106_466.4,106_559.5,114_53.0,116_55.7,116_57.6,117_439.5,118_109.9,128_565.1,129_463.4,129_561.0,...,866_204.8,867_232.0,868_226.2,893_42.7,893_576.2,894_42.9,894_576.6,902_221.9,918_205.7,class
AQ1_1,4.290357e+08,4.378965e+08,2.849521e+07,2.819990e+07,3.080925e+07,5.635663e+07,2.124937e+07,6.635678e+07,6.288860e+07,8.186788e+07,...,1.604207e+07,3.133459e+07,2.324246e+07,2.050528e+07,9.858042e+06,1.801629e+07,1.027743e+07,5.111201e+06,1.559059e+06,0
AQ1_2,3.535141e+08,4.063665e+08,2.554681e+07,2.541406e+07,2.842240e+07,6.297956e+07,1.995913e+07,6.095625e+07,6.308477e+07,8.172975e+07,...,1.443176e+07,3.469923e+07,2.070575e+07,1.798681e+07,1.095086e+07,1.867597e+07,1.130291e+07,5.911845e+06,1.455983e+06,0
AQ1_3,4.202127e+08,4.128088e+08,2.653100e+07,2.679759e+07,2.847985e+07,5.433493e+07,2.229503e+07,6.960688e+07,6.537341e+07,8.663766e+07,...,1.364979e+07,3.232509e+07,1.954898e+07,1.926293e+07,1.080192e+07,1.944296e+07,1.082953e+07,4.796057e+06,1.183601e+06,0
AQ10_1,5.092475e+08,4.269101e+08,2.960600e+07,2.950582e+07,3.106644e+07,7.529040e+07,7.446117e+06,5.357628e+07,9.185538e+07,7.222301e+07,...,1.402372e+07,2.892784e+07,1.598272e+07,1.719981e+07,1.194192e+07,1.714394e+07,1.204698e+07,4.636775e+06,1.092701e+06,0
AQ10_2,3.168082e+08,5.403570e+08,2.853745e+07,2.922723e+07,2.973793e+07,7.455841e+07,6.591689e+06,5.476595e+07,6.599876e+07,8.621926e+07,...,1.051384e+07,1.596534e+07,1.503648e+07,2.102517e+07,1.316652e+07,1.871704e+07,1.334761e+07,4.663278e+06,6.008862e+05,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
IL97_2,4.884128e+08,3.968681e+08,2.604811e+07,2.583740e+07,2.833673e+07,6.378108e+07,2.450308e+07,5.522429e+07,6.496408e+07,7.291878e+07,...,1.988848e+07,2.736400e+07,5.377363e+07,2.032207e+07,1.126592e+07,2.072174e+07,1.090482e+07,1.065710e+07,3.307328e+06,1
IL97_3,3.513157e+08,4.193466e+08,2.215156e+07,2.316231e+07,2.747798e+07,5.918665e+07,2.062946e+07,5.806267e+07,6.288025e+07,7.606871e+07,...,1.230893e+07,4.936625e+07,1.748761e+07,1.903845e+07,1.220539e+07,1.866410e+07,1.260544e+07,9.148790e+06,2.661092e+05,1
IL99_1,1.727818e+08,7.482934e+08,2.268741e+07,2.334697e+07,2.518095e+07,5.603864e+07,7.564978e+06,5.869960e+07,5.638383e+07,7.606257e+07,...,1.332012e+07,2.163348e+07,1.985294e+07,1.769231e+07,1.066751e+07,2.073306e+07,1.780989e+07,3.711126e+06,6.765164e+05,1
IL99_2,2.399838e+08,5.941419e+08,2.438374e+07,2.401065e+07,2.573060e+07,6.067079e+07,6.868867e+06,5.563497e+07,6.095756e+07,7.377506e+07,...,1.235675e+07,4.543994e+07,1.991397e+07,1.827080e+07,1.149048e+07,1.802566e+07,1.133627e+07,6.212511e+06,6.761015e+05,1


In [19]:
maytenus_train.to_csv('features_train_comparison.csv')

Data is now ready for ANY machine learning process

# Machine learning

## X y split

In [20]:
X_train = maytenus_train_grid.drop("class", axis=1)
y_train = maytenus_train_grid["class"]

X_val = maytenus_val.drop("class", axis=1)
y_val = maytenus_val["class"]

In [21]:
y_train.value_counts()

class
0    460
1    460
Name: count, dtype: int64

In [22]:
y_val.value_counts()

class
1    133
0    116
Name: count, dtype: int64

## Training

In [23]:
# https://stackoverflow.com/questions/31948879/using-explicit-predefined-validation-set-for-grid-search-with-sklearn
# https://stackoverflow.com/questions/48390601/explicitly-specifying-test-train-sets-in-gridsearchcv

# Create a list of indices for the training and validation sets
train_indices = np.ones(len(X_train))
val_indices = np.zeros(len(X_val))
cv_indices = np.concatenate((train_indices, val_indices))


# model
svm = SVC()
rf = RandomForestClassifier(random_state=2187)
knn = KNeighborsClassifier()

# params of each model

param_svm = {}
param_svm['model'] = [svm]
param_svm['model__kernel'] = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
param_svm['model__C'] = [1, 0.9]
param_svm['model__kernel'] = ['rbf']

param_rf = {}
param_rf['model'] = [rf]
#param_rf['model__max_depth'] = [5,10,15]
param_rf['model__min_samples_leaf'] = [5,10,15]
param_rf['model__n_estimators'] = [200,250, 300]
param_rf['model__criterion'] = ['gini', 'entropy']


param_knn = {}
param_knn['model'] = [knn]
param_knn['model__n_neighbors'] = [5,15,25]
param_knn['model__weights'] = ['uniform','distance']


# uniting param to test in gridsearch

params_gridsearch = [param_svm,param_rf,param_knn]

# no need to encode or transform data. All is numeric and same scale

# pipe - starts with svm 
pipe = Pipeline([('model', svm)])

cv = PredefinedSplit(cv_indices)

# gridsearch 
grid = GridSearchCV(pipe, params_gridsearch, 
                    cv = cv,
                   scoring = ['f1','matthews_corrcoef'],
                   return_train_score = True, 
                   refit = 'matthews_corrcoef',
                   verbose = 3)

In [24]:
gridsearch_fit = grid.fit(np.vstack((X_train, X_val)), np.hstack((y_train, y_val)))


Fitting 2 folds for each of 26 candidates, totalling 52 fits
[CV 1/2] END model=SVC(), model__C=1, model__kernel=rbf; f1: (train=0.988, test=0.902) matthews_corrcoef: (train=0.976, test=0.785) total time=   0.4s
[CV 2/2] END model=SVC(), model__C=1, model__kernel=rbf; f1: (train=1.000, test=0.930) matthews_corrcoef: (train=1.000, test=0.856) total time=   0.0s
[CV 1/2] END model=SVC(), model__C=0.9, model__kernel=rbf; f1: (train=0.987, test=0.927) matthews_corrcoef: (train=0.974, test=0.841) total time=   0.0s
[CV 2/2] END model=SVC(), model__C=0.9, model__kernel=rbf; f1: (train=1.000, test=0.938) matthews_corrcoef: (train=1.000, test=0.874) total time=   0.0s
[CV 1/2] END model=RandomForestClassifier(random_state=2187), model__criterion=gini, model__min_samples_leaf=5, model__n_estimators=200; f1: (train=0.997, test=0.696) matthews_corrcoef: (train=0.993, test=0.000) total time=   0.8s
[CV 2/2] END model=RandomForestClassifier(random_state=2187), model__criterion=gini, model__min_samp

[CV 2/2] END model=RandomForestClassifier(random_state=2187), model__criterion=entropy, model__min_samples_leaf=15, model__n_estimators=250; f1: (train=1.000, test=0.969) matthews_corrcoef: (train=1.000, test=0.940) total time=   0.2s
[CV 1/2] END model=RandomForestClassifier(random_state=2187), model__criterion=entropy, model__min_samples_leaf=15, model__n_estimators=300; f1: (train=0.989, test=1.000) matthews_corrcoef: (train=0.978, test=1.000) total time=   1.4s
[CV 2/2] END model=RandomForestClassifier(random_state=2187), model__criterion=entropy, model__min_samples_leaf=15, model__n_estimators=300; f1: (train=1.000, test=0.969) matthews_corrcoef: (train=1.000, test=0.940) total time=   0.2s
[CV 1/2] END model=KNeighborsClassifier(), model__n_neighbors=5, model__weights=uniform; f1: (train=0.988, test=0.709) matthews_corrcoef: (train=0.976, test=0.327) total time=   0.6s
[CV 2/2] END model=KNeighborsClassifier(), model__n_neighbors=5, model__weights=uniform; f1: (train=1.000, test=

In [46]:
(0.785 + 0.856 + 0.841 + 0.874)/ 4 

0.8390000000000001

In [47]:
((0.785 + 0.856)/2 + (0.841 + 0.874)/2 )/ 2 

0.839

In [25]:
grid.best_params_

{'model': RandomForestClassifier(min_samples_leaf=10, n_estimators=300, random_state=2187),
 'model__criterion': 'gini',
 'model__min_samples_leaf': 10,
 'model__n_estimators': 300}

In [26]:
# 0.8574613792005097 before balancing the classes and fixing the issues
grid.best_score_



0.9709026022357814

In [27]:
round(0.9709026022357814,3)

0.971

In [28]:
results = pd.DataFrame(grid.cv_results_)

In [44]:
results[['param_model','mean_test_f1','std_test_f1','mean_test_matthews_corrcoef','std_test_matthews_corrcoef']]

Unnamed: 0,param_model,mean_test_f1,std_test_f1,mean_test_matthews_corrcoef,std_test_matthews_corrcoef
0,SVC(),0.915651,0.013956,0.820579,0.035888
1,SVC(),0.932525,0.005695,0.857639,0.016635
2,"RandomForestClassifier(min_samples_leaf=10, n_...",0.832681,0.136346,0.46985,0.46985
3,"RandomForestClassifier(min_samples_leaf=10, n_...",0.83211,0.135775,0.468799,0.468799
4,"RandomForestClassifier(min_samples_leaf=10, n_...",0.832681,0.136346,0.46985,0.46985
5,"RandomForestClassifier(min_samples_leaf=10, n_...",0.984513,0.015487,0.96985,0.03015
6,"RandomForestClassifier(min_samples_leaf=10, n_...",0.983942,0.016058,0.968799,0.031201
7,"RandomForestClassifier(min_samples_leaf=10, n_...",0.985083,0.014917,0.970903,0.029097
8,"RandomForestClassifier(min_samples_leaf=10, n_...",0.980335,0.015891,0.961641,0.030326
9,"RandomForestClassifier(min_samples_leaf=10, n_...",0.983942,0.016058,0.968799,0.031201


In [29]:
results[['param_model','mean_train_matthews_corrcoef','mean_test_matthews_corrcoef']].groupby('param_model',sort=False).mean(['mean_train_matthews_corrcoef','mean_test_matthews_corrcoef']).round(3)

Unnamed: 0_level_0,mean_train_matthews_corrcoef,mean_test_matthews_corrcoef
param_model,Unnamed: 1_level_1,Unnamed: 2_level_1
SVC(),0.988,0.839
"RandomForestClassifier(min_samples_leaf=10, n_estimators=300, random_state=2187)",0.992,0.803
KNeighborsClassifier(),0.987,0.796


In [30]:
results[['param_model','mean_train_f1','mean_test_f1']].groupby('param_model',sort=False).mean(['mean_train_f1','mean_test_f1']).round(3)

Unnamed: 0_level_0,mean_train_f1,mean_test_f1
param_model,Unnamed: 1_level_1,Unnamed: 2_level_1
SVC(),0.994,0.924
"RandomForestClassifier(min_samples_leaf=10, n_estimators=300, random_state=2187)",0.996,0.934
KNeighborsClassifier(),0.993,0.904


In [31]:
results[['param_model','mean_test_f1','std_test_f1','mean_test_matthews_corrcoef','std_test_matthews_corrcoef']].groupby('param_model',sort=False).mean(['mean_test_f1','std_test_f1','mean_test_matthews_corrcoef','std_test_matthews_corrcoef']).round(3)

Unnamed: 0_level_0,mean_test_f1,std_test_f1,mean_test_matthews_corrcoef,std_test_matthews_corrcoef
param_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SVC(),0.924,0.01,0.839,0.026
"RandomForestClassifier(min_samples_leaf=10, n_estimators=300, random_state=2187)",0.934,0.056,0.803,0.177
KNeighborsClassifier(),0.904,0.055,0.796,0.126


In [42]:
results[['param_model','mean_test_f1','std_test_f1','mean_test_matthews_corrcoef','std_test_matthews_corrcoef']].groupby('param_model',sort=False).std().round(3)

Unnamed: 0_level_0,mean_test_f1,std_test_f1,mean_test_matthews_corrcoef,std_test_matthews_corrcoef
param_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SVC(),0.012,0.006,0.026,0.014
"RandomForestClassifier(min_samples_leaf=10, n_estimators=300, random_state=2187)",0.073,0.059,0.242,0.213
KNeighborsClassifier(),0.053,0.056,0.13,0.136


# Exploratory

In [32]:
gridsearch_fit.best_estimator_['model']

In [33]:
feat_imp = gridsearch_fit.best_estimator_['model'].feature_importances_

In [34]:
data = {'feature': X_train.columns, 'importance': feat_imp}
pd.DataFrame(data).sort_values(by='importance', ascending=False).to_csv('feat_imp.csv')