In [176]:
from warnings import simplefilter
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import f1_score
import xgboost
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np

In [177]:
df = pd.read_csv("Genotyped.csv", index_col='index')
output = pd.read_csv("Phenotypes.csv", index_col="index")

In [178]:
num_steps = 5
num_bins = 3
split = 0.2

In [179]:
df.shape

(599, 1279)

In [180]:
df['avg_phen'] = output['average phenotypes']
df_sort = df.sort_values('avg_phen', ascending=False)
df_sort = df_sort.reset_index(drop=True)

In [181]:
highest = df_sort.iloc[0, -1:][0]
lowest = df_sort.iloc[-1, -1:][0]
highest, lowest

(1.7967644880000002, -2.339534051)

In [182]:
step = (highest - lowest) / num_steps
step

0.8272597078

In [183]:
border = highest
df_final = df_sort.iloc[:, :-1]

for i in range(num_bins):
    df_final.loc[df_sort['avg_phen'] <= border, 'bin_n'] = i
    border = border - step
    

In [184]:
X_base = df_final.drop(columns=['bin_n'])
X_base.shape

(599, 1279)

In [185]:
y_base = df_final['bin_n']
y_base.shape

(599,)

In [186]:
def selectNFeatures(X, y, n):
    skb = SelectKBest(f_regression, k=n)
    skb.fit(X, y)
    print("\nSample with ", n, " best paramenets")
    return skb.transform(X)

In [187]:
def doSMOTE(X, y):
    smote = SMOTE()
    X_after, y_after = smote.fit_resample(X, y)
    print("\nSMOTE applied")
    print("Before SMOTE: ", Counter(y))
    print("After SMOTE: ", Counter(y_after), '\n')
    return X_after, y_after

In [188]:
def trainModel(X, y, X_train, X_test, y_train, y_test):
    parameters = {
        'n_estimators': [10,50,100],
        'max_depth' : [4,5,7,10],
        'learning_rate' : [0.0001, 0.001, 0.01]
    }
    classifier = xgboost.XGBClassifier()
    random_search = RandomizedSearchCV(
        classifier,
        param_distributions=parameters,
        n_iter=5,
        scoring='f1_micro',
        n_jobs=-1,
        cv=5,
        verbose=3,
    )
    random_search.fit(X, y)
    best_estimator = random_search.best_estimator_
    best_params = random_search.best_params_
    print ("\nBest parameters: ", best_estimator, "\n")
    
    best_clf = xgboost.XGBClassifier(objective='reg:squarederror',
                        n_estimators=best_params['n_estimators'], 
                        max_depth=best_params['max_depth'], 
                        learning_rate=best_params['learning_rate'])
    
    best_clf.fit(X_train,y_train)
    
    y_pred = best_clf.predict(X_test)
    y_pred_tr = best_clf.predict(X_train)
    
    return y_pred, y_pred_tr
    

In [189]:
def evaluate(y_train, y_pred_tr, y_test, y_pred):
    print('Training set:',f1_score(y_train,y_pred_tr, average='macro'))
    print('Test set:',f1_score(y_test,y_pred, average='macro'))

In [190]:
def confusionMatrix(y_test, y_pred):
    print('\n\n', pd.crosstab(y_test, y_pred), '\n\n')

In [191]:
for n in range(100, 700, 100):
    y = y_base
    X = selectNFeatures(X_base, y, n)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=42)
        
    # Train the model without SMOTE
    y_pred, y_pred_tr = trainModel(X, y, X_train, X_test, y_train, y_test)
    
    # Evaluating the model without SMOTE
    print('Evaluation for n=', n, '. Without SMOTE')
    evaluate(y_train, y_pred_tr, y_test, y_pred)
    confusionMatrix(y_test, y_pred)
    
    X, y = doSMOTE(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=42)
    
    #training the model with SMOTE
    y_pred, y_pred_tr = trainModel(X, y, X_train, X_test, y_train, y_test)
    
    # Evaluating the model with SMOTE
    print('Evaluation for n=', n, '. With SMOTE')
    evaluate(y_train, y_pred_tr, y_test, y_pred)
    confusionMatrix(y_test, y_pred)
    


Sample with  100  best paramenets
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    4.7s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    5.0s finished



Best parameters:  XGBClassifier(learning_rate=0.01, max_depth=7, n_estimators=50,
              objective='multi:softprob') 

Evaluation for n= 100 . Without SMOTE
Training set: 0.7665113681491853
Test set: 0.42235092235092236


 col_0  0.0  1.0  2.0
bin_n               
0.0      0    5    2
1.0      0   22   25
2.0      1   10   55 



SMOTE applied
Before SMOTE:  Counter({2.0: 342, 1.0: 218, 0.0: 39})
After SMOTE:  Counter({0.0: 342, 1.0: 342, 2.0: 342}) 

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    1.6s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    3.1s finished



Best parameters:  XGBClassifier(learning_rate=0.001, max_depth=10, n_estimators=50,
              objective='multi:softprob') 

Evaluation for n= 100 . With SMOTE
Training set: 0.9106815933529466
Test set: 0.7788387323271043


 col_0  0.0  1.0  2.0
bin_n               
0.0     60    6    1
1.0     10   44    8
2.0      3   17   57 



Sample with  200  best paramenets
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    5.1s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    5.7s finished



Best parameters:  XGBClassifier(learning_rate=0.01, max_depth=4, objective='multi:softprob') 

Evaluation for n= 200 . Without SMOTE
Training set: 0.7050754613290163
Test set: 0.38368942885071916


 col_0  0.0  1.0  2.0
bin_n               
0.0      0    5    2
1.0      1   16   30
2.0      0    9   57 



SMOTE applied
Before SMOTE:  Counter({2.0: 342, 1.0: 218, 0.0: 39})
After SMOTE:  Counter({0.0: 342, 1.0: 342, 2.0: 342}) 

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    3.0s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    3.9s finished



Best parameters:  XGBClassifier(learning_rate=0.01, max_depth=5, objective='multi:softprob') 

Evaluation for n= 200 . With SMOTE
Training set: 0.8671365116184079
Test set: 0.6927755819060167


 col_0  0.0  1.0  2.0
bin_n               
0.0     63    3    1
1.0     18   33   11
2.0      6   22   49 



Sample with  300  best paramenets
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    1.4s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    2.3s finished



Best parameters:  XGBClassifier(learning_rate=0.001, max_depth=7, n_estimators=10,
              objective='multi:softprob') 

Evaluation for n= 300 . Without SMOTE
Training set: 0.8608778352625666
Test set: 0.3773337700622339


 col_0  0.0  1.0  2.0
bin_n               
0.0      0    4    3
1.0      4   19   24
2.0      1   16   49 



SMOTE applied
Before SMOTE:  Counter({2.0: 342, 1.0: 218, 0.0: 39})
After SMOTE:  Counter({0.0: 342, 1.0: 342, 2.0: 342}) 

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:   11.2s remaining:    3.5s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   13.2s finished



Best parameters:  XGBClassifier(learning_rate=0.01, max_depth=10, objective='multi:softprob') 

Evaluation for n= 300 . With SMOTE
Training set: 0.962403458250957
Test set: 0.7954525449249895


 col_0  0.0  1.0  2.0
bin_n               
0.0     65    2    0
1.0     11   43    8
2.0      4   16   57 



Sample with  400  best paramenets
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    3.9s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    5.0s finished



Best parameters:  XGBClassifier(learning_rate=0.0001, max_depth=7, n_estimators=10,
              objective='multi:softprob') 

Evaluation for n= 400 . Without SMOTE
Training set: 0.8322789367675143
Test set: 0.3481096681096681


 col_0  0.0  1.0  2.0
bin_n               
0.0      0    4    3
1.0      6   13   28
2.0      0   13   53 



SMOTE applied
Before SMOTE:  Counter({2.0: 342, 1.0: 218, 0.0: 39})
After SMOTE:  Counter({0.0: 342, 1.0: 342, 2.0: 342}) 

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    3.9s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    9.2s finished



Best parameters:  XGBClassifier(learning_rate=0.0001, max_depth=7, n_estimators=50,
              objective='multi:softprob') 

Evaluation for n= 400 . With SMOTE
Training set: 0.9041458862245452
Test set: 0.7057079525568734


 col_0  0.0  1.0  2.0
bin_n               
0.0     64    2    1
1.0     12   36   14
2.0      5   25   47 



Sample with  500  best paramenets
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    1.8s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    5.5s finished



Best parameters:  XGBClassifier(learning_rate=0.001, max_depth=7, n_estimators=10,
              objective='multi:softprob') 

Evaluation for n= 500 . Without SMOTE
Training set: 0.8162946870050742
Test set: 0.37693481578373667


 col_0  0.0  1.0  2.0
bin_n               
0.0      0    4    3
1.0      4   20   23
2.0      2   17   47 



SMOTE applied
Before SMOTE:  Counter({2.0: 342, 1.0: 218, 0.0: 39})
After SMOTE:  Counter({0.0: 342, 1.0: 342, 2.0: 342}) 

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:   16.7s remaining:    5.2s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   18.1s finished



Best parameters:  XGBClassifier(learning_rate=0.001, max_depth=10, n_estimators=50,
              objective='multi:softprob') 

Evaluation for n= 500 . With SMOTE
Training set: 0.9561826919689729
Test set: 0.719678938487855


 col_0  0.0  1.0  2.0
bin_n               
0.0     65    1    1
1.0      7   45   10
2.0      3   35   39 



Sample with  600  best paramenets
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    1.5s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    4.7s finished



Best parameters:  XGBClassifier(learning_rate=0.001, max_depth=4, n_estimators=10,
              objective='multi:softprob') 

Evaluation for n= 600 . Without SMOTE
Training set: 0.6592521661646086
Test set: 0.3228308563340411


 col_0  0.0  1.0  2.0
bin_n               
0.0      0    4    3
1.0      1   11   35
2.0      0   13   53 



SMOTE applied
Before SMOTE:  Counter({2.0: 342, 1.0: 218, 0.0: 39})
After SMOTE:  Counter({0.0: 342, 1.0: 342, 2.0: 342}) 

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:   19.8s remaining:    6.2s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   25.8s finished



Best parameters:  XGBClassifier(learning_rate=0.01, max_depth=7, n_estimators=50,
              objective='multi:softprob') 

Evaluation for n= 600 . With SMOTE
Training set: 0.9436437708679701
Test set: 0.7438226031793201


 col_0  0.0  1.0  2.0
bin_n               
0.0     65    1    1
1.0      8   45    9
2.0      4   29   44 


