# Classification approach

In [34]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

### Data preparation

In [43]:
# Read the genotype table; the CSV's 'index' column is used as the row index.
genotype_csv = "Genotyped.csv"
data = pd.read_csv(genotype_csv, index_col="index")
data

Unnamed: 0_level_0,wPt.0538,wPt.8463,wPt.6348,wPt.9992,wPt.2838,wPt.8266,wPt.1100,wPt.0653,wPt.4418,wPt.2152,...,c.408290,c.408294,c.408330,c.408336,c.408375,c.408393,c.408422,c.408424,c.408426,c.408443
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,1,1,1,0,1,1,1,1,...,1,0,1,1,1,0,0,0,0,1
1,1,1,1,1,1,0,1,1,0,1,...,1,1,0,0,0,0,1,1,1,1
2,1,1,1,1,1,0,1,1,0,1,...,1,1,0,0,0,0,1,1,1,1
3,0,1,1,1,1,0,1,1,1,1,...,1,1,1,1,0,0,1,0,1,0
4,0,1,1,1,1,0,1,1,1,1,...,1,1,1,1,1,0,0,0,1,1
5,1,1,1,1,1,0,1,1,1,1,...,1,0,1,1,0,0,0,0,1,1
6,1,1,1,1,1,0,1,1,0,1,...,1,0,1,1,0,0,0,0,1,1
7,1,1,0,1,1,0,0,1,1,1,...,1,1,1,1,0,0,1,1,0,0
8,0,1,0,1,1,0,1,1,0,1,...,1,1,1,0,0,0,0,1,1,1
9,0,0,0,1,1,0,1,1,1,1,...,1,0,1,0,0,0,0,1,0,1


In [50]:
# Read the phenotype table; the CSV's 'index' column is used as the row index.
output = pd.read_csv("Phenotypes.csv", index_col="index")

# Order the rows by ascending 'average phenotypes' so that later cells can cut
# the ranking into contiguous bins.
sorted_by_avg_phen = output.sort_values(by="average phenotypes", ascending=True)
sorted_by_avg_phen

Unnamed: 0_level_0,V1,V2,V3,V4,average phenotypes
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
165,-2.224246,-2.527187,-2.267169,-1.842812,-2.339534
32,-1.305113,-2.873682,-2.228349,-1.224804,-2.135715
179,-0.881059,-2.949626,-2.300642,-1.678445,-2.043776
397,-1.227085,-1.929760,-2.758520,-1.639111,-1.971788
178,-0.767604,-2.774005,-2.260327,-1.696806,-1.933979
142,-0.725475,-2.721003,-2.109903,-1.972140,-1.852127
196,-0.428965,-1.677606,-3.350597,-1.024195,-1.819056
143,-0.853540,-2.186229,-2.160582,-1.480471,-1.733451
88,-0.722958,-2.824635,-1.316521,-1.428024,-1.621371
8,-0.967639,-1.789263,-1.936040,-1.502758,-1.564314


In [89]:
# Number of (nearly) equally sized classes the ranked phenotype values are cut into.
num_bins = 5

# Keep only the averaged phenotype. It is selected by name rather than by
# position so the cell cannot silently pick a different column if the CSV
# layout changes.
pd_data = sorted_by_avg_phen[['average phenotypes']]

# Split the row positions instead of the frame itself: calling np.array_split
# directly on a DataFrame goes through the deprecated DataFrame.swapaxes path
# in recent pandas releases. The result is still a list of DataFrames, one per
# bin, ordered from the lowest to the highest phenotype values.
bin_positions = np.array_split(np.arange(len(pd_data)), num_bins)
np_array_bins = [pd_data.iloc[positions] for positions in bin_positions]
np_array_bins

[       average phenotypes
 index                    
 165             -2.339534
 32              -2.135715
 179             -2.043776
 397             -1.971788
 178             -1.933979
 142             -1.852127
 196             -1.819056
 143             -1.733451
 88              -1.621371
 8               -1.564314
 9               -1.551885
 36              -1.523855
 502             -1.483623
 122             -1.471935
 114             -1.465136
 169             -1.460224
 383             -1.439912
 204             -1.421230
 181             -1.411206
 144             -1.410028
 158             -1.386118
 130             -1.376850
 92              -1.307364
 23              -1.294496
 170             -1.272072
 379             -1.259205
 478             -1.238863
 259             -1.231282
 302             -1.188289
 358             -1.177803
 ...                   ...
 584             -0.642458
 586             -0.641099
 305             -0.635254
 577             -0.627344
 

In [142]:
# Build the classification target: one integer bin label per row of the binned
# phenotype frames. Every row index found in bin `bin_n` is labelled `bin_n`, so
# the label stays attached to the index it belongs to.
y = pd.concat(
    [
        pd.Series(bin_n, index=np_array_bins[bin_n].index)
        for bin_n in range(num_bins)
    ]
).rename("phenotype_bin")

# Each row must have landed in exactly one bin.
assert y.index.is_unique, "a row index was assigned to more than one bin"

# Sort by row index (not by value) so the labels are in index order.
# NOTE(review): this assumes `data` is also ordered by the same index and
# contains the same rows - confirm before pairing `data` with `y`.
y = y.sort_index()

Unnamed: 0_level_0,average phenotypes
index,Unnamed: 1_level_1
564,-0.468436
378,-0.461832
526,-0.460286
215,-0.460165
505,-0.450759
357,-0.449932
128,-0.448173
229,-0.444910
548,-0.443374
154,-0.443218


### Training the model

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the rows held out for testing (20%).
split = 0.2

# A fixed random_state keeps the partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=split,
    random_state=42,
)

In [None]:
# XGBRegressor is kept in scope for any later cell that still references it.
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import GridSearchCV

# The target is a discrete bin label, so a classifier is tuned here. A
# squared-error regressor produces continuous predictions, which F1 scorers
# cannot evaluate.
classifier = XGBClassifier()

# Hyper-parameter values explored exhaustively by the grid search.
grid = {
    'n_estimators': [100, 200, 300],
    'max_depth' : [3, 4, 5],
    'learning_rate' : [0.01, 0.02, 0.001]
}

# 'f1_macro' is used because the plain 'f1' scorer only handles binary targets
# while there are several bins; macro averaging also matches the final
# evaluation cell. The `iid` argument is no longer accepted by GridSearchCV in
# recent scikit-learn releases, so it is not passed.
gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=grid,
                     scoring='f1_macro',
                     cv=5)

gd_sr.fit(X_train, y_train)

best_parameters = gd_sr.best_params_
print(best_parameters)

In [None]:
from xgboost import XGBClassifier

# Rebuild the model with the best hyper-parameters found by the grid search.
# A classifier is used because the predictions are scored with F1 further down,
# which requires discrete class labels rather than continuous values.
best_clf = XGBClassifier(
    n_estimators=best_parameters['n_estimators'],
    max_depth=best_parameters['max_depth'],
    learning_rate=best_parameters['learning_rate'],
)

# Retrain on the complete training split. The names used here must be the ones
# produced by train_test_split (X_train / y_train).
best_clf.fit(X_train, y_train)

In [None]:
# Predict on both splits: the training-set predictions are kept alongside the
# test-set ones so the two can be compared in the evaluation cells.
y_pred_tr = best_clf.predict(X_train)
y_pred = best_clf.predict(X_test)

### Evaluation

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.xlabel("Actual values")
plt.ylabel("Predicted values")
plt.plot(y_test, y_pred, 'o', color='black')
plt.plot(y_train,y_pred_tr, 'x', color='red')
plt.show()

In [None]:
from sklearn.metrics import f1_score

# Macro-averaged F1: the per-class F1 scores are averaged with equal weight.
# The test score must use `y_pred`, the predictions computed for X_test.
print('Training set:', f1_score(y_train, y_pred_tr, average='macro'))
print('Test set:', f1_score(y_test, y_pred, average='macro'))