# Classification approach with equal range distribution
### With and without using SMOTE

In [2]:
from warnings import simplefilter
from sklearn.decomposition import PCA
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("Genotyped.csv", index_col='index')
output = pd.read_csv("Phenotypes.csv", index_col="index")

In [4]:
num_bins = 5

In [5]:
df.shape

(599, 1279)

## Data preparation

In [6]:
df['avg_phen'] = output['average phenotypes']

In [7]:
df.head(5)

Unnamed: 0_level_0,wPt.0538,wPt.8463,wPt.6348,wPt.9992,wPt.2838,wPt.8266,wPt.1100,wPt.0653,wPt.4418,wPt.2152,...,c.408294,c.408330,c.408336,c.408375,c.408393,c.408422,c.408424,c.408426,c.408443,avg_phen
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,1,1,1,0,1,1,1,1,...,0,1,1,1,0,0,0,0,1,-0.648708
1,1,1,1,1,1,0,1,1,0,1,...,1,0,0,0,0,1,1,1,1,0.155402
2,1,1,1,1,1,0,1,1,0,1,...,1,0,0,0,0,1,1,1,1,-0.36879
3,0,1,1,1,1,0,1,1,1,1,...,1,1,1,0,0,1,0,1,0,0.483285
4,0,1,1,1,1,0,1,1,1,1,...,1,1,1,1,0,0,0,1,1,0.778173


In [8]:
output_m = output.sort_values('average phenotypes')
output_m = output_m.reset_index(drop=True)
output_m.head(5)

Unnamed: 0,V1,V2,V3,V4,average phenotypes
0,-2.224246,-2.527187,-2.267169,-1.842812,-2.339534
1,-1.305113,-2.873682,-2.228349,-1.224804,-2.135715
2,-0.881059,-2.949626,-2.300642,-1.678445,-2.043776
3,-1.227085,-1.92976,-2.75852,-1.639111,-1.971788
4,-0.767604,-2.774005,-2.260327,-1.696806,-1.933979


In [9]:
print(output_m.get_value(0, 'average phenotypes'))
print(output_m.get_value(len(output)-1, 'average phenotypes'))

AttributeError: 'DataFrame' object has no attribute 'get_value'

**So, we know that the lowest avg phenotype is -2.33<br/>
And the highest avg phenotype is 1.79**

In [10]:
highest = output_m.get_value(len(output)-1, 'average phenotypes')
lowest = (output_m.get_value(0, 'average phenotypes'))

step = (highest - lowest) / num_bins
step

AttributeError: 'DataFrame' object has no attribute 'get_value'

In [11]:
# If you are facing a bug with get_values, run this
step = 0.8272597078
lowest = -2.339534051

In [12]:
bins = []
curr = lowest
for i in range(num_bins+1):
    bins.append(curr)
    curr = curr + step

bins

[-2.339534051,
 -1.5122743431999996,
 -0.6850146353999996,
 0.1422450724000004,
 0.9695047802000004,
 1.7967644880000004]

In [13]:
labels = []

for i in range(num_bins):
    labels.append(i)
    
labels

[0, 1, 2, 3, 4]

In [48]:
binned_df = pd.cut(df['avg_phen'], bins, labels=labels)
binned_df[0:5]

index
0    2
1    3
2    2
3    3
4    3
Name: avg_phen, dtype: category
Categories (5, int64): [0 < 1 < 2 < 3 < 4]

In [55]:
# transform categorical values into integers
binned_df = binned_df.cat.codes

In [56]:
df['bin_n'] = binned_df

In [57]:
df.head(10)

Unnamed: 0_level_0,wPt.0538,wPt.8463,wPt.6348,wPt.9992,wPt.2838,wPt.8266,wPt.1100,wPt.0653,wPt.4418,wPt.2152,...,c.408330,c.408336,c.408375,c.408393,c.408422,c.408424,c.408426,c.408443,avg_phen,bin_n
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,1,1,1,0,1,1,1,1,...,1,1,1,0,0,0,0,1,-0.648708,2
1,1,1,1,1,1,0,1,1,0,1,...,0,0,0,0,1,1,1,1,0.155402,3
2,1,1,1,1,1,0,1,1,0,1,...,0,0,0,0,1,1,1,1,-0.36879,2
3,0,1,1,1,1,0,1,1,1,1,...,1,1,0,0,1,0,1,0,0.483285,3
4,0,1,1,1,1,0,1,1,1,1,...,1,1,1,0,0,0,1,1,0.778173,3
5,1,1,1,1,1,0,1,1,1,1,...,1,1,0,0,0,0,1,1,1.012035,4
6,1,1,1,1,1,0,1,1,0,1,...,1,1,0,0,0,0,1,1,0.119381,2
7,1,1,0,1,1,0,0,1,1,1,...,1,1,0,0,1,1,0,0,-0.161635,2
8,0,1,0,1,1,0,1,1,0,1,...,1,0,0,0,0,1,1,1,-1.564314,0
9,0,0,0,1,1,0,1,1,1,1,...,1,0,0,0,0,1,0,1,-1.551885,0


### Final data preparation

In [58]:
X = df.drop(columns=['avg_phen', 'bin_n'])
X.shape

(599, 1279)

In [59]:
y = df['bin_n']
y.shape

(599,)

## Data scaling

In [60]:
df.groupby(['bin_n']).count()['avg_phen']

bin_n
-1      1
 0     11
 1     68
 2    262
 3    218
 4     39
Name: avg_phen, dtype: int64

**Split the data into test and train**

In [61]:
from sklearn.model_selection import train_test_split

split = 0.2 # use 20% of the data for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=42)

In [62]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

In [63]:
X_train_smote, y_train_smote = smote.fit_sample(X_train, y_train)

In [64]:
from collections import Counter
print("Before SMOTE: ", Counter(y_train))
print("After SMOTE: ", Counter(y_train_smote))

Before SMOTE:  Counter({2: 210, 3: 171, 1: 57, 4: 30, 0: 11})
After SMOTE:  Counter({2: 210, 0: 210, 1: 210, 3: 210, 4: 210})


## Training the model

In [65]:
parameters = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [0.0, 0.1, 0.2, 0.3, 0.4],
    "gamma": [0.0, 0.1, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7]
}

In [66]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\nTime taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [67]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost

In [68]:
classifier = xgboost.XGBClassifier()

In [84]:
random_search = RandomizedSearchCV(
    classifier,
    param_distributions=parameters,
    n_iter=5,
    scoring='f1_micro',
    n_jobs=-1,
    cv=5,
    verbose=3,
)

In [85]:
from datetime import datetime
start_time = timer(None)
random_search.fit(X, y)
timer(start_time)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:   32.8s remaining:   10.3s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   38.9s finished



Time taken: 0 hours 0 minutes and 46.8 seconds.


In [87]:
best_estimator = random_search.best_estimator_
best_estimator

XGBClassifier(colsample_bytree=0.3, gamma=0.4, learning_rate=0.25, max_depth=12,
              min_child_weight=0.3, objective='multi:softprob')

In [86]:
best_params = random_search.best_params_
best_params

{'min_child_weight': 0.3,
 'max_depth': 12,
 'learning_rate': 0.25,
 'gamma': 0.4,
 'colsample_bytree': 0.3}

In [108]:
best_clf = xgboost.XGBClassifier(
    colsample_bytree = best_params['colsample_bytree'],
    gamma = best_params['gamma'],
    learning_rate = best_params['learning_rate'],
    max_depth = best_params['max_depth'],
    min_child_weight = best_params['min_child_weight'],
    objective = 'multi:softprob'
)

best_clf_smote = xgboost.XGBClassifier(
    colsample_bytree = best_params['colsample_bytree'],
    gamma = best_params['gamma'],
    learning_rate = best_params['learning_rate'],
    max_depth = best_params['max_depth'],
    min_child_weight = best_params['min_child_weight'],
    objective = 'multi:softprob'
)

In [109]:
best_clf.fit(X_train,y_train)

best_clf_smote.fit(X_train_smote,y_train_smote)

XGBClassifier(colsample_bytree=0.3, gamma=0.4, learning_rate=0.25, max_depth=12,
              min_child_weight=0.3, objective='multi:softprob')

In [111]:
y_pred = best_clf.predict(X_test)
y_pred_tr = best_clf.predict(X_train)

y_pred_smote = best_clf_smote.predict(X_test)
y_pred_tr_smote = best_clf_smote.predict(X_train_smote)

## Evaluation

**With SMOTE**

In [118]:
pd.crosstab(y_test, y_pred_smote)

col_0,1,2,3,4
bin_n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0,1,0,0
1,1,9,1,0
2,5,28,16,3
3,2,20,24,1
4,0,3,6,0


In [115]:
print('Training set:',f1_score(y_train_smote,y_pred_tr_smote, average='macro'))
print('Test set:',f1_score(y_test,y_pred_smote, average='macro'))

Training set: 1.0
Test set: 0.22229533540120308


**Without SMOTE**

In [117]:
pd.crosstab(y_test, y_pred)

col_0,1,2,3,4
bin_n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0,1,0,0
1,1,9,1,0
2,2,35,14,1
3,1,22,22,2
4,0,5,2,2


In [116]:
from sklearn.metrics import f1_score
print('Training set:',f1_score(y_train,y_pred_tr, average='macro'))
print('Test set:',f1_score(y_test,y_pred, average='macro'))

Training set: 1.0
Test set: 0.2990383310113242
