# Classification approach with equally sized bins

In [1]:
from warnings import simplefilter
from sklearn.decomposition import PCA
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Genotyped.csv", index_col='index')
output = pd.read_csv("Phenotypes.csv", index_col="index")

In [3]:
num_bins = 3

In [4]:
df.shape

(599, 1279)

## Data preparation

In [5]:
df['avg_phen'] = output['average phenotypes']

In [6]:
df.head(5)

Unnamed: 0_level_0,wPt.0538,wPt.8463,wPt.6348,wPt.9992,wPt.2838,wPt.8266,wPt.1100,wPt.0653,wPt.4418,wPt.2152,...,c.408294,c.408330,c.408336,c.408375,c.408393,c.408422,c.408424,c.408426,c.408443,avg_phen
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,1,1,1,0,1,1,1,1,...,0,1,1,1,0,0,0,0,1,-0.648708
1,1,1,1,1,1,0,1,1,0,1,...,1,0,0,0,0,1,1,1,1,0.155402
2,1,1,1,1,1,0,1,1,0,1,...,1,0,0,0,0,1,1,1,1,-0.36879
3,0,1,1,1,1,0,1,1,1,1,...,1,1,1,0,0,1,0,1,0,0.483285
4,0,1,1,1,1,0,1,1,1,1,...,1,1,1,1,0,0,0,1,1,0.778173


In [7]:
df = df.sort_values('avg_phen')
df = df.reset_index(drop=True)
df.head(5)

Unnamed: 0,wPt.0538,wPt.8463,wPt.6348,wPt.9992,wPt.2838,wPt.8266,wPt.1100,wPt.0653,wPt.4418,wPt.2152,...,c.408294,c.408330,c.408336,c.408375,c.408393,c.408422,c.408424,c.408426,c.408443,avg_phen
0,1,0,1,1,1,0,0,0,0,1,...,0,1,1,0,0,1,0,1,1,-2.339534
1,0,0,0,1,1,0,1,1,1,1,...,0,1,0,1,0,1,0,1,1,-2.135715
2,0,0,1,0,0,0,1,0,1,1,...,0,1,0,1,0,1,0,1,1,-2.043776
3,1,1,1,1,1,0,1,1,0,1,...,0,1,1,1,0,1,1,1,1,-1.971788
4,1,0,1,1,1,0,1,1,1,1,...,0,1,0,1,0,0,0,1,1,-1.933979


In [8]:
bin_size = int(round(len(df)/num_bins))
bin_size

200

In [9]:
count = 0
bin_n = 0
bins = []

for i in range(len(df)):
    bins.append(bin_n)
    count = count + 1
    if count >= bin_size and bin_n < num_bins-1:
        count = 0
        bin_n = bin_n + 1

df['bin_n'] = bins
df['bin_n'].unique()

array([0, 1, 2], dtype=int64)

In [10]:
for i in range(num_bins):
    print("The number of elements in bin ", i, " is ", df['bin_n'].loc[df['bin_n'] == i].count())

The number of elements in bin  0  is  200
The number of elements in bin  1  is  200
The number of elements in bin  2  is  199


In [11]:
# Drop the phenotypes column
df = df.drop(columns=['avg_phen'])

In [12]:
df.head(5)

Unnamed: 0,wPt.0538,wPt.8463,wPt.6348,wPt.9992,wPt.2838,wPt.8266,wPt.1100,wPt.0653,wPt.4418,wPt.2152,...,c.408294,c.408330,c.408336,c.408375,c.408393,c.408422,c.408424,c.408426,c.408443,bin_n
0,1,0,1,1,1,0,0,0,0,1,...,0,1,1,0,0,1,0,1,1,0
1,0,0,0,1,1,0,1,1,1,1,...,0,1,0,1,0,1,0,1,1,0
2,0,0,1,0,0,0,1,0,1,1,...,0,1,0,1,0,1,0,1,1,0
3,1,1,1,1,1,0,1,1,0,1,...,0,1,1,1,0,1,1,1,1,0
4,1,0,1,1,1,0,1,1,1,1,...,0,1,0,1,0,0,0,1,1,0


In [13]:
X = df.iloc[:, :-1]
X.head(5)

Unnamed: 0,wPt.0538,wPt.8463,wPt.6348,wPt.9992,wPt.2838,wPt.8266,wPt.1100,wPt.0653,wPt.4418,wPt.2152,...,c.408290,c.408294,c.408330,c.408336,c.408375,c.408393,c.408422,c.408424,c.408426,c.408443
0,1,0,1,1,1,0,0,0,0,1,...,1,0,1,1,0,0,1,0,1,1
1,0,0,0,1,1,0,1,1,1,1,...,1,0,1,0,1,0,1,0,1,1
2,0,0,1,0,0,0,1,0,1,1,...,0,0,1,0,1,0,1,0,1,1
3,1,1,1,1,1,0,1,1,0,1,...,1,0,1,1,1,0,1,1,1,1
4,1,0,1,1,1,0,1,1,1,1,...,1,0,1,0,1,0,0,0,1,1


In [14]:
y = df.iloc[:, -1]
y.head(5)

0    0
1    0
2    0
3    0
4    0
Name: bin_n, dtype: int64

## Feature selection

In [15]:
from sklearn.feature_selection import SelectKBest, f_regression

skb = SelectKBest(f_regression, k=500)
skb.fit(X, y)
X = skb.transform(X)

## Data split

In [16]:
from sklearn.model_selection import train_test_split

split = 0.2 # use 20% of the data for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=42)

## Training the model

In [17]:
parameters = {
    'n_estimators': [6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17],
    'max_depth' : [8, 10, 11, 12, 13, 15],
    'learning_rate' : [0.0001, 0.001, 0.01]
}

In [18]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\nTime taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [19]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from xgboost import XGBClassifier

In [20]:
classifier = XGBClassifier()

In [21]:
random_search = RandomizedSearchCV(
    classifier,
    param_distributions=parameters,
    n_iter=5,
    scoring='f1_micro',
    n_jobs=-1,
    cv=5,
    verbose=3,
)

In [22]:
from datetime import datetime
start_time = timer(None)
random_search.fit(X, y)
timer(start_time)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    5.0s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    6.3s finished



Time taken: 0 hours 0 minutes and 6.77 seconds.


In [23]:
best_estimator = random_search.best_estimator_
best_estimator

XGBClassifier(learning_rate=0.0001, max_depth=8, n_estimators=7,
              objective='multi:softprob')

In [24]:
best_parameters = random_search.best_params_

In [25]:
best_clf = XGBClassifier(objective='reg:squarederror',
                        n_estimators=best_parameters['n_estimators'], 
                        max_depth=best_parameters['max_depth'], 
                        learning_rate=best_parameters['learning_rate'])

In [26]:
best_clf.fit(X_train,y_train)

XGBClassifier(learning_rate=0.0001, max_depth=8, n_estimators=7,
              objective='multi:softprob')

In [27]:
y_pred = best_clf.predict(X_test)
y_pred_tr = best_clf.predict(X_train)

## Evaluation

In [28]:
pd.crosstab(y_test, y_pred)

col_0,0,1,2
bin_n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,16,17,10
1,7,19,10
2,8,12,21


In [29]:
from sklearn.metrics import f1_score
print('Training set:',f1_score(y_train,y_pred_tr, average='macro'))
print('Test set:',f1_score(y_test,y_pred, average='macro'))

Training set: 0.8830165221846854
Test set: 0.4656695022548681
