### $Imports$

In [20]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statistics
import seaborn as sns
import pickle
import os
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

## scikit-learn: model selection (RandomizedSearchCV / GridSearchCV), estimators, metrics, feature selection, preprocessing
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, SelectKBest, RFE,SelectFromModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

import xgboost

### $Load-Data$

In [24]:
# Read the already-split feature and label tables from the data/ folder.
X_train, X_test = pd.read_csv("data/X_train.csv"), pd.read_csv("data/X_test.csv")
y_train, y_test = pd.read_csv("data/y_train.csv"), pd.read_csv("data/y_test.csv")

# Report every split's shape with one print call (one message per line).
print(
    f"X_train data size is {X_train.shape}",
    f"y_train data size is {y_train.shape}",
    f"X_test data size is {X_test.shape}",
    f"y_test data size is {y_test.shape}",
    sep="\n",
)

# Bare last expression: rendered as the cell's rich output.
X_train.head()

X_train data size is (320, 2050)
y_train data size is (320, 1)
X_test data size is (80, 2050)
y_test data size is (80, 1)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2030nan,2031nan,2032nan,2033nan,2034nan,2035nan,2036nan,2037nan,2038nan,2039nan
0,0.677003,0.770278,0.464682,0.636718,0.557013,0.754993,0.333917,0.604475,0.463783,0.822309,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.294634,0.611371,0.567744,0.677142,0.397884,0.617367,0.353233,0.61604,0.726383,0.841969,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.555728,0.561163,0.500994,0.61033,0.56508,0.585206,0.458547,0.58255,0.601539,0.699926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.777906,0.700939,0.0,0.39203,1.0,0.314974,0.300878,0.48497,0.222519,0.306051,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.515798,0.690732,0.42455,0.767389,0.638821,0.641882,0.405122,0.682799,0.451246,0.646281,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### $Load-Variables$

In [4]:
def load_pickle_var(path):
    """Return the object stored in the pickle file at ``path``.

    Parameters
    ----------
    path : str or path-like
        Location of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever object ``pickle.load`` reconstructs from the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only call this on files
    from a trusted source.
    """
    with open(path, 'rb') as handle:
        # The handle is closed by the context manager even on early return.
        return pickle.load(handle)

In [11]:
# Restore the pickled list of column labels (strings such as '5', '34', ...)
# that is later used to subset X_train / X_test.
# Presumably written by an earlier feature-selection step — confirm provenance.
selected_feat = load_pickle_var("data/selected_feat.pkl")
# Bare last expression: echoes the full list as the cell's output.
selected_feat

['5',
 '34',
 '41',
 '61',
 '75',
 '81',
 '89',
 '97',
 '124',
 '129',
 '130',
 '132',
 '140',
 '142',
 '174',
 '193',
 '194',
 '203',
 '212',
 '214',
 '220',
 '230',
 '241',
 '258',
 '265',
 '284',
 '301',
 '311',
 '324',
 '325',
 '328',
 '352',
 '353',
 '384',
 '385',
 '399',
 '430',
 '431',
 '448',
 '450',
 '506',
 '510',
 '528',
 '529',
 '549',
 '559',
 '566',
 '574',
 '575',
 '581',
 '585',
 '604',
 '642',
 '654',
 '657',
 '691',
 '720',
 '743',
 '749',
 '753',
 '755',
 '765',
 '769',
 '781',
 '787',
 '793',
 '799',
 '807',
 '811',
 '817',
 '851',
 '874',
 '900',
 '926',
 '930',
 '937',
 '947',
 '951',
 '960',
 '964',
 '981',
 '988',
 '998',
 '999',
 '1008',
 '1015',
 '1028',
 '1031',
 '1035',
 '1043',
 '1046',
 '1053',
 '1063',
 '1064',
 '1098',
 '1112',
 '1117',
 '1118',
 '1131',
 '1139',
 '1166',
 '1189',
 '1212',
 '1223',
 '1228',
 '1230',
 '1249',
 '1266',
 '1267',
 '1289',
 '1291',
 '1314',
 '1345',
 '1358',
 '1387',
 '1405',
 '1417',
 '1429',
 '1441',
 '1455',
 '1458',
 '14

### $Filter-Data-On-Selected-Features$

In [13]:
# Restrict both splits to the columns listed in selected_feat (same order for each).
X_train, X_test = X_train[selected_feat], X_test[selected_feat]

In [14]:
# Confirm the column count after filtering; one message per line.
print(
    f"X_train data size is {X_train.shape}",
    f"X_test data size is {X_test.shape}",
    sep="\n",
)

# Bare last expression: rendered as the cell's rich output.
X_train.head()

X_train data size is (320, 168)
X_test data size is (80, 168)


Unnamed: 0,5,34,41,61,75,81,89,97,124,129,...,1969,1980,1982,1987,1989,1998,2008,2014,2034,2036
0,0.754993,0.47629,0.5497,0.223811,0.318243,0.450309,0.346974,0.582555,0.873543,0.583716,...,0.704871,0.609452,0.725294,0.599339,0.789853,0.505903,0.874758,0.639539,0.56976,0.335839
1,0.617367,0.532144,0.674371,0.25244,0.567918,0.091346,0.345141,0.464188,0.525994,0.541977,...,0.165528,0.472455,0.602483,0.556016,0.351939,0.629717,0.658036,0.598357,0.416565,0.622415
2,0.585206,0.456709,0.717187,0.576661,0.672607,0.195169,0.481008,0.555114,0.577618,0.340582,...,0.341482,0.303325,0.465385,0.356667,0.424172,0.377417,0.46032,0.479461,0.584632,0.373058
3,0.314974,0.388454,0.80731,0.526286,0.394517,0.493116,0.205396,0.547062,0.383084,0.256958,...,0.271429,0.313251,0.552571,0.24909,0.113548,0.255439,0.315554,0.856486,0.63698,0.433898
4,0.641882,0.708576,0.618695,0.264061,0.701706,0.568705,0.363877,0.719779,0.531637,0.524494,...,0.219181,0.517698,0.758323,0.461206,0.303419,0.679096,0.670523,0.467678,0.346997,0.594655


### $Baseline-Model$

In [27]:
# Constant-prediction baseline: predict the most frequent training label for
# every row of both splits and score those predictions with F1.
# NOTE(review): the recorded F1 of 0.67 on both splits is what an all-positive
# prediction on evenly balanced binary labels would produce. If the classes are
# tied, the guess depends on statistics.mode's tie handling (Python-version
# dependent) and on row order — confirm this is the intended baseline.
guess = statistics.mode(y_train["target"])

# One constant prediction list per split, sized to that split's row count.
guess_y_train_pred, guess_y_test_pred = (
    [guess] * len(split) for split in (y_train, y_test)
)

baseline_score_train, baseline_score_test = (
    f1_score(y_train, guess_y_train_pred),
    f1_score(y_test, guess_y_test_pred),
)

print(
    f"Baseline model f1_score For Train data is {baseline_score_train:.2f}",
    f"Baseline model f1_score For Test data is {baseline_score_test:.2f}",
    sep="\n",
)

Baseline model f1_score For Train data is 0.67
Baseline model f1_score For Test data is 0.67
