In [2]:
import pandas as pd
import pickle
import statistics
from statistics import mean
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

After the division for training, validation and test was determined in the previous notebook, and after the final features were also determined in the same notebook - in this notebook we will now load the data and update the relevant features.

In [3]:
with open('df_train.pkl', 'rb') as f:
    df_train = pickle.load(f)
df_train.shape

(69445, 2423)

In [4]:
with open('df_validation.pkl', 'rb') as f:
    df_validation = pickle.load(f)
df_validation.shape

(22537, 2423)

In [5]:
with open('df_test.pkl', 'rb') as f:
    df_test = pickle.load(f)
df_test.shape

(22289, 2423)

In [6]:
df_train_4 = df_train[[
'INTERVAL_IN_FILE',
'BAND_ARR_5',
'BAND_ARR_6',
'BAND_ARR_7',
'BAND_ARR_8',
'BAND_ARR_13',
'BIT_FLAG',
'MAX_VALUE',
'NTC',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_1',
'BAND_ARR_8__fft_coefficient__attr_"real"__coeff_4',
'VALUE'

]].reset_index()

df_validation_4 = df_validation[[
'INTERVAL_IN_FILE',
'BAND_ARR_5',
'BAND_ARR_6',
'BAND_ARR_7',
'BAND_ARR_8',
'BAND_ARR_13',
'BIT_FLAG',
'MAX_VALUE',
'NTC',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_1',
'BAND_ARR_8__fft_coefficient__attr_"real"__coeff_4',
'VALUE'

]].reset_index()

df_test_4 = df_test[[
'INTERVAL_IN_FILE',
'BAND_ARR_5',
'BAND_ARR_6',
'BAND_ARR_7',
'BAND_ARR_8',
'BAND_ARR_13',
'BIT_FLAG',
'MAX_VALUE',
'NTC',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_1',
'BAND_ARR_8__fft_coefficient__attr_"real"__coeff_4',
'VALUE'

]].reset_index()



We will now create a table that will help us compare the results of the models, based on different metrics. Apart from the classic metrics, in this table there are 4 other criteria related to the stability of the results in relation to the division of
session info id.
Two of these criteria have already been expressed in the previous notebook, and 2 are new: the number of cases in which the difference between the probability of error of a session and the general probability of error is above 0.1, and the number of cases in which this difference is above 0.2

In [7]:
df_for_model_tab = pd.DataFrame(columns = ['validation metric'])
df_for_model_tab['validation metric'] = ('F1', 'Accuracy', 'Precision', 'Recall', 'max difference from error mean', 'mean difference from error mean', 'number of bigger than 0.2 differnece', 'number of bigger than 0.1 differnece')
df_for_model_tab

Unnamed: 0,validation metric
0,F1
1,Accuracy
2,Precision
3,Recall
4,max difference from error mean
5,mean difference from error mean
6,number of bigger than 0.2 differnece
7,number of bigger than 0.1 differnece


We will define a function that trains models for a dataset, checks the model results and feeds into a table. This function is very similar to a function from the previous notebook, with the main differences being:
1. Here the comparison is between models and not between data sets
2. Added 2 additional categories of "amount of big riders"

In [8]:
def error_per_session(train_data, test_data, model, model_name):

    X_train = train_data.loc[:, train_data.columns != 'VALUE']
    X_test = test_data.loc[:, test_data.columns != 'VALUE']
    number = preprocessing.LabelEncoder()
    X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)       
    X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)       

    y_train = train_data['VALUE']
    y_test = test_data['VALUE']

    
    clf = model
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    prediction_row_check = pd.DataFrame(columns = ['y pred', 'y'])
    prediction_row_check['y']=y_test
    prediction_row_check['y pred']=y_pred
    prediction_row_check['SESSION_INFO_ID']=X_test['SESSION_INFO_ID']
    prediction_row_check['is prediction wrong'] = (prediction_row_check['y pred'] - prediction_row_check['y'])**2
    
    prediction_session_check=pd.DataFrame(prediction_row_check.groupby('SESSION_INFO_ID')['is prediction wrong'].mean())
    prediction_session_check = prediction_session_check.reset_index()
    
    prediction_session_check['abs difference from average'] = abs(prediction_session_check['is prediction wrong']-statistics.mean(prediction_row_check['is prediction wrong']))
    
    f1 = metrics.f1_score(y_test, y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    max_difference_from_error_mean = prediction_session_check['abs difference from average'].max()
    mean_difference_from_error_mean = prediction_session_check['abs difference from average'].mean()
    number_of_bigger_than_02_differnece = prediction_session_check[prediction_session_check['abs difference from average']>0.2]['SESSION_INFO_ID'].count()
    number_of_bigger_than_01_differnece = prediction_session_check[prediction_session_check['abs difference from average']>0.1]['SESSION_INFO_ID'].count()
    df_for_model_tab[model_name] = f1, accuracy, precision, recall, max_difference_from_error_mean , mean_difference_from_error_mean, number_of_bigger_than_02_differnece, number_of_bigger_than_01_differnece



We will now run the function on a number of possible models. Appears at the end of each run, updated in the comparison table

In [9]:
error_per_session(df_train_4, df_validation_4, RandomForestClassifier(), 'Random Forest')
df_for_model_tab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)


Unnamed: 0,validation metric,Random Forest
0,F1,0.970488
1,Accuracy,0.975906
2,Precision,0.981638
3,Recall,0.959587
4,max difference from error mean,0.189126
5,mean difference from error mean,0.029396
6,number of bigger than 0.2 differnece,0.0
7,number of bigger than 0.1 differnece,2.0


In [10]:
error_per_session(df_train_4, df_validation_4, GradientBoostingClassifier(), 'Gradient Boost')
df_for_model_tab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)


Unnamed: 0,validation metric,Random Forest,Gradient Boost
0,F1,0.970488,0.96727
1,Accuracy,0.975906,0.9732
2,Precision,0.981638,0.97541
3,Recall,0.959587,0.959265
4,max difference from error mean,0.189126,0.184287
5,mean difference from error mean,0.029396,0.031394
6,number of bigger than 0.2 differnece,0.0,0.0
7,number of bigger than 0.1 differnece,2.0,2.0


In [11]:
error_per_session(df_train_4, df_validation_4, AdaBoostClassifier(), 'Ada Boost')
df_for_model_tab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)


Unnamed: 0,validation metric,Random Forest,Gradient Boost,Ada Boost
0,F1,0.970488,0.96727,0.946152
1,Accuracy,0.975906,0.9732,0.957004
2,Precision,0.981638,0.97541,0.979519
3,Recall,0.959587,0.959265,0.914983
4,max difference from error mean,0.189126,0.184287,0.255174
5,mean difference from error mean,0.029396,0.031394,0.04664
6,number of bigger than 0.2 differnece,0.0,0.0,1.0
7,number of bigger than 0.1 differnece,2.0,2.0,2.0


In [12]:
from sklearn.naive_bayes import GaussianNB

In [13]:
error_per_session(df_train_4, df_validation_4, GaussianNB(), 'Gaussian Naive Bayes')
df_for_model_tab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)


Unnamed: 0,validation metric,Random Forest,Gradient Boost,Ada Boost,Gaussian Naive Bayes
0,F1,0.970488,0.96727,0.946152,0.817446
1,Accuracy,0.975906,0.9732,0.957004,0.864978
2,Precision,0.981638,0.97541,0.979519,0.925051
3,Recall,0.959587,0.959265,0.914983,0.732266
4,max difference from error mean,0.189126,0.184287,0.255174,0.644843
5,mean difference from error mean,0.029396,0.031394,0.04664,0.123599
6,number of bigger than 0.2 differnece,0.0,0.0,1.0,2.0
7,number of bigger than 0.1 differnece,2.0,2.0,2.0,43.0


In [14]:
from sklearn.neighbors import KNeighborsClassifier

In [15]:
error_per_session(df_train_4, df_validation_4, KNeighborsClassifier(), 'KNN')
df_for_model_tab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)


Unnamed: 0,validation metric,Random Forest,Gradient Boost,Ada Boost,Gaussian Naive Bayes,KNN
0,F1,0.970488,0.96727,0.946152,0.817446,0.855209
1,Accuracy,0.975906,0.9732,0.957004,0.864978,0.894662
2,Precision,0.981638,0.97541,0.979519,0.925051,0.988579
3,Recall,0.959587,0.959265,0.914983,0.732266,0.753547
4,max difference from error mean,0.189126,0.184287,0.255174,0.644843,0.655259
5,mean difference from error mean,0.029396,0.031394,0.04664,0.123599,0.117612
6,number of bigger than 0.2 differnece,0.0,0.0,1.0,2.0,2.0
7,number of bigger than 0.1 differnece,2.0,2.0,2.0,43.0,45.0


In [16]:
from sklearn.neural_network import MLPClassifier

In [17]:
error_per_session(df_train_4, df_validation_4, MLPClassifier(), 'MLP')
df_for_model_tab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)


Unnamed: 0,validation metric,Random Forest,Gradient Boost,Ada Boost,Gaussian Naive Bayes,KNN,MLP
0,F1,0.970488,0.96727,0.946152,0.817446,0.855209,0.936142
1,Accuracy,0.975906,0.9732,0.957004,0.864978,0.894662,0.945246
2,Precision,0.981638,0.97541,0.979519,0.925051,0.988579,0.902695
3,Recall,0.959587,0.959265,0.914983,0.732266,0.753547,0.972163
4,max difference from error mean,0.189126,0.184287,0.255174,0.644843,0.655259,0.249593
5,mean difference from error mean,0.029396,0.031394,0.04664,0.123599,0.117612,0.063435
6,number of bigger than 0.2 differnece,0.0,0.0,1.0,2.0,2.0,2.0
7,number of bigger than 0.1 differnece,2.0,2.0,2.0,43.0,45.0,4.0


Since MLP is a model of a neural network, very different from the baseline model, I thought it might be worth checking out results also with a data set in a version with more features.

In [18]:
df_test_at_least_2 = df_test[['INTERVAL_IN_FILE',
'SN',
'ACCL_X',
'ACCL_Y',
'BAND_ARR_1',
'BAND_ARR_2',
'BAND_ARR_3',
'BAND_ARR_4',
'BAND_ARR_5',
'BAND_ARR_6',
'BAND_ARR_7',
'BAND_ARR_8',
'BAND_ARR_9',
'BAND_ARR_10',
'BAND_ARR_11',
'BAND_ARR_12',
'BAND_ARR_13',
'BAND_ARR_14',
'BAND_ARR_15',
'BAND_ARR_16',
'BAND_ARR_17',
'BAND_ARR_21',
'BAND_ARR_25',
'BAND_ARR_27',
'BAND_ARR_28',
'BAND_ARR_30',
'BAND_ARR_32',
'BATTERY',
'BIT_FLAG',
'MAX_VALUE',
'MIC_TD_RAW',
'NTC',
'FIRST_PACKET_TS',
'NTC__mean',
'NTC__quantile__q_0.2',
'NTC__quantile__q_0.6',
'NTC__cwt_coefficients__coeff_2__w_2__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_3__w_5__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_4__w_5__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_6__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_7__w_5__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_7__w_10__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_8__w_10__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_9__w_5__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_9__w_10__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_10__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_11__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_12__w_10__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_12__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_13__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_14__w_20__widths_(2, 5, 10, 20)',
'NTC__ar_coefficient__coeff_0__k_10',
'NTC__fft_coefficient__attr_"real"__coeff_1',
'NTC__fft_coefficient__attr_"real"__coeff_4',
'NTC__fft_coefficient__attr_"real"__coeff_52',
'NTC__fft_coefficient__attr_"real"__coeff_90',
'NTC__fft_coefficient__attr_"imag"__coeff_3',
'NTC__fft_coefficient__attr_"imag"__coeff_4',
'NTC__fft_coefficient__attr_"imag"__coeff_17',
'NTC__fft_coefficient__attr_"imag"__coeff_26',
'NTC__fft_coefficient__attr_"imag"__coeff_27',
'NTC__fft_coefficient__attr_"imag"__coeff_28',
'NTC__fft_coefficient__attr_"imag"__coeff_29',
'NTC__fft_coefficient__attr_"imag"__coeff_33',
'NTC__fft_coefficient__attr_"imag"__coeff_37',
'NTC__fft_coefficient__attr_"imag"__coeff_59',
'NTC__fft_coefficient__attr_"imag"__coeff_93',
'NTC__fft_coefficient__attr_"abs"__coeff_8',
'NTC__fft_coefficient__attr_"angle"__coeff_9',
'NTC__fft_coefficient__attr_"angle"__coeff_10',
'NTC__fft_coefficient__attr_"angle"__coeff_45',
'NTC__fft_coefficient__attr_"angle"__coeff_46',
'NTC__fft_coefficient__attr_"angle"__coeff_59',
'NTC__fft_coefficient__attr_"angle"__coeff_77',
'NTC__fft_coefficient__attr_"angle"__coeff_86',
'NTC__fft_coefficient__attr_"angle"__coeff_95',
'NTC__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"max"',
'NTC__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"max"',
'BAND_ARR_2__sum_values',
'BAND_ARR_2__count_below_mean',
'BAND_ARR_2__spkt_welch_density__coeff_2',
'BAND_ARR_2__fft_coefficient__attr_"real"__coeff_2',
'BAND_ARR_2__fft_coefficient__attr_"real"__coeff_3',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_1',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_2',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_3',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_5',
'BAND_ARR_2__fft_coefficient__attr_"abs"__coeff_1',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_2',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_4',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_9',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_12',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_40',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_49',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_51',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_64',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_65',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_70',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_81',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_89',
'BAND_ARR_2__range_count__max_1__min_-1',
'BAND_ARR_2__agg_linear_trend__attr_"rvalue"__chunk_len_50__f_agg_"mean"',
'BAND_ARR_2__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"min"',
'BAND_ARR_8__sum_values',
'BAND_ARR_8__longest_strike_below_mean',
'BAND_ARR_8__c3__lag_1',
'BAND_ARR_8__quantile__q_0.8',
'BAND_ARR_8__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.6',
'BAND_ARR_8__fft_coefficient__attr_"real"__coeff_3',
'BAND_ARR_8__fft_coefficient__attr_"real"__coeff_4',
'BAND_ARR_8__fft_coefficient__attr_"abs"__coeff_0',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_2',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_7',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_14',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_15',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_38',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_58',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_62',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_64',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_65',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_69',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_72',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_78',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_95',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_96',
'BAND_ARR_8__max_langevin_fixed_point__m_3__r_30',
'VALUE'

]].reset_index()

df_validation_at_least_2 = df_validation[['INTERVAL_IN_FILE',
'SN',
'ACCL_X',
'ACCL_Y',
'BAND_ARR_1',
'BAND_ARR_2',
'BAND_ARR_3',
'BAND_ARR_4',
'BAND_ARR_5',
'BAND_ARR_6',
'BAND_ARR_7',
'BAND_ARR_8',
'BAND_ARR_9',
'BAND_ARR_10',
'BAND_ARR_11',
'BAND_ARR_12',
'BAND_ARR_13',
'BAND_ARR_14',
'BAND_ARR_15',
'BAND_ARR_16',
'BAND_ARR_17',
'BAND_ARR_21',
'BAND_ARR_25',
'BAND_ARR_27',
'BAND_ARR_28',
'BAND_ARR_30',
'BAND_ARR_32',
'BATTERY',
'BIT_FLAG',
'MAX_VALUE',
'MIC_TD_RAW',
'NTC',
'FIRST_PACKET_TS',
'NTC__mean',
'NTC__quantile__q_0.2',
'NTC__quantile__q_0.6',
'NTC__cwt_coefficients__coeff_2__w_2__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_3__w_5__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_4__w_5__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_6__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_7__w_5__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_7__w_10__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_8__w_10__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_9__w_5__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_9__w_10__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_10__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_11__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_12__w_10__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_12__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_13__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_14__w_20__widths_(2, 5, 10, 20)',
'NTC__ar_coefficient__coeff_0__k_10',
'NTC__fft_coefficient__attr_"real"__coeff_1',
'NTC__fft_coefficient__attr_"real"__coeff_4',
'NTC__fft_coefficient__attr_"real"__coeff_52',
'NTC__fft_coefficient__attr_"real"__coeff_90',
'NTC__fft_coefficient__attr_"imag"__coeff_3',
'NTC__fft_coefficient__attr_"imag"__coeff_4',
'NTC__fft_coefficient__attr_"imag"__coeff_17',
'NTC__fft_coefficient__attr_"imag"__coeff_26',
'NTC__fft_coefficient__attr_"imag"__coeff_27',
'NTC__fft_coefficient__attr_"imag"__coeff_28',
'NTC__fft_coefficient__attr_"imag"__coeff_29',
'NTC__fft_coefficient__attr_"imag"__coeff_33',
'NTC__fft_coefficient__attr_"imag"__coeff_37',
'NTC__fft_coefficient__attr_"imag"__coeff_59',
'NTC__fft_coefficient__attr_"imag"__coeff_93',
'NTC__fft_coefficient__attr_"abs"__coeff_8',
'NTC__fft_coefficient__attr_"angle"__coeff_9',
'NTC__fft_coefficient__attr_"angle"__coeff_10',
'NTC__fft_coefficient__attr_"angle"__coeff_45',
'NTC__fft_coefficient__attr_"angle"__coeff_46',
'NTC__fft_coefficient__attr_"angle"__coeff_59',
'NTC__fft_coefficient__attr_"angle"__coeff_77',
'NTC__fft_coefficient__attr_"angle"__coeff_86',
'NTC__fft_coefficient__attr_"angle"__coeff_95',
'NTC__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"max"',
'NTC__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"max"',
'BAND_ARR_2__sum_values',
'BAND_ARR_2__count_below_mean',
'BAND_ARR_2__spkt_welch_density__coeff_2',
'BAND_ARR_2__fft_coefficient__attr_"real"__coeff_2',
'BAND_ARR_2__fft_coefficient__attr_"real"__coeff_3',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_1',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_2',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_3',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_5',
'BAND_ARR_2__fft_coefficient__attr_"abs"__coeff_1',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_2',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_4',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_9',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_12',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_40',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_49',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_51',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_64',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_65',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_70',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_81',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_89',
'BAND_ARR_2__range_count__max_1__min_-1',
'BAND_ARR_2__agg_linear_trend__attr_"rvalue"__chunk_len_50__f_agg_"mean"',
'BAND_ARR_2__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"min"',
'BAND_ARR_8__sum_values',
'BAND_ARR_8__longest_strike_below_mean',
'BAND_ARR_8__c3__lag_1',
'BAND_ARR_8__quantile__q_0.8',
'BAND_ARR_8__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.6',
'BAND_ARR_8__fft_coefficient__attr_"real"__coeff_3',
'BAND_ARR_8__fft_coefficient__attr_"real"__coeff_4',
'BAND_ARR_8__fft_coefficient__attr_"abs"__coeff_0',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_2',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_7',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_14',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_15',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_38',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_58',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_62',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_64',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_65',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_69',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_72',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_78',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_95',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_96',
'BAND_ARR_8__max_langevin_fixed_point__m_3__r_30',
'VALUE'

]].reset_index()

df_train_at_least_2 = df_train[['INTERVAL_IN_FILE',
'SN',
'ACCL_X',
'ACCL_Y',
'BAND_ARR_1',
'BAND_ARR_2',
'BAND_ARR_3',
'BAND_ARR_4',
'BAND_ARR_5',
'BAND_ARR_6',
'BAND_ARR_7',
'BAND_ARR_8',
'BAND_ARR_9',
'BAND_ARR_10',
'BAND_ARR_11',
'BAND_ARR_12',
'BAND_ARR_13',
'BAND_ARR_14',
'BAND_ARR_15',
'BAND_ARR_16',
'BAND_ARR_17',
'BAND_ARR_21',
'BAND_ARR_25',
'BAND_ARR_27',
'BAND_ARR_28',
'BAND_ARR_30',
'BAND_ARR_32',
'BATTERY',
'BIT_FLAG',
'MAX_VALUE',
'MIC_TD_RAW',
'NTC',
'FIRST_PACKET_TS',
'NTC__mean',
'NTC__quantile__q_0.2',
'NTC__quantile__q_0.6',
'NTC__cwt_coefficients__coeff_2__w_2__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_3__w_5__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_4__w_5__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_6__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_7__w_5__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_7__w_10__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_8__w_10__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_9__w_5__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_9__w_10__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_10__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_11__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_12__w_10__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_12__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_13__w_20__widths_(2, 5, 10, 20)',
'NTC__cwt_coefficients__coeff_14__w_20__widths_(2, 5, 10, 20)',
'NTC__ar_coefficient__coeff_0__k_10',
'NTC__fft_coefficient__attr_"real"__coeff_1',
'NTC__fft_coefficient__attr_"real"__coeff_4',
'NTC__fft_coefficient__attr_"real"__coeff_52',
'NTC__fft_coefficient__attr_"real"__coeff_90',
'NTC__fft_coefficient__attr_"imag"__coeff_3',
'NTC__fft_coefficient__attr_"imag"__coeff_4',
'NTC__fft_coefficient__attr_"imag"__coeff_17',
'NTC__fft_coefficient__attr_"imag"__coeff_26',
'NTC__fft_coefficient__attr_"imag"__coeff_27',
'NTC__fft_coefficient__attr_"imag"__coeff_28',
'NTC__fft_coefficient__attr_"imag"__coeff_29',
'NTC__fft_coefficient__attr_"imag"__coeff_33',
'NTC__fft_coefficient__attr_"imag"__coeff_37',
'NTC__fft_coefficient__attr_"imag"__coeff_59',
'NTC__fft_coefficient__attr_"imag"__coeff_93',
'NTC__fft_coefficient__attr_"abs"__coeff_8',
'NTC__fft_coefficient__attr_"angle"__coeff_9',
'NTC__fft_coefficient__attr_"angle"__coeff_10',
'NTC__fft_coefficient__attr_"angle"__coeff_45',
'NTC__fft_coefficient__attr_"angle"__coeff_46',
'NTC__fft_coefficient__attr_"angle"__coeff_59',
'NTC__fft_coefficient__attr_"angle"__coeff_77',
'NTC__fft_coefficient__attr_"angle"__coeff_86',
'NTC__fft_coefficient__attr_"angle"__coeff_95',
'NTC__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"max"',
'NTC__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"max"',
'BAND_ARR_2__sum_values',
'BAND_ARR_2__count_below_mean',
'BAND_ARR_2__spkt_welch_density__coeff_2',
'BAND_ARR_2__fft_coefficient__attr_"real"__coeff_2',
'BAND_ARR_2__fft_coefficient__attr_"real"__coeff_3',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_1',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_2',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_3',
'BAND_ARR_2__fft_coefficient__attr_"imag"__coeff_5',
'BAND_ARR_2__fft_coefficient__attr_"abs"__coeff_1',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_2',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_4',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_9',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_12',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_40',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_49',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_51',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_64',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_65',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_70',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_81',
'BAND_ARR_2__fft_coefficient__attr_"angle"__coeff_89',
'BAND_ARR_2__range_count__max_1__min_-1',
'BAND_ARR_2__agg_linear_trend__attr_"rvalue"__chunk_len_50__f_agg_"mean"',
'BAND_ARR_2__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"min"',
'BAND_ARR_8__sum_values',
'BAND_ARR_8__longest_strike_below_mean',
'BAND_ARR_8__c3__lag_1',
'BAND_ARR_8__quantile__q_0.8',
'BAND_ARR_8__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.6',
'BAND_ARR_8__fft_coefficient__attr_"real"__coeff_3',
'BAND_ARR_8__fft_coefficient__attr_"real"__coeff_4',
'BAND_ARR_8__fft_coefficient__attr_"abs"__coeff_0',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_2',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_7',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_14',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_15',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_38',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_58',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_62',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_64',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_65',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_69',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_72',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_78',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_95',
'BAND_ARR_8__fft_coefficient__attr_"angle"__coeff_96',
'BAND_ARR_8__max_langevin_fixed_point__m_3__r_30',
'VALUE'

]].reset_index()


In [19]:
error_per_session(df_train_at_least_2, df_validation_at_least_2, MLPClassifier(), 'MLP - more features')
df_for_model_tab



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)


Unnamed: 0,validation metric,Random Forest,Gradient Boost,Ada Boost,Gaussian Naive Bayes,KNN,MLP,MLP - more features
0,F1,0.970488,0.96727,0.946152,0.817446,0.855209,0.936142,0.940677
1,Accuracy,0.975906,0.9732,0.957004,0.864978,0.894662,0.945246,0.950393
2,Precision,0.981638,0.97541,0.979519,0.925051,0.988579,0.902695,0.928946
3,Recall,0.959587,0.959265,0.914983,0.732266,0.753547,0.972163,0.952709
4,max difference from error mean,0.189126,0.184287,0.255174,0.644843,0.655259,0.249593,0.950393
5,mean difference from error mean,0.029396,0.031394,0.04664,0.123599,0.117612,0.063435,0.078578
6,number of bigger than 0.2 differnece,0.0,0.0,1.0,2.0,2.0,2.0,2.0
7,number of bigger than 0.1 differnece,2.0,2.0,2.0,43.0,45.0,4.0,5.0


The addition of features seems to have improved some of the results of the neural network model, but others have actually become less good. Either way, this model seems to be less good for this data than the top 2 models:
Random Forest and Gradient Boost.

For these two, I will try to improve the model by running fine tuning methods in order to select the final model with the final parameters.

In [20]:
from sklearn.model_selection import GridSearchCV

In [1]:
grid_param = {
    'n_estimators': [20,  60, 100, 200, 300, 500],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'min_samples_split': [1, 2, 3, 4, 5],
    'min_samples_leaf' : [1,2,3,4, 5],
    'max_features' : ['sqrt', 'log2', None],
}


In [4]:

gd_sr = GridSearchCV(estimator=RandomForestClassifier(),
                     param_grid=grid_param,
                     scoring='f1',
                     cv=5,
                     n_jobs=-1)


In [11]:
X_train = df_train_4.loc[:, df_train_4.columns != 'VALUE']

number = preprocessing.LabelEncoder()
X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)       
y_train = df_train_4['VALUE']

gd_sr.fit(X_train, y_train)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_features': ['sqrt', 'log2', None],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [1, 2, 3, 4, 5],
                         'n_estimators': [20, 60, 100, 200, 300, 500]},
             scoring='f1')

In [12]:
best_parameters = gd_sr.best_params_
print(best_parameters)


{'criterion': 'gini', 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 20}


In [21]:
error_per_session(df_train_4, df_validation_4, RandomForestClassifier(criterion = 'gini', max_features= 'log2', min_samples_leaf= 1, min_samples_split= 2, n_estimators= 20), 'Random F Best Parameters')
df_for_model_tab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)


Unnamed: 0,validation metric,Random Forest,Gradient Boost,Ada Boost,Gaussian Naive Bayes,KNN,MLP,MLP - more features,Random F Best Parameters
0,F1,0.970488,0.96727,0.946152,0.817446,0.855209,0.936142,0.940677,0.972923
1,Accuracy,0.975906,0.9732,0.957004,0.864978,0.894662,0.945246,0.950393,0.977814
2,Precision,0.981638,0.97541,0.979519,0.925051,0.988579,0.902695,0.928946,0.980463
3,Recall,0.959587,0.959265,0.914983,0.732266,0.753547,0.972163,0.952709,0.965499
4,max difference from error mean,0.189126,0.184287,0.255174,0.644843,0.655259,0.249593,0.950393,0.184637
5,mean difference from error mean,0.029396,0.031394,0.04664,0.123599,0.117612,0.063435,0.078578,0.02693
6,number of bigger than 0.2 differnece,0.0,0.0,1.0,2.0,2.0,2.0,2.0,0.0
7,number of bigger than 0.1 differnece,2.0,2.0,2.0,43.0,45.0,4.0,5.0,2.0


In [22]:
df_for_model_tab = df_for_model_tab.drop(columns=['Gaussian Naive Bayes', 'KNN', 'MLP', 'MLP - more features'])
df_for_model_tab

Unnamed: 0,validation metric,Random Forest,Gradient Boost,Ada Boost,Random F Best Parameters
0,F1,0.976601,0.96727,0.946152,0.978462
1,Accuracy,0.980787,0.9732,0.957004,0.982251
2,Precision,0.982067,0.97541,0.979519,0.980363
3,Recall,0.971195,0.959265,0.914983,0.976569
4,max difference from error mean,0.110851,0.184287,0.255174,0.090993
5,mean difference from error mean,0.022646,0.031394,0.04664,0.020465
6,number of bigger than 0.2 differnece,0.0,0.0,1.0,0.0
7,number of bigger than 0.1 differnece,2.0,2.0,2.0,0.0


Random Forest's achievements have indeed improved in almost all indices. I deleted some models from the table to make it easier to look at. And now - we will try to improve parameters and achievements for Gradient Boost as well.

In [23]:
grid_param = {
    'loss': ['log_loss', 'deviance', 'exponential'],
    'learning_rate': [0.0, 0.1, 0.5, 1.0,  10.0,  10000.0],
    'n_estimators': [100, 10000, 10000],
    'subsample' : [0.0, 0.5, 1.0],
    'criterion' : ['friedman_mse','squared_error', 'mse'],
}

gd_sr = GridSearchCV(estimator=GradientBoostingClassifier(),
                     param_grid=grid_param,
                     scoring='f1',
                     cv=5,
                     n_jobs=-1)
X_train = df_train_4.loc[:, df_train_4.columns != 'VALUE']

number = preprocessing.LabelEncoder()
X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)       
y_train = df_train_4['VALUE']


gd_sr.fit(X_train, y_train)
best_parameters = gd_sr.best_params_
print(best_parameters)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan 0.95390059 0.95490013        nan 0.95696746 0.95680678
        nan 0.96013025 0.95674125        nan 0.9520715  0.9522148
        nan 0.96804883 0.96513382        nan 0.96867685 0.96516017
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan 0.93836717 0.

{'criterion': 'friedman_mse', 'learning_rate': 0.5, 'loss': 'exponential', 'n_estimators': 10000, 'subsample': 0.5}


In [24]:
error_per_session(df_train_4, df_validation_4, GradientBoostingClassifier( criterion= 'friedman_mse', learning_rate = 0.5, loss = 'exponential', n_estimators = 10000, subsample = 0.5), 'Gradient Best Parameters')
df_for_model_tab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)


Unnamed: 0,validation metric,Random Forest,Gradient Boost,Ada Boost,Gaussian Naive Bayes,KNN,MLP,MLP - more features,Random F Best Parameters,Gradient Best Parameters
0,F1,0.970488,0.96727,0.946152,0.817446,0.855209,0.936142,0.940677,0.972923,0.978375
1,Accuracy,0.975906,0.9732,0.957004,0.864978,0.894662,0.945246,0.950393,0.977814,0.982163
2,Precision,0.981638,0.97541,0.979519,0.925051,0.988579,0.902695,0.928946,0.980463,0.979324
3,Recall,0.959587,0.959265,0.914983,0.732266,0.753547,0.972163,0.952709,0.965499,0.977429
4,max difference from error mean,0.189126,0.184287,0.255174,0.644843,0.655259,0.249593,0.950393,0.184637,0.188986
5,mean difference from error mean,0.029396,0.031394,0.04664,0.123599,0.117612,0.063435,0.078578,0.02693,0.022422
6,number of bigger than 0.2 differnece,0.0,0.0,1.0,2.0,2.0,2.0,2.0,0.0,0.0
7,number of bigger than 0.1 differnece,2.0,2.0,2.0,43.0,45.0,4.0,5.0,2.0,1.0


After fine tuning Gradient Boost's achievements overtake all other models, and it seems to be the most powerful model.
We will try to improve it even further with more attempts at its parameters.

In [29]:
grid_param = {
    'loss': ['exponential'],
    'learning_rate': [0.25, 0.5, 0.75],
    'n_estimators': [10000],
    'subsample' : [0.25, 0.5, 0.75],
    'criterion' : ['friedman_mse'],}

gd_sr = GridSearchCV(estimator=GradientBoostingClassifier(),
                     param_grid=grid_param,
                     scoring='f1',
                     cv=5,
                     n_jobs=-1)
X_train = df_train_4.loc[:, df_train_4.columns != 'VALUE']

number = preprocessing.LabelEncoder()
X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)       
y_train = df_train_4['VALUE']


gd_sr.fit(X_train, y_train)
best_parameters = gd_sr.best_params_
print(best_parameters)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)


{'criterion': 'friedman_mse', 'learning_rate': 0.25, 'loss': 'exponential', 'n_estimators': 10000, 'subsample': 0.25}


In [30]:
error_per_session(df_train_4, df_validation_4, GradientBoostingClassifier( criterion= 'friedman_mse', learning_rate = 0.25, loss = 'exponential', n_estimators = 10000, subsample = 0.25), 'Gradient even better than best Parameters')
df_for_model_tab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)


Unnamed: 0,validation metric,Random Forest,Gradient Boost,Ada Boost,Gaussian Naive Bayes,KNN,MLP,MLP - more features,Random F Best Parameters,Gradient Best Parameters,Gradient even better than best Parameters
0,F1,0.970488,0.96727,0.946152,0.817446,0.855209,0.936142,0.940677,0.972923,0.978375,0.973781
1,Accuracy,0.975906,0.9732,0.957004,0.864978,0.894662,0.945246,0.950393,0.977814,0.982163,0.978524
2,Precision,0.981638,0.97541,0.979519,0.925051,0.988579,0.902695,0.928946,0.980463,0.979324,0.981651
3,Recall,0.959587,0.959265,0.914983,0.732266,0.753547,0.972163,0.952709,0.965499,0.977429,0.966036
4,max difference from error mean,0.189126,0.184287,0.255174,0.644843,0.655259,0.249593,0.950393,0.184637,0.188986,0.170422
5,mean difference from error mean,0.029396,0.031394,0.04664,0.123599,0.117612,0.063435,0.078578,0.02693,0.022422,0.025877
6,number of bigger than 0.2 differnece,0.0,0.0,1.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0
7,number of bigger than 0.1 differnece,2.0,2.0,2.0,43.0,45.0,4.0,5.0,2.0,1.0,1.0


It seems that this time the results have not improved. Random Forest is also given another chance to improve its final results, after which we will select the model.
But first we will narrow down the table to include only the models that are still relevant.

In [31]:
df_for_model_tab = df_for_model_tab.drop(columns=['Random Forest','Gradient Boost','Gaussian Naive Bayes', 'KNN', 'MLP', 'MLP - more features', 'Gradient even better than best Parameters'])
df_for_model_tab

Unnamed: 0,validation metric,Ada Boost,Random F Best Parameters,Gradient Best Parameters
0,F1,0.946152,0.972923,0.978375
1,Accuracy,0.957004,0.977814,0.982163
2,Precision,0.979519,0.980463,0.979324
3,Recall,0.914983,0.965499,0.977429
4,max difference from error mean,0.255174,0.184637,0.188986
5,mean difference from error mean,0.04664,0.02693,0.022422
6,number of bigger than 0.2 differnece,1.0,0.0,0.0
7,number of bigger than 0.1 differnece,2.0,2.0,1.0


In [32]:
df_for_model_tab = df_for_model_tab.drop(columns=['Ada Boost'])
df_for_model_tab

Unnamed: 0,validation metric,Random F Best Parameters,Gradient Best Parameters
0,F1,0.972923,0.978375
1,Accuracy,0.977814,0.982163
2,Precision,0.980463,0.979324
3,Recall,0.965499,0.977429
4,max difference from error mean,0.184637,0.188986
5,mean difference from error mean,0.02693,0.022422
6,number of bigger than 0.2 differnece,0.0,0.0
7,number of bigger than 0.1 differnece,2.0,1.0


In [33]:
grid_param = {
    'n_estimators': [10, 15, 20, 25, 30],
    'criterion': ['gini'],
    'min_samples_split': [2],
    'min_samples_leaf' : [1],
    'max_features' : ['log2'],
}


gd_sr = GridSearchCV(estimator=RandomForestClassifier(),
                     param_grid=grid_param,
                     scoring='f1',
                     cv=5,
                     n_jobs=-1)
X_train = df_train_4.loc[:, df_train_4.columns != 'VALUE']

number = preprocessing.LabelEncoder()
X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)       
y_train = df_train_4['VALUE']

gd_sr.fit(X_train, y_train)

best_parameters = gd_sr.best_params_
print(best_parameters)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)


{'criterion': 'gini', 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 25}


In [34]:
error_per_session(df_train_4, df_validation_4, RandomForestClassifier(criterion = 'gini', max_features= 'log2', min_samples_leaf= 1, min_samples_split= 2, n_estimators= 25), 'Random F even better than Best Parameters')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)


In [35]:
df_for_model_tab

Unnamed: 0,validation metric,Random F Best Parameters,Gradient Best Parameters,Random F even better than Best Parameters
0,F1,0.972923,0.978375,0.978691
1,Accuracy,0.977814,0.982163,0.982429
2,Precision,0.980463,0.979324,0.979957
3,Recall,0.965499,0.977429,0.977429
4,max difference from error mean,0.184637,0.188986,0.118889
5,mean difference from error mean,0.02693,0.022422,0.020704
6,number of bigger than 0.2 differnece,0.0,0.0,0.0
7,number of bigger than 0.1 differnece,2.0,1.0,1.0


Random Forest with the existing parameters improved the results and came in first place.

The big difference compared to Gradient Boost is that it takes much less time to examine parameters and train the model, so I feel comfortable running on it a few more attempts and improving further, before I test the results on the test dataset.

In [36]:
df_for_model_tab = df_for_model_tab.drop(columns=['Random F Best Parameters', 'Gradient Best Parameters'])
df_for_model_tab

Unnamed: 0,validation metric,Random F even better than Best Parameters
0,F1,0.978691
1,Accuracy,0.982429
2,Precision,0.979957
3,Recall,0.977429
4,max difference from error mean,0.118889
5,mean difference from error mean,0.020704
6,number of bigger than 0.2 differnece,0.0
7,number of bigger than 0.1 differnece,1.0


In [37]:
grid_param = {
    'n_estimators': [22, 23, 24, 25, 26, 27, 28],
    'criterion': ['gini'],
    'min_samples_split': [2, 6, 10],
    'min_samples_leaf' : [1, 6, 10],
    'max_features' : ['log2'],
}


gd_sr = GridSearchCV(estimator=RandomForestClassifier(),
                     param_grid=grid_param,
                     scoring='f1',
                     cv=5,
                     n_jobs=-1)
X_train = df_train_4.loc[:, df_train_4.columns != 'VALUE']

number = preprocessing.LabelEncoder()
X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)       
y_train = df_train_4['VALUE']

gd_sr.fit(X_train, y_train)

best_parameters = gd_sr.best_params_
print(best_parameters)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)


{'criterion': 'gini', 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 26}


In [39]:
error_per_session(df_train_4, df_validation_4, RandomForestClassifier(criterion = 'gini', max_features= 'log2', min_samples_leaf= 1, min_samples_split= 2, n_estimators= 26), 'Random Forest last improvement')
df_for_model_tab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)


Unnamed: 0,validation metric,Random F even better than Best Parameters,Random Forest last improvement
0,F1,0.978691,0.972324
1,Accuracy,0.982429,0.977282
2,Precision,0.979957,0.978034
3,Recall,0.977429,0.966681
4,max difference from error mean,0.118889,0.131424
5,mean difference from error mean,0.020704,0.026232
6,number of bigger than 0.2 differnece,0.0,0.0
7,number of bigger than 0.1 differnece,1.0,1.0


In this case there is no improvement, and we will stay with the previous version and definitely have excellent results. All that is left is to test it on the test set

In [41]:
error_per_session(df_train_4, df_test_4, RandomForestClassifier(criterion = 'gini', max_features= 'log2', min_samples_leaf= 1, min_samples_split= 2, n_estimators= 25), 'Best Random Forest on Test Data Set ')
df_for_model_tab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['SESSION_INFO_ID'] = number.fit_transform(X_train.SESSION_INFO_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['SESSION_INFO_ID'] = number.fit_transform(X_test.SESSION_INFO_ID)


Unnamed: 0,validation metric,Random F even better than Best Parameters,Random Forest last improvement,Best Random Forest on Test Data Set
0,F1,0.978691,0.972324,0.987335
1,Accuracy,0.982429,0.977282,0.991835
2,Precision,0.979957,0.978034,0.987472
3,Recall,0.977429,0.966681,0.987197
4,max difference from error mean,0.118889,0.131424,0.074761
5,mean difference from error mean,0.020704,0.026232,0.009391
6,number of bigger than 0.2 differnece,0.0,0.0,0.0
7,number of bigger than 0.1 differnece,1.0,1.0,0.0


In [42]:
df_for_model_tab = df_for_model_tab.drop(columns=['Random Forest last improvement'])
df_for_model_tab.rename(columns = {'Random F even better than Best Parameters':'Best Random Forest on Validation Data Set'}, inplace = True)

df_for_model_tab

Unnamed: 0,validation metric,Best Random Forest on Validation Data Set,Best Random Forest on Test Data Set
0,F1,0.978691,0.987335
1,Accuracy,0.982429,0.991835
2,Precision,0.979957,0.987472
3,Recall,0.977429,0.987197
4,max difference from error mean,0.118889,0.074761
5,mean difference from error mean,0.020704,0.009391
6,number of bigger than 0.2 differnece,0.0,0.0
7,number of bigger than 0.1 differnece,1.0,0.0


We have won another and surprising improvement in all the indices, we have a winning model! Now we have to save it

In [43]:
final_data_set_for_model_file = [df_train_4, df_validation_4, df_test_4]

final_data_set_for_model_file = pd.concat(final_data_set_for_model_file)

final_data_set_for_model_file.shape


(114271, 13)

In [44]:
final_data_set_for_model_file.head()

Unnamed: 0,SESSION_INFO_ID,INTERVAL_IN_FILE,BAND_ARR_5,BAND_ARR_6,BAND_ARR_7,BAND_ARR_8,BAND_ARR_13,BIT_FLAG,MAX_VALUE,NTC,"BAND_ARR_2__fft_coefficient__attr_""imag""__coeff_1","BAND_ARR_8__fft_coefficient__attr_""real""__coeff_4",VALUE
0,0149b8bcd7754ff398cc5bebd1e43ccd,0,0.065059,0.05482,0.05599,0.058916,0.049846,65,0.078078,1393,0.0,0.0,0
1,0316a719cdc240d8a8ae6608484dffae,1,0.055083,0.048825,0.04483,0.050023,0.060275,65,0.078783,1418,-0.004584,0.0,0
2,0316a719cdc240d8a8ae6608484dffae,0,0.073132,0.055827,0.054865,0.063518,0.059672,65,0.295214,1416,-0.004584,0.0,0
3,0316a719cdc240d8a8ae6608484dffae,2,0.063715,0.055412,0.063122,0.057784,0.056005,65,0.115607,1415,-0.004584,0.0,0
4,05c71780b43648399788edeb658889aa,15,5.214829,4.737063,2.248698,1.950095,2.308419,67,5.214829,1509,-775.5361,-136.078768,0


In [45]:
X = final_data_set_for_model_file.loc[:, final_data_set_for_model_file.columns != 'VALUE']
X = X.loc[:, X.columns != 'SESSION_INFO_ID']

y = final_data_set_for_model_file['VALUE']


In [46]:
    clf = RandomForestClassifier(criterion = 'gini', max_features= 'log2', min_samples_leaf= 1, min_samples_split= 2, n_estimators= 25)
    clf.fit(X, y)

    


RandomForestClassifier(max_features='log2', n_estimators=25)

In [48]:
from joblib import dump, load
dump(clf, 'model.joblib') 


['model.joblib']