# Train Classifier
The following notebook trains a number of classifiers on the world championships dataset after pre-processing.

Import the necessary dependencies

In [3]:
import pandas as pd
import numpy as np
import util
from os import getcwd as wd
# plotting
import plotly.express as px
import plotly.graph_objects as go
# ML
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# min-max scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()


In [4]:
column_order = [16,0,24,32]
column_order_str = [f'{dimension}{item}' for item in column_order for dimension in ["y"]]
all_coll_str_y = [f'{dimension}{item}' for item in range(33) for dimension in ["y"]]

def plot_feature_given_id(lift_id:int,df_orig,feature_columns:list = column_order_str):
    """
    Given a lift id, create a figure displaying the lift
    """
    # print("in the plot function",lift_id,feature_column)
    df = df_orig.copy(deep=True)
    
    # create a df with only the values for this id.
    filter_id = df['id'] == lift_id
    df_plot = df[filter_id].copy()
    # fig1 = px.line(df_plot,y = feature_columns)
    fig2 = px.scatter(df_plot,y = feature_columns, color="class")

    # fig = go.Figure(data=fig1.data + fig2.data)
    fig = go.Figure(data=fig2.data)
    fig.show()

## Scale the data
For the classifier to be able to effectively detect parts of the lift the data that the classifiet is trained upon needs to be normalised, so a min max sclaer is implemented, that sets all the values in the training dataset from 0 to 1.

### Load the world championship dataset


In [6]:
csv_path = f"{wd()}/data/world_championships_data.csv"
df = pd.read_csv(csv_path, index_col = 0)
df.head()


  df = pd.read_csv(csv_path, index_col = 0)


Unnamed: 0,id,class,time_ms,success,weightclass,name,country,weight,x0,y0,...,z30,v30,x31,y31,z31,v31,x32,y32,z32,v32
15,0,start,391633.333333,1,109+,TOYCHYYEV,TKM,180,0.49227,0.249343,...,-0.103702,0.250496,0.427865,-0.001709,-0.27367,0.232539,0.559647,0.001709,-0.244762,0.213154
16,0,start,391666.666667,1,109+,TOYCHYYEV,TKM,180,0.492256,0.260386,...,-0.080179,0.323355,0.429044,-0.002347,-0.221434,0.30503,0.559634,0.002347,-0.206907,0.29051
17,0,start,391700.0,1,109+,TOYCHYYEV,TKM,180,0.492122,0.273022,...,-0.052037,0.38939,0.42888,-0.001509,-0.168077,0.369838,0.559187,0.001509,-0.164393,0.360186
18,0,start,391733.333333,1,109+,TOYCHYYEV,TKM,180,0.492107,0.275419,...,-0.041674,0.448591,0.428852,-0.001392,-0.153025,0.42847,0.559413,0.001392,-0.144922,0.422875
19,0,start,391766.666667,1,109+,TOYCHYYEV,TKM,180,0.492137,0.284583,...,-0.045571,0.501953,0.428794,-0.001289,-0.160241,0.481376,0.559285,0.001289,-0.152675,0.479406


Normalise the data

In [13]:
df_scaled = df.copy(deep=True)
data_to_scale = df_scaled[all_coll_str_y].to_numpy()
data_to_scale = data_to_scale.transpose()
data_to_scale = scaler.fit_transform(data_to_scale)
data_to_scale = data_to_scale.transpose()
df_scaled[all_coll_str_y] = data_to_scale
df_scaled.loc[:,all_coll_str_y].describe()

Unnamed: 0,y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,...,y23,y24,y25,y26,y27,y28,y29,y30,y31,y32
count,27515.0,27515.0,27515.0,27515.0,27515.0,27515.0,27515.0,27515.0,27515.0,27515.0,...,27515.0,27515.0,27515.0,27515.0,27515.0,27515.0,27515.0,27515.0,27515.0,27515.0
mean,0.862023,0.892485,0.893907,0.895299,0.891724,0.892563,0.893423,0.894089,0.891784,0.834718,...,0.436122,0.436858,0.395319,0.394094,0.093909,0.092644,0.046285,0.044999,0.003763,0.002646
std,0.120352,0.120701,0.11995,0.119054,0.12088,0.120223,0.119389,0.112923,0.112318,0.113632,...,0.157222,0.155997,0.102015,0.101868,0.024837,0.023998,0.018377,0.018204,0.00618,0.005396
min,0.583161,0.607917,0.613601,0.62091,0.610551,0.616984,0.62281,0.657517,0.657417,0.571558,...,0.13649,0.125032,0.212357,0.0035,0.024457,0.013216,0.0,0.0,0.0,0.0
25%,0.736209,0.764893,0.766384,0.768239,0.76392,0.764936,0.765941,0.772511,0.770167,0.714974,...,0.326943,0.3294,0.311549,0.312507,0.075181,0.074541,0.033167,0.031234,0.0,0.0
50%,0.930117,0.976708,0.980171,0.983968,0.97681,0.980325,0.98347,0.977084,0.973118,0.894203,...,0.437381,0.439987,0.355606,0.352876,0.08884,0.086736,0.04341,0.042312,0.000959,0.0
75%,0.967413,0.999055,0.998968,0.998714,0.998225,0.997704,0.996725,0.987826,0.985031,0.928107,...,0.526946,0.525777,0.47338,0.472841,0.109342,0.108514,0.056449,0.055739,0.005696,0.003456
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99263,...,1.0,1.0,0.827209,0.750236,0.237036,0.210441,0.189797,0.185924,0.115556,0.092433


### Load the data from the training session


In [10]:
csv_gatis_path = f"{wd()}/data/training_data.csv"
dfGat = pd.read_csv(csv_gatis_path,index_col = 0)
dfGat.head()

Unnamed: 0_level_0,id,country,name,success,weightclass,class,weight,time_ms,x0,y0,...,z30,v30,x31,y31,z31,v31,x32,y32,z32,v32
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
914,1,LAT,gatis,1,109+,other,70,16716.666667,0.518763,0.394609,...,-0.144154,0.848121,0.680128,0.197192,-0.521266,0.912696,0.365079,0.19753,-0.443593,0.931324
915,1,LAT,gatis,1,109+,other,70,16733.333333,0.522952,0.397789,...,-0.148174,0.848327,0.680036,0.197322,-0.539683,0.915034,0.365008,0.197483,-0.44891,0.933486
916,1,LAT,gatis,1,109+,other,70,16750.0,0.524787,0.398377,...,-0.19404,0.849025,0.680167,0.197384,-0.60075,0.917502,0.36105,0.197456,-0.499846,0.936055
917,1,LAT,gatis,1,109+,other,70,16766.666667,0.525662,0.399457,...,-0.21006,0.851675,0.677279,0.197479,-0.614857,0.92077,0.359379,0.197075,-0.512824,0.93921
918,1,LAT,gatis,1,109+,other,70,16783.333333,0.525538,0.399316,...,-0.205181,0.85138,0.677343,0.197557,-0.555913,0.920654,0.35934,0.196817,-0.491696,0.940389


Normalise data

In [11]:
dfGat_scaled = dfGat.copy(deep=True)
data_to_scale = dfGat_scaled[all_coll_str_y].to_numpy()
data_to_scale = data_to_scale.transpose()
data_to_scale = scaler.fit_transform(data_to_scale)
data_to_scale = data_to_scale.transpose()
dfGat_scaled[all_coll_str_y] = data_to_scale
dfGat_scaled.loc[:,all_coll_str_y].describe()

Unnamed: 0,y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,...,y23,y24,y25,y26,y27,y28,y29,y30,y31,y32
count,8429.0,8429.0,8429.0,8429.0,8429.0,8429.0,8429.0,8429.0,8429.0,8429.0,...,8429.0,8429.0,8429.0,8429.0,8429.0,8429.0,8429.0,8429.0,8429.0,8429.0
mean,0.887726,0.909738,0.909405,0.909067,0.910587,0.910786,0.910805,0.900985,0.901702,0.860209,...,0.436887,0.438823,0.382676,0.391302,0.090838,0.086945,0.041926,0.033653,0.001511,0.007325
std,0.069481,0.073256,0.07351,0.073735,0.072581,0.072344,0.072198,0.073974,0.072441,0.065728,...,0.186145,0.184195,0.101241,0.107096,0.024018,0.022611,0.017229,0.01896,0.009068,0.007471
min,0.613165,0.615275,0.621149,0.627982,0.614341,0.618941,0.625536,0.690861,0.687177,0.651633,...,0.235785,0.237452,0.245893,0.24748,0.050373,0.045003,0.0094,0.0,0.0,0.0
25%,0.829144,0.848794,0.846964,0.844861,0.850971,0.85038,0.849049,0.828113,0.830586,0.796744,...,0.277588,0.278692,0.300857,0.307753,0.07542,0.071019,0.031792,0.019514,0.0,0.001345
50%,0.891285,0.907141,0.906431,0.905376,0.908072,0.907716,0.907072,0.891919,0.892789,0.873265,...,0.396405,0.400566,0.336188,0.340824,0.088535,0.079127,0.039994,0.031257,0.0,0.006211
75%,0.962517,0.996779,0.996512,0.995275,0.996595,0.995622,0.994329,0.978781,0.977394,0.922647,...,0.503151,0.5057,0.440465,0.454124,0.102234,0.102159,0.049961,0.046348,0.0,0.010856
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.987062,...,1.0,1.0,0.702866,0.711718,0.347363,0.274314,0.35379,0.23546,0.222439,0.179862


# Create the Classification models

Separate the data for the model

In [14]:
X = df_scaled.loc[:,all_coll_str_y].copy(deep=True)    # features - xy columns
y = df_scaled.loc[:,"class"]                           # class

Split the data set to test and train datasets

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

### Train Machine Learning Classification Model

In [15]:
# Set the piplines for training the models
pipelines = {
    'lr':make_pipeline(StandardScaler(), LogisticRegression()),
    'rc':make_pipeline(StandardScaler(), RidgeClassifier()),
    'rf':make_pipeline(StandardScaler(), RandomForestClassifier()),
    'gb':make_pipeline(StandardScaler(), GradientBoostingClassifier()),
}

Train the models

In [19]:
fit_models = {}
for algo, pipeline in pipelines.items():
    model = pipeline.fit(X_train, y_train)
    fit_models[algo] = model

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Evaluate and Serialize Model 

In [20]:
from sklearn.metrics import accuracy_score # Accuracy metrics 
import pickle 

In [21]:
for algo, model in fit_models.items():
    yhat = model.predict(X_test)
    print(algo, accuracy_score(y_test, yhat))

lr 0.9086614173228347
rc 0.8642035130224106
rf 0.9365233192004846
gb 0.9205330102967898


In [22]:
model_path = f"{wd()}/data/gradient_boost_classifier.pkl" 
# Save the model to a file
with open(model_path, 'wb') as f:
    pickle.dump(fit_models['gb'], f)

# Load the model from a file
# with open(model_path, 'rb') as f:
#     model_rf = pickle.load(f)

# Make detections


Make detections on the training session data

In [26]:
dfGat_scaled["class"] = fit_models["gb"].predict(dfGat_scaled[all_coll_str_y])

In [27]:
dfGat["class"] = dfGat_scaled["class"]

In [28]:
dfGat["class"].value_counts()

catch        2287
start        1917
overturn     1797
extension    1223
end          1205
Name: class, dtype: int64

In [29]:
util.save_df_to_csv(dfGat,f"{wd()}/data/training_data.csv")

30171,14,LAT,gatis,1,109+,start,95,53733.3333334408,0.5286672115325928,0.4126608371734619,-1.204664945602417,0.999917209148407,0.5319547653198242,0.4251054525375366,-1.172417402267456,0.9998129606246948,0.5344367027282715,0.4263954758644104,-1.1723852157592771,0.9996640086174012,0.5368766784667969,0.4278700351715088,-1.1726367473602295,0.9997432827949524,0.5194018483161926,0.4247443675994873,-1.1942814588546753,0.9998111724853516,0.5123372077941895,0.4255356788635254,-1.1945557594299316,0.9996850490570068,0.5045381188392639,0.4264062047004699,-1.1946663856506348,0.9997379779815674,0.5297791957855225,0.4345366954803467,-0.8948372602462769,0.9994648694992064,0.4756223857402801,0.4298550486564636,-1.0050511360168457,0.9995927214622498,0.5273545980453491,0.407447874546051,-1.090802550315857,0.999954640865326,0.5122041702270508,0.4069744348526001,-1.1215695142745972,0.999959409236908,0.5823746919631958,0.4159466028213501,-0.6831803321838379,0.999942183494568,0.3795725107192993,0.41824352741

### Plot some of the predictions

In [31]:
for id_value in dfGat.id.unique()[:3]:
    print(f"Predictions for lift id: {id_value}")
    plot_feature_given_id(id_value, dfGat,column_order_str)

Predictions for lift id: 1


Predictions for lift id: 2


Predictions for lift id: 4
