## Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame
import datetime
import seaborn as sns
from pylab import *
from pandas import Series
from os import listdir
from os.path import join
from sklearn import linear_model
from sklearn import ensemble
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from sklearn.datasets import load_iris
from sklearn import preprocessing
from datetime import timedelta
from datetime import date
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go
from sklearn import svm
import scipy
from scipy import stats


from IPython.display import display, HTML, Image
pd.options.display.max_columns = None
display(HTML("<style>.container { width:100% !important; }</style>"))
from sklearn.cluster import KMeans

# PART 1 : DATA SELECTION

## Dataset with flat test cases (only data with speed > 60 and vibrations above 0)

In [2]:
folder_flat = 'S:\\Analytics\\TemporaryThings\\difference between RCF and flats\\Flat\\'
dataframe_dic_flat = {} # dataframe dictionary to fill in with data 
dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
for i in listdir(folder_flat):
    tmp = i.split('Export_')[-1].split('.csv')[0]
    fin = join(folder_flat,i)
    dataframe_dic_flat[tmp]=pd.read_csv(
        fin, 
        index_col = False,
        usecols = ['UTC Time Date','BHI','WHI','Speed','RMS X','RMS Y','RMS Z','Peak X','Peak Y','Peak Z','Speed','RSSI','Ave Temp C','Temp Diff C','Voltage','Latitude','Longitude','Speed','Altitude','Heading','X Offset','Y Offset','Z Offset'],
        parse_dates = ['UTC Time Date'],date_parser=dateparse)

In [3]:
df_flat = pd.concat(dataframe_dic_flat)
df_flat = df_flat.reset_index().rename(columns ={'level_0':'Wheel','Peak X':'PeakX'}).drop(['level_1'],axis = 1)
df_flat = df_flat.loc[(df_flat['Speed']>60) & (df_flat['PeakX']> 0) & (df_flat['Peak Y']> 0) & (df_flat['Peak Z']>0) & (df_flat['RMS X']>0) & (df_flat['RMS Y']>0)  & (df_flat['RMS Z']>0)]
df_flat['ID'] = np.zeros(len(df_flat))
df_flat['EHI'] = (np.power(df_flat['RMS X'],2)+np.power(df_flat['RMS Y'],2)+np.power(df_flat['RMS Z'],2))/np.power(df_flat['Speed'],2)

## Dataset with RCF test cases (only data with speed > 60 and vibrations above 0)

In [4]:
folder_RCF = 'S:\\Analytics\\TemporaryThings\\difference between RCF and flats\\RCF\\'
dataframe_dic_RCF = {} 
for i in listdir(folder_RCF):
    tmp = i.split('Export_')[-1].split('.csv')[0]
    fin = join(folder_RCF,i)
    dataframe_dic_RCF[tmp]=pd.read_csv(
        fin, 
        index_col = False,
        usecols = ['UTC Time Date','BHI','WHI','Speed','RMS X','RMS Y','RMS Z','Peak X','Peak Y','Peak Z','Speed','RSSI','Ave Temp C','Temp Diff C','Voltage','Latitude','Longitude','Speed','Altitude','Heading','X Offset','Y Offset','Z Offset'],
        parse_dates = ['UTC Time Date'])

In [5]:
df_RCF = pd.concat(dataframe_dic_RCF)
df_RCF = df_RCF.reset_index().rename(columns ={'level_0':'Wheel','Peak X':'PeakX'}).drop(['level_1'],axis = 1)
df_RCF = df_RCF.loc[(df_RCF['Speed']>60) & (df_RCF['PeakX']> 0) & (df_RCF['Peak Y']> 0) & (df_RCF['Peak Z']>0) & (df_RCF['RMS X']>0) & (df_RCF['RMS Y']>0)  & (df_RCF['RMS Z']>0)]
df_RCF['ID'] = np.full((len(df_RCF), 1),1)
df_RCF['EHI'] = (np.power(df_RCF['RMS X'],2)+np.power(df_RCF['RMS Y'],2)+np.power(df_RCF['RMS Z'],2))/np.power(df_RCF['Speed'],2)

## WHI and EHI time series FLAT

In [6]:
for i, vals in df_flat.groupby('Wheel'):
    df = df_flat.loc[(df_flat.Wheel ==  i)]
    plt.figure(figsize = (30,10))
    plt.plot(df['UTC Time Date'],df['WHI'])
    plt.plot(df['UTC Time Date'],df['EHI']*100)
    plt.legend([i])

## WHI and EHI time series RCF

In [7]:
for i, vals in df_RCF.groupby('Wheel'):
    df = df_RCF.loc[(df_RCF.Wheel ==  i)]
    plt.figure(figsize = (30,10))
    plt.plot(df['UTC Time Date'],df['WHI'])
    #plt.figure(figsize = (30,10))
    plt.plot(df['UTC Time Date'],df['EHI']*100)
    plt.legend([i])

## Aggregation between dataframe with flat cases and RCF cases

In [8]:
df = pd.concat([df_flat,df_RCF])

##  a new dataset with the date of maximum WHI for each test cases (wheel)

In [9]:
WHI_max = {}
WHI_quantile = {}
df = df
for (v,d),vals in df.groupby(['Wheel','ID']):
    WHI_max[v,d] = df.loc[(df.WHI == vals['WHI'].max())]
    WHI_max[v,d]['UTC Time Date'] = pd.to_datetime(WHI_max[v,d]['UTC Time Date'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [10]:
max_WHI = pd.concat(WHI_max).reset_index().drop(['level_0','level_1','level_2'],axis = 1).rename(columns = {'WHI': 'WHI_max'})

## Aggregate the dataset with the date of maximum WHI and the dataframe with flat and RCF cases

In [11]:
df_max = pd.merge(df,max_WHI[['Wheel','ID','UTC Time Date','WHI_max']],on = ['Wheel','ID']).rename(columns ={'UTC Time Date_x' : 'Date','UTC Time Date_y' : 'Date_max_WHI'})

## Select data with WHI high with value closes to the maximum

In [12]:
df_high_range ={}
for (w,ind,maxm),vals in df_max.groupby(['Wheel','ID','Date_max_WHI']):
    if ind == 1:
        df_high_range[w,ind] = df_max.loc[(df_max.Wheel == w) & (df_max.ID == ind) & (df_max.Date <= maxm) & (df_max.WHI >= vals['WHI'].quantile(0.85))]
    else:
        df_high_range[w,ind] = df_max.loc[(df_max.Wheel == w) & (df_max.ID == ind) & (df_max.Date >= maxm - timedelta(days = 7)) & (df_max.Date <= maxm) & (df_max.WHI >= vals['WHI'].quantile(0.85))]

In [13]:
df_high_WHI = pd.concat(df_high_range).reset_index().drop(['level_0','level_1','level_2'],axis = 1)
df_high_WHI['Costumer'],df_high_WHI['Vehicle'],df_high_WHI['Unit'],df_high_WHI['Node'] = df_high_WHI['Wheel'].str.split('_').str

In [14]:
df_high_WHI = df_high_WHI[['Wheel','ID','Date','RSSI', 'Ave Temp C', 'Temp Diff C','BHI','WHI','Voltage','Speed','X Offset','Y Offset','Z Offset','RMS X','RMS Y','RMS Z', 'PeakX', 'Peak Y','Peak Z','EHI']]

## Plot WHI only for the date where it is high (flat)

In [15]:
for i, vals in df_high_WHI.loc[(df_high_WHI.ID ==0)].groupby('Wheel'):
    df = df_high_WHI.loc[(df_high_WHI.ID ==0)].loc[(df_high_WHI.loc[(df_high_WHI.ID ==0)].Wheel ==  i)]
    plt.figure(figsize = (30,10))
    plt.plot(df['Date'],df['WHI'])
    plt.legend([i])

## Plot WHI only for the date where it is high (RCF)

In [16]:
for i, vals in df_high_WHI.loc[(df_high_WHI.ID ==1)].groupby('Wheel'):
    df = df_high_WHI.loc[(df_high_WHI.ID ==1)].loc[(df_high_WHI.loc[(df_high_WHI.ID ==1)].Wheel ==  i)]
    plt.figure(figsize = (30,10))
    plt.plot(df['Date'],df['WHI'])
    plt.legend([i])

# PART 2 : ANALYSIS

## Calculate the vibrations average,25 and 85 percentile per day (flat cases) or per 14 days (RCF cases) and generate a new dataframe 

In [17]:
average = {}
high_percentile = {}
low_percentile = {}

for k in df_high_WHI.ID.unique():
    if k == 0: 
        average[k] = df_high_WHI.loc[(df_high_WHI.ID == k)].groupby(['Wheel',pd.Grouper(key = 'Date',freq='D')]).mean().reset_index()
        high_percentile[k] = df_high_WHI.loc[(df_high_WHI.ID == k)].groupby(['Wheel',pd.Grouper(key = 'Date',freq='D')]).quantile(0.85).reset_index()
        low_percentile[k] = df_high_WHI.loc[(df_high_WHI.ID == k)].groupby(['Wheel',pd.Grouper(key = 'Date',freq='D')]).quantile(0.25).reset_index()
    if k == 1:
        average[k] = df_high_WHI.loc[(df_high_WHI.ID == k)].groupby(['Wheel',pd.Grouper(key = 'Date',freq='14D')]).mean().reset_index()
        high_percentile[k] = df_high_WHI.loc[(df_high_WHI.ID == k)].groupby(['Wheel',pd.Grouper(key = 'Date',freq='14D')]).quantile(0.85).reset_index()
        low_percentile[k] = df_high_WHI.loc[(df_high_WHI.ID == k)].groupby(['Wheel',pd.Grouper(key = 'Date',freq='14D')]).quantile(0.25).reset_index()
        
average = pd.concat(average, sort=False)
high_percentile = pd.concat(high_percentile, sort=False)
low_percentile = pd.concat(low_percentile, sort=False)

In [18]:
percentile = pd.merge(high_percentile,low_percentile,on = ['Wheel','ID'])
final_df = pd.merge(percentile,average,on = ['Wheel','ID']).drop(['Date_x','Date_y'],axis = 1)
final_df.columns = final_df.columns.str.replace('_x','_85 percentile')
final_df.columns = final_df.columns.str.replace('_y','_25 percentile')

## Define train and test set

In [19]:
train, test = train_test_split(final_df['Wheel'].unique(),test_size=0.2)
df_train_l = {}
df_test_l = {}
for t in train:
    df_train_l[t] = final_df.loc[final_df['Wheel']==t]
for te in test:
    df_test_l[te] = final_df.loc[final_df['Wheel']==te]

df_train = pd.concat(df_train_l).reset_index().drop(['level_0','level_1'],axis=1) 
df_test = pd.concat(df_test_l).reset_index().drop(['level_0','level_1'],axis=1)

In [20]:
print(len(df_train[df_train['ID'] == 0]),len(df_test[df_test['ID'] == 0]))

5737 1918


In [21]:
print(len(df_train[df_train['ID'] == 1]),len(df_test[df_test['ID'] == 1]))

19282 6764


## Comparison variable distribution for flat and RCF TRAIN

In [22]:
features_sel = ['PeakX_85 percentile','Peak Y_85 percentile','Peak Z_85 percentile','RMS X_85 percentile','RMS Y_85 percentile','RMS Z_85 percentile','PeakX_25 percentile','Peak Y_25 percentile','Peak Z_25 percentile','RMS X_25 percentile','RMS Y_25 percentile','RMS Z_25 percentile',
                'PeakX','Peak Y','Peak Z','RMS X','RMS Y','RMS Z','EHI','EHI_85 percentile','EHI_25 percentile']

f = 20
flat = df_train[df_train['ID'] == 0]
RCF = df_train[df_train['ID'] == 1]
for i in features_sel:
    fig, axs = plt.subplots(1, 2,figsize=(40, 10))
    axs[0].hist(flat[i],bins=40,color='r',alpha=.5,density = True)
    axs[0].hist(RCF[i],bins=40,color='g',alpha=0.3,density = True)
    axs[0].legend(['flat','RCF'],fontsize = f)
    axs[0].set_title(' '.join([i,'Train','distribution']),fontsize = f)
    axs[0].tick_params(axis="x", labelsize=f)
    axs[0].tick_params(axis="y", labelsize=f)
    axs[1].hist(flat[i],density=True, cumulative=True, label='CDF',histtype='step', alpha=0.8, color='r')
    axs[1].hist(RCF[i],density=True, cumulative=True, label='CDF',histtype='step', alpha=0.8, color='g')
    axs[1].set_title(' '.join([i,'Train','cumulative distribution function']),fontsize = f)
    axs[1].legend(['flat','RCF'],fontsize = f)
    axs[1].tick_params(axis="x", labelsize=f)
    axs[1].tick_params(axis="y", labelsize=f)

## Comparison variable distribution for flat and RCF TEST

In [23]:
f = 20
flat = df_test[df_test['ID'] == 0]
RCF = df_test[df_test['ID'] == 1]
for i in features_sel:
    fig, axs = plt.subplots(1, 2,figsize=(40, 10))
    axs[0].hist(flat[i],bins=40,color='r',alpha=.5,density = True)
    axs[0].hist(RCF[i],bins=40,color='g',alpha=0.3,density = True)
    axs[0].legend(['flat','RCF'],fontsize = f)
    axs[0].set_title(' '.join([i,'Test','distribution']),fontsize = f)
    axs[0].tick_params(axis="x", labelsize=f)
    axs[0].tick_params(axis="y", labelsize=f)
    axs[1].hist(flat[i],density=True, cumulative=True, label='CDF',histtype='step', alpha=0.8, color='r')
    axs[1].hist(RCF[i],density=True, cumulative=True, label='CDF',histtype='step', alpha=0.8, color='g')
    axs[1].set_title(' '.join([i,'Test','cumulative distribution function']),fontsize = f)
    axs[1].legend(['flat','RCF'],fontsize = f)
    axs[1].tick_params(axis="x", labelsize=f)

## DIFFERENCE OF DISTRIBUTION FOR RCF CASES BASED ON RAW VIBRATIONS VALUES

In [24]:
seq = df_train[df_train['ID'] == 1].Wheel.unique()
comp = []
n = 6
lists = [[] for _ in range(n)]
for x in seq: 
    for y in seq:
        if x !=y:
            comp.append((x,y))
comp = pd.DataFrame(comp)

for i in range(0,len(comp)):
    for n,p in enumerate(['PeakX','Peak Y','Peak Z','RMS X','RMS Y','RMS Z']):
        lists[n].append(stats.ks_2samp(df_train.loc[(df_train['ID'] == 1) & (df_train['Wheel'] == comp[0][i])][p], df_train.loc[(df_train['ID'] == 1) & (df_train['Wheel'] == comp[1][i])][p])[0])
    
for n,p in enumerate(['PeakX','Peak Y','Peak Z','RMS X','RMS Y','RMS Z']):
    comp[p] = lists[n] 
comp['Damage'] = 'RCF'
comp = comp.rename(columns = {0: 'Wheel1',1:'Wheel2'})

## RCF CLUSTER ANALYSIS

In [25]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(comp.iloc[:,2:8])
print(kmeans.cluster_centers_) # print location of clusters learned by kmeans object
y_km = kmeans.fit_predict(comp.iloc[:,2:8]) # save new clusters for chart
comp['cluster'] = list(y_km)

[[0.9191612  0.92531932 0.91863944 0.93497101 0.94990007 0.95284   ]
 [0.49035035 0.48608323 0.48098748 0.52136661 0.48957771 0.52427837]
 [0.60032381 0.65609355 0.7958312  0.6701965  0.71585198 0.85690238]
 [0.73845476 0.68573947 0.51255628 0.7658816  0.69446061 0.62155245]]


## CLUSTER ANALYSIS VISUALIZATION-RCF

In [26]:
g = ['PeakX','Peak Y','Peak Z','RMS X','RMS Y','RMS Z','cluster']
f = ['PeakX','Peak Y','Peak Z','RMS X','RMS Y','RMS Z']
plt.figure(figsize = (20,10))
for n,k in enumerate(['red','black','blue','cyan']):
    plt.scatter(comp[g].loc[(comp[g].cluster == n)]['PeakX'], comp[g].loc[(comp[g].cluster == n)]['Peak Y'], s=100, c=k)
    plt.scatter(comp[g].loc[(comp[g].cluster == n)]['Peak Z'], comp[g].loc[(comp[g].cluster == n)]['RMS X'], s=100, c=k)
    plt.scatter(comp[g].loc[(comp[g].cluster == n)]['RMS Y'], comp[g].loc[(comp[g].cluster == n)]['RMS Z'], s=100, c=k)
plt.title('RCF cluster output based on vibrations distance')

## DIFFERENCE OF DISTRIBUTION FOR FLATS CASES BASED ON RAW VIBRATIONS VALUES

In [27]:
seq = df_train[df_train['ID'] == 0].Wheel.unique()
comp2 = []
n = 6
lists = [[] for _ in range(n)]
for x in seq: 
    for y in seq:
        if x !=y:
            comp2.append((x,y))
comp2= pd.DataFrame(comp2)

for i in range(0,len(comp2)):
    for n,p in enumerate(['PeakX','Peak Y','Peak Z','RMS X','RMS Y','RMS Z']):
        lists[n].append(stats.ks_2samp(df_train.loc[(df_train['ID'] == 0) & (df_train['Wheel'] == comp2[0][i])][p], df_train.loc[(df_train['ID'] == 0) & (df_train['Wheel'] == comp2[1][i])][p])[0])
    
for n,p in enumerate(['PeakX','Peak Y','Peak Z','RMS X','RMS Y','RMS Z']):
    comp2[p] = lists[n]  
comp2['Damage'] = 'Flat'
comp2 = comp.rename(columns = {0: 'Wheel1',1:'Wheel2'})

## FLATS CLUSTER ANALYSIS

In [28]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(comp2.iloc[:,2:8])
print(kmeans.cluster_centers_) # print location of clusters learned by kmeans object
y_km = kmeans.fit_predict(comp2.iloc[:,2:8]) # save new clusters for chart
comp2['cluster'] = list(y_km)

[[0.60178146 0.68156799 0.8067221  0.66536591 0.74044931 0.85666874]
 [0.48720426 0.48577638 0.48514767 0.50786418 0.48960214 0.52122632]
 [0.92415094 0.92627142 0.90913801 0.93972538 0.95055629 0.94495339]
 [0.71248716 0.64408269 0.51256411 0.76369198 0.65463754 0.63628882]]


## CLUSTER ANALYSIS VISUALIZATION-FLATS

In [29]:
g = ['PeakX','Peak Y','Peak Z','RMS X','RMS Y','RMS Z','cluster']
f = ['PeakX','Peak Y','Peak Z','RMS X','RMS Y','RMS Z']
plt.figure(figsize = (20,10))
for n,k in enumerate(['red','black','blue','cyan']):
    plt.scatter(comp2[g].loc[(comp2[g].cluster == n)]['PeakX'], comp2[g].loc[(comp2[g].cluster == n)]['Peak Y'], s=100, c=k)
    plt.scatter(comp2[g].loc[(comp2[g].cluster == n)]['Peak Z'], comp2[g].loc[(comp2[g].cluster == n)]['RMS X'], s=100, c=k)
    plt.scatter(comp2[g].loc[(comp2[g].cluster == n)]['RMS Y'], comp2[g].loc[(comp2[g].cluster == n)]['RMS Z'], s=100, c=k)
    plt.title('Flat cluster output based on vibrations distance')

#### Based on raw vibrations the results are not good. Make the same analysis using the percentile

## Features Normalization

In [30]:
normalized_train_X = preprocessing.normalize(df_train[features_sel])
normalized_test_X = preprocessing.normalize(df_test[features_sel])

## PCA train

In [31]:
pca = PCA(n_components = 3)
principalComponents = pca.fit_transform(normalized_train_X)
print(pca.explained_variance_ratio_)
principalDf = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2', 'principal component 3'])
finalDf = pd.concat([principalDf, df_train[['ID']]], axis = 1)

[0.42289642 0.31350679 0.13370735]


## Features and components correlation matrix TRAIN

In [32]:
f = 30
fig, ax = plt.subplots(figsize=(40,10))
ax.matshow(pca.components_,cmap='viridis')
ax.set_yticklabels(['']+['1st Comp','2nd Comp','3rd Comp'],fontsize=f)
fig.colorbar(ax.matshow(pca.components_,cmap='viridis')).ax.tick_params(labelsize=f)
plt.title('Correlation matrix between components and features: Train set', y=-0.4,fontsize=f)
plt.xticks(range(len(features_sel)),features_sel,rotation=65,ha='left',fontsize=f)
plt.tight_layout()
plt.show() 

## EXPALINED VARIANCE RATIO TRAIN

In [33]:
f = 30
plt.figure(figsize = (30,10))
plt.grid()
pca = PCA().fit(normalized_train_X)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.axvline(x = 3,color = 'r')
plt.xticks(arange(24, step=1),fontsize = f)
plt.yticks(fontsize = f)
plt.title('Explained variance ratio: Train set',fontsize = f)
plt.xlabel('number of components',fontsize = f)
plt.ylabel('cumulative explained variance',fontsize = f);

## Apply transform to both the training set and the test set.

In [34]:
train_t = pca.transform(normalized_train_X)
test_t = pca.transform(normalized_test_X)

## Apply Logistic Regression to the Transformed Data

In [35]:
logisticRegr = LogisticRegression(solver = 'lbfgs')
logisticRegr.fit(train_t,df_train['ID'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

## MEASURE COMPONENT PERFORMANCE

In [36]:
logisticRegr.score(test_t,df_test['ID'])

0.7719419488597098

In [37]:
y_pred_l = logisticRegr.predict(test_t)
print(classification_report(df_test['ID'], y_pred_l))

              precision    recall  f1-score   support

         0.0       0.20      0.01      0.02      1918
         1.0       0.78      0.99      0.87      6764

   micro avg       0.77      0.77      0.77      8682
   macro avg       0.49      0.50      0.45      8682
weighted avg       0.65      0.77      0.68      8682



In [38]:
mat = confusion_matrix(df_test['ID'], y_pred_l)
sns.heatmap(mat.T, square=True, annot=True,cmap='coolwarm', fmt='d', cbar=True)
plt.title('Logistic regression with components')
plt.xlabel('true label')
plt.ylabel('predicted label');

## FIRST TWO COMPONENTS VISUALIZATION TRAIN

In [39]:
f = 30
fig = plt.figure(figsize = (40,20))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Principal Component 1', fontsize = f)
ax.set_ylabel('Principal Component 2', fontsize = f)
ax.set_title('First 2 PCA Component', fontsize = f)
targets = [0,1]
colors = ['r', 'g']
for target, color in zip(targets,colors):
    indicesToKeep = finalDf['ID'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
               , finalDf.loc[indicesToKeep, 'principal component 2']
               , c = color
               , s = 70)
ax.legend(['Wheel_flat','RCF'],fontsize = 30)
plt.xticks(fontsize = f)
plt.yticks(fontsize = f)
plt.title('PCA results: Training set',fontsize = f)
ax.grid()

## SELECT ANHD NORMALIZED THE FATURES WITH THE MAIN IMPACT IN THE COMPONENTS

In [40]:
features_m = ['PeakX','Peak Y','PeakX_25 percentile','Peak Y_25 percentile','PeakX_85 percentile','Peak Y_85 percentile']
normalized_train_X_m = preprocessing.normalize(df_train[features_m])
normalized_test_X_m = preprocessing.normalize(df_test[features_m])

## SVM EXTIMATION

In [41]:
clf = svm.SVC(kernel='poly',degree = 4,gamma='scale',class_weight = 'balanced')
clf.fit(normalized_train_X_m, df_train['ID'])
y_pred = clf.predict(normalized_test_X_m)

## SVM: EVALUATING THE ALGORITHM

In [42]:
print(classification_report(df_test['ID'], y_pred))

              precision    recall  f1-score   support

         0.0       0.41      0.86      0.55      1918
         1.0       0.94      0.64      0.76      6764

   micro avg       0.69      0.69      0.69      8682
   macro avg       0.67      0.75      0.66      8682
weighted avg       0.82      0.69      0.72      8682



In [43]:
mat = confusion_matrix(df_test['ID'], y_pred)
sns.heatmap(mat.T, square=True, annot=True,cmap='coolwarm', fmt='d', cbar=True)
plt.title('SVM Model')
plt.xlabel('true label')
plt.ylabel('predicted label');

## LOGISTIC REGRESSION WITH ORIGINAL FEATURES WITH THE MAIN IMPACT IN THE COMPONENT

In [44]:
logisticRegr = LogisticRegression(solver = 'saga',class_weight = 'balanced')
logisticRegr.fit(normalized_train_X_m,df_train['ID'])
y_pred_log = logisticRegr.predict(normalized_test_X_m)
print(classification_report(df_test['ID'], y_pred_log))

              precision    recall  f1-score   support

         0.0       0.35      0.80      0.48      1918
         1.0       0.91      0.57      0.70      6764

   micro avg       0.62      0.62      0.62      8682
   macro avg       0.63      0.69      0.59      8682
weighted avg       0.79      0.62      0.65      8682



In [45]:
mat = confusion_matrix(df_test['ID'], y_pred_log)
sns.heatmap(mat.T, square=True, annot=True,cmap='coolwarm', fmt='d', cbar=True)
plt.title('Logistic regression')
plt.xlabel('true label')
plt.ylabel('predicted label');