Importing the dataset

In [19]:

from google.colab import drive
import os
import pandas as pd

def import_dataset():
	
  drive.mount('/content/gdrive', force_remount=True)
  input=os.path.join('./CE888/Data/','combined_lagEDA.csv')
  input=os.path.join(os.path.join('gdrive', 'MyDrive', input))
  df=pd.read_csv(input)
  return df

Function to calculate performance

In [11]:
from sklearn.metrics import classification_report, f1_score, accuracy_score
import numpy as np

def calculate_performance(y_true, y_pred):
     

  acc = accuracy_score(y_true, y_pred)
  print('Accuracy Score: ', acc)

  f1score=f1_score(y_true, y_pred, average='macro')
  print('F1 Score(macro): ', f1score)


  print(classification_report(y_true, y_pred))



Print Accuracy

In [10]:
def print_accuracy(scores):
  print('Fold accuracy', scores['test_accuracy'])
  print('Average accuracy', np.mean(scores['test_accuracy']))

Data Preprocessing

In [12]:
def preprocess_data(df_input):
  processed_dataframe=df_input.drop(['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30'],axis=1)
  input_array = processed_dataframe.drop(['Stress'], axis=1).to_numpy()
  output_array = processed_dataframe.loc[:,'Stress'].to_numpy()
  print(input_array.shape)
  print(output_array.shape)
  return input_array,output_array

Importing the libraries

In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier

Calling the functions for import

In [20]:
df_input=import_dataset()
input_array,output_array=preprocess_data(df_input)

Mounted at /content/gdrive
(12445, 18)
(12445,)


Data Splitting

In [21]:
input_train, input_test, output_train, output_test = train_test_split(input_array, output_array, test_size=0.2, stratify=output_array)

Grid Search CV based SVM model

In [29]:
cv_svc = StratifiedKFold(shuffle=True,n_splits=10)
svm=SVC()

pipe_svc = Pipeline([('scaler',MinMaxScaler()),('svm',svm)])  # build pipeline
param_grid_svc = {
                 'svm__kernel':['linear', 'poly'],
                 'svm__gamma':['scale', 'auto'],
                
                 }
search_svc = GridSearchCV(pipe_svc, param_grid_svc, n_jobs=-1)
scores_svc = cross_validate(search_svc, input_train, output_train, scoring=['accuracy'], cv=cv_svc, return_estimator=True)

Calculating Accuracy

In [30]:
print_accuracy(scores_svc)

y_predict = cross_val_predict(pipe_svc, input_train, output_train, cv=cv_svc)

calculate_performance(output_train,y_predict)

#Model evaluation on Test Data
search_svc.fit(input_train,output_train)
test_pred=search_svc.best_estimator_.predict(input_test)

calculate_performance(output_test,test_pred)

Fold accuracy [0.65863454 0.6435743  0.63654618 0.64457831 0.6435743  0.66767068
 0.64020101 0.64924623 0.63517588 0.65025126]
Average accuracy 0.6469452685112308
Accuracy Score:  0.6893330654881479
F1 Score(macro):  0.5908936520259508
              precision    recall  f1-score   support

         0.0       0.81      0.76      0.79      4328
         1.0       0.67      0.18      0.28      1800
         2.0       0.60      0.85      0.70      3828

    accuracy                           0.69      9956
   macro avg       0.69      0.60      0.59      9956
weighted avg       0.70      0.69      0.66      9956

Accuracy Score:  0.635596625150663
F1 Score(macro):  0.4834475995103351
              precision    recall  f1-score   support

         0.0       0.74      0.72      0.73      1082
         1.0       0.87      0.03      0.06       450
         2.0       0.55      0.83      0.67       957

    accuracy                           0.64      2489
   macro avg       0.72      0.52      

Grid Search cv based DT model

In [32]:
cv_DT = StratifiedKFold(shuffle=True,n_splits=10)
dt=DecisionTreeClassifier()

pipe_DT = Pipeline([('scaler',MinMaxScaler()),('dt', dt)])  # build pipeline
param_grid_DT = {
                 'dt__random_state':[0, 1, 2, 3, 4, 5, 10, 15,20,35,50],
                 'dt__criterion':['gini','entropy'],
                 }
search_DT = GridSearchCV(pipe_DT, param_grid_DT, n_jobs=-1)
scores_DT = cross_validate(search_DT, input_train, output_train, scoring=['accuracy'], cv=cv_DT, return_estimator=True)

Calculating Accuracy

In [34]:
print_accuracy(scores_DT)

y_predict = cross_val_predict(pipe_DT, input_train, output_train, cv=cv_DT)

calculate_performance(output_train,y_predict)

#Model evaluation on Test Data
search_DT.fit(input_train,output_train)
test_pred=search_DT.best_estimator_.predict(input_test)

calculate_performance(output_test,test_pred)

Fold accuracy [0.86646586 0.84538153 0.88253012 0.8564257  0.87550201 0.84236948
 0.87638191 0.86633166 0.84924623 0.85628141]
Average accuracy 0.8616915904825332
Accuracy Score:  0.8323623945359582
F1 Score(macro):  0.8058963563320941
              precision    recall  f1-score   support

         0.0       0.89      0.90      0.90      4328
         1.0       0.70      0.70      0.70      1800
         2.0       0.83      0.82      0.82      3828

    accuracy                           0.83      9956
   macro avg       0.81      0.81      0.81      9956
weighted avg       0.83      0.83      0.83      9956

Accuracy Score:  0.8726396143029329
F1 Score(macro):  0.8509485222638511
              precision    recall  f1-score   support

         0.0       0.92      0.92      0.92      1082
         1.0       0.77      0.76      0.76       450
         2.0       0.86      0.87      0.87       957

    accuracy                           0.87      2489
   macro avg       0.85      0.85     