In [1]:
import numpy as np 
import pandas as pd 

df = pd.read_csv('clean_data.csv')
df.head()

Unnamed: 0,Age,Gender,Region,Industry,Work_Arrangement,Hours_Per_Week,Burnout_Level,Work_Life_Balance_Score,Social_Isolation_Score,Avg_Salary,Physical_Health_Count,Mental_Health_Score
0,27,Female,Asia,Professional Services,Onsite,64,3,3,2,50000.0,2,2
1,37,Female,Asia,Professional Services,Onsite,37,3,4,2,90000.0,1,2
2,32,Female,Africa,Education,Onsite,36,3,3,2,90000.0,2,1
3,40,Female,Europe,Education,Onsite,63,2,1,2,70000.0,2,1
4,30,Male,South America,Manufacturing,Hybrid,65,2,5,4,70000.0,0,0


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3035 entries, 0 to 3034
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      3035 non-null   int64  
 1   Gender                   3035 non-null   object 
 2   Region                   3035 non-null   object 
 3   Industry                 3035 non-null   object 
 4   Work_Arrangement         3035 non-null   object 
 5   Hours_Per_Week           3035 non-null   int64  
 6   Burnout_Level            3035 non-null   int64  
 7   Work_Life_Balance_Score  3035 non-null   int64  
 8   Social_Isolation_Score   3035 non-null   int64  
 9   Avg_Salary               3035 non-null   float64
 10  Physical_Health_Count    3035 non-null   int64  
 11  Mental_Health_Score      3035 non-null   int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 284.7+ KB


In [3]:
# get all the numeric features
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of Numerical Features :', len(num_features))

Num of Numerical Features : 8


In [4]:
# categorical features
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of Categorical Features :', len(cat_features))

Num of Categorical Features : 4


In [5]:
# Discrete features
discrete_features=[feature for feature in num_features if len(df[feature].unique())<=25]
print('Num of Discrete Features :',len(discrete_features))

Num of Discrete Features : 6


In [6]:
# coontinuous features
continuous_features=[feature for feature in num_features if feature not in discrete_features]
print('Num of Continuous Features :',len(continuous_features))

Num of Continuous Features : 2


In [7]:
from sklearn.model_selection import train_test_split
X = df.drop('Burnout_Level', axis=1)
y = df['Burnout_Level']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
y.value_counts()

Burnout_Level
2    1319
3    1005
1     711
Name: count, dtype: int64

In [9]:
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

print('Numerical Features:', num_features)
print('Categorical Features:', cat_features)

Numerical Features: Index(['Age', 'Hours_Per_Week', 'Work_Life_Balance_Score',
       'Social_Isolation_Score', 'Avg_Salary', 'Physical_Health_Count',
       'Mental_Health_Score'],
      dtype='object')
Categorical Features: Index(['Gender', 'Region', 'Industry', 'Work_Arrangement'], dtype='object')


In [10]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(drop='first'), cat_features)
])


In [11]:
preprocessor

In [12]:
X_train = preprocessor.fit_transform(X_train)

In [13]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,-1.640312,1.602176,1.719090,-0.601007,-0.554035,1.043399,0.634992,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.803435,-1.646188,0.007402,0.241513,0.392620,0.107193,0.166941,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-1.403820,-0.526062,0.863246,-1.443527,-1.500689,1.979604,-1.237213,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-1.009667,0.594063,0.863246,0.241513,1.339275,1.043399,1.571094,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-1.246159,0.370038,-0.848441,0.241513,-1.500689,-0.829012,-0.769161,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2423,-0.142531,-1.198138,0.007402,1.084033,0.392620,-1.765218,1.571094,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2424,-0.300192,1.378151,0.863246,0.241513,0.392620,-0.829012,1.103043,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2425,-0.300192,1.714189,0.007402,0.241513,-1.500689,1.043399,-1.237213,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2426,0.093960,0.258026,-0.848441,-0.601007,0.392620,-0.829012,-0.769161,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [14]:
X_test = preprocessor.transform(X_test)

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score


In [16]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [21]:
y_test[:5]

93      3
2714    3
1614    1
2010    2
2560    2
Name: Burnout_Level, dtype: int64

In [23]:
model.predict(X_test[:5])

array([1, 2, 2, 2, 2], dtype=int64)

In [31]:
# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Training set performance
model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
model_train_precision = precision_score(y_train, y_train_pred, average='weighted') # Calculate Precision
model_train_recall = recall_score(y_train, y_train_pred, average='weighted') # Calculate Recall


# Test set performance
model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
model_test_precision = precision_score(y_test, y_test_pred, average='weighted') # Calculate Precision
model_test_recall = recall_score(y_test, y_test_pred, average='weighted') # Calculate Recall


In [32]:
    
print('Model performance for Training set')
print("- Accuracy: {:.4f}".format(model_train_accuracy))
print('- F1 score: {:.4f}'.format(model_train_f1))

print('- Precision: {:.4f}'.format(model_train_precision))
print('- Recall: {:.4f}'.format(model_train_recall))



print('----------------------------------')

print('Model performance for Test set')
print('- Accuracy: {:.4f}'.format(model_test_accuracy))
print('- F1 score: {:.4f}'.format(model_test_f1))
print('- Precision: {:.4f}'.format(model_test_precision))
print('- Recall: {:.4f}'.format(model_test_recall))


Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.4102
- F1 score: 0.3746
- Precision: 0.3887
- Recall: 0.4102


In [99]:
from sklearn.svm import SVC
svm_clf = SVC(decision_function_shape='ovr', max_iter=5000)
svm_clf.fit(X_train, y_train)

In [100]:
y_test_pred = svm_clf.predict(X_test)

accuracy_score(y_test, y_test_pred)

0.42833607907743

In [84]:
from sklearn.multiclass import OneVsRestClassifier 

ovr_clf = OneVsRestClassifier(SVC(decision_function_shape='ovr')) 
ovr_clf.fit(X_train, y_train)

In [85]:
y_test_pred = ovr_clf.predict(X_test)

accuracy_score(y_test, y_test_pred)

0.4085667215815486

In [80]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(penalty='l2') 
sgd_clf.fit(X_train, y_train) 

In [81]:
y_test_pred = sgd_clf.predict(X_test)

accuracy_score(y_test, y_test_pred)

0.4052718286655684

---

In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.metrics import precision_score, recall_score, f1_score


In [None]:
models={
    "Logisitic Regression" : LogisticRegression(),
    "Decision Tree" : DecisionTreeClassifier(),
    "Random Forest" : RandomForestClassifier(),
    "Gradient Boost" : GradientBoostingClassifier(),
    "SGD Classifier" : SGDClassifier(),
    "OVR Classifier" : OneVsRestClassifier(SVC()),
    "SVM Classifier" : SVC() 
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred, average='weighted') # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred, average='weighted') # Calculate Recall


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred, average='weighted') # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred, average='weighted') # Calculate Recall


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))

    
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))

    
    print('='*35)
    print('\n')


Logisitic Regression
Model performance for Training set
- Accuracy: 0.4584
- F1 score: 0.3755
- Precision: 0.5131
- Recall: 0.4584
----------------------------------
Model performance for Test set
- Accuracy: 0.4267
- F1 score: 0.3396
- Precision: 0.3173
- Recall: 0.4267


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.3377
- F1 score: 0.3380
- Precision: 0.3387
- Recall: 0.3377


Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.3756
- F1 score: 0.3396
- Precision: 0.3439
- Recall: 0.3756


Gradient Boost
Model performance for Training set
- Accuracy: 0.5861
- F1 score: 0.5555
- Precision: 0.6177
- Recall: 0.5861
----------------------------------
Model performance for Test set
-

Logistic Regression, Gradient Boost, SVM Classifier, SGD Classifier, OVR Classifier are comparatively good 

In [112]:
# Hyperparameter Training
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
    'class_weight': [None, 'balanced'],
    'max_iter': [100, 200, 500]
}
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0]
}
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}
param_grid_sgd = {
    'loss': ['hinge', 'huber', 'modified_huber'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['optimal', 'invscaling']
}


In [113]:
# Models list for Hyperparameter tuning
randomcv_models = [
                   ("Logistic Regression", LogisticRegression(), param_grid_lr),
                   ("Gradient Boost", GradientBoostingClassifier(), param_grid_gb),
                   ("SVM Classifier", SVC(), param_grid_svm),
                   ("SGD Classifier", SGDClassifier(eta0=0.001), param_grid_sgd)
]

In [114]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=5,
                                   verbose=1,
                                   n_jobs=-1,
                                   random_state=42)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])




Fitting 5 folds for each of 60 candidates, totalling 300 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits




Fitting 5 folds for each of 18 candidates, totalling 90 fits




Fitting 5 folds for each of 54 candidates, totalling 270 fits
---------------- Best Params for Logistic Regression -------------------
{'solver': 'liblinear', 'max_iter': 100, 'class_weight': None, 'C': 0.1}
---------------- Best Params for Gradient Boost -------------------
{'subsample': 0.8, 'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 3, 'learning_rate': 0.01}
---------------- Best Params for SVM Classifier -------------------
{'kernel': 'linear', 'gamma': 'scale', 'C': 0.1}
---------------- Best Params for SGD Classifier -------------------
{'penalty': 'l2', 'loss': 'modified_huber', 'learning_rate': 'optimal', 'alpha': 0.01}


In [117]:
model_param.keys()

dict_keys(['Logistic Regression', 'Gradient Boost', 'SVM Classifier', 'SGD Classifier'])

In [119]:
models={
    "Logisitic Regression" : LogisticRegression(**model_param['Logistic Regression']),
    "Gradient Boost" : GradientBoostingClassifier(**model_param['Gradient Boost']),
    "SGD Classifier" : SGDClassifier(**model_param['SGD Classifier']),
    "OVR Classifier" : OneVsRestClassifier(SVC(**model_param['SVM Classifier'])),
    "SVM Classifier" : SVC(**model_param['SVM Classifier']) 
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred, average='weighted') # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred, average='weighted') # Calculate Recall

    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred, average='weighted') # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred, average='weighted') # Calculate Recall

    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))


    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))

    
    print('='*35)
    print('\n')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logisitic Regression
Model performance for Training set
- Accuracy: 0.4588
- F1 score: 0.3743
- Precision: 0.3599
- Recall: 0.4588
----------------------------------
Model performance for Test set
- Accuracy: 0.4250
- F1 score: 0.3351
- Precision: 0.3155
- Recall: 0.4250




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Gradient Boost
Model performance for Training set
- Accuracy: 0.4765
- F1 score: 0.3597
- Precision: 0.4274
- Recall: 0.4765
----------------------------------
Model performance for Test set
- Accuracy: 0.4250
- F1 score: 0.2957
- Precision: 0.3214
- Recall: 0.4250


SGD Classifier
Model performance for Training set
- Accuracy: 0.4440
- F1 score: 0.4068
- Precision: 0.4199
- Recall: 0.4440
----------------------------------
Model performance for Test set
- Accuracy: 0.4053
- F1 score: 0.3691
- Precision: 0.3870
- Recall: 0.4053


OVR Classifier
Model performance for Training set
- Accuracy: 0.3925
- F1 score: 0.3914
- Precision: 0.3941
- Recall: 0.3925
----------------------------------
Model performance for Test set
- Accuracy: 0.3641
- F1 score: 0.3619
- Precision: 0.3637
- Recall: 0.3641


SVM Classifier
Model performance for Training set
- Accuracy: 0.4485
- F1 score: 0.3645
- Precision: 0.3481
- Recall: 0.4485
----------------------------------
Model performance for Test set
- Acc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Applying SMOTE : 

In [126]:
from imblearn.over_sampling import SMOTE
X_res_train, y_res_train = SMOTE(random_state=42).fit_resample(X_train, y_train)
X_res_test, y_res_test = SMOTE(random_state=42).fit_resample(X_test, y_test)

In [127]:
models={
    "Logisitic Regression" : LogisticRegression(),
    "Decision Tree" : DecisionTreeClassifier(),
    "Random Forest" : RandomForestClassifier(),
    "Gradient Boost" : GradientBoostingClassifier(),
    "SGD Classifier" : SGDClassifier(),
    "OVR Classifier" : OneVsRestClassifier(SVC()),
    "SVM Classifier" : SVC() 
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_res_train, y_res_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_res_train)
    y_test_pred = model.predict(X_res_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_res_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_res_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_res_train, y_train_pred, average='weighted') # Calculate Precision
    model_train_recall = recall_score(y_res_train, y_train_pred, average='weighted') # Calculate Recall


    # Test set performance
    model_test_accuracy = accuracy_score(y_res_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_res_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_res_test, y_test_pred, average='weighted') # Calculate Precision
    model_test_recall = recall_score(y_res_test, y_test_pred, average='weighted') # Calculate Recall


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))

    
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))

    
    print('='*35)
    print('\n')


Logisitic Regression
Model performance for Training set
- Accuracy: 0.4377
- F1 score: 0.4224
- Precision: 0.4300
- Recall: 0.4377
----------------------------------
Model performance for Test set
- Accuracy: 0.3411
- F1 score: 0.3219
- Precision: 0.3181
- Recall: 0.3411


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.3861
- F1 score: 0.3863
- Precision: 0.3879
- Recall: 0.3861


Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.4247
- F1 score: 0.4224
- Precision: 0.4330
- Recall: 0.4247


Gradient Boost
Model performance for Training set
- Accuracy: 0.6484
- F1 score: 0.6477
- Precision: 0.6623
- Recall: 0.6484
----------------------------------
Model performance for Test set
-

In [128]:
# Hyperparameter Training
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
    'class_weight': [None, 'balanced'],
    'max_iter': [100, 200, 500]
}
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0]
}
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}
param_grid_sgd = {
    'loss': ['hinge', 'huber', 'modified_huber'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['optimal', 'invscaling']
}


In [129]:
# Models list for Hyperparameter tuning
randomcv_models = [
                   ("Logistic Regression", LogisticRegression(), param_grid_lr),
                   ("Gradient Boost", GradientBoostingClassifier(), param_grid_gb),
                   ("SVM Classifier", SVC(), param_grid_svm),
                   ("SGD Classifier", SGDClassifier(eta0=0.001), param_grid_sgd)
]

In [131]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=5,
                                   verbose=1,
                                   n_jobs=-1,
                                   random_state=42)
    random.fit(X_res_train, y_res_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])




Fitting 5 folds for each of 60 candidates, totalling 300 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits




Fitting 5 folds for each of 18 candidates, totalling 90 fits




Fitting 5 folds for each of 54 candidates, totalling 270 fits
---------------- Best Params for Logistic Regression -------------------
{'solver': 'lbfgs', 'max_iter': 100, 'class_weight': None, 'C': 0.1}
---------------- Best Params for Gradient Boost -------------------
{'subsample': 0.8, 'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 7, 'learning_rate': 0.1}
---------------- Best Params for SVM Classifier -------------------
{'kernel': 'rbf', 'gamma': 'scale', 'C': 10}
---------------- Best Params for SGD Classifier -------------------
{'penalty': 'elasticnet', 'loss': 'modified_huber', 'learning_rate': 'optimal', 'alpha': 0.01}


In [133]:
models={
    "Logisitic Regression" : LogisticRegression(**model_param['Logistic Regression']),
    "Gradient Boost" : GradientBoostingClassifier(**model_param['Gradient Boost']),
    "SGD Classifier" : SGDClassifier(**model_param['SGD Classifier']),
    "OVR Classifier" : OneVsRestClassifier(SVC(**model_param['SVM Classifier'])),
    "SVM Classifier" : SVC(**model_param['SVM Classifier']) 
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_res_train, y_res_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_res_train)
    y_test_pred = model.predict(X_res_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_res_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_res_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_res_train, y_train_pred, average='weighted') # Calculate Precision
    model_train_recall = recall_score(y_res_train, y_train_pred, average='weighted') # Calculate Recall

    # Test set performance
    model_test_accuracy = accuracy_score(y_res_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_res_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_res_test, y_test_pred, average='weighted') # Calculate Precision
    model_test_recall = recall_score(y_res_test, y_test_pred, average='weighted') # Calculate Recall

    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))


    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))

    
    print('='*35)
    print('\n')

Logisitic Regression
Model performance for Training set
- Accuracy: 0.4412
- F1 score: 0.4238
- Precision: 0.4340
- Recall: 0.4412
----------------------------------
Model performance for Test set
- Accuracy: 0.3411
- F1 score: 0.3217
- Precision: 0.3182
- Recall: 0.3411


Gradient Boost
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.4337
- F1 score: 0.4338
- Precision: 0.4437
- Recall: 0.4337


SGD Classifier
Model performance for Training set
- Accuracy: 0.4299
- F1 score: 0.3798
- Precision: 0.4199
- Recall: 0.4299
----------------------------------
Model performance for Test set
- Accuracy: 0.3861
- F1 score: 0.3386
- Precision: 0.3763
- Recall: 0.3861


OVR Classifier
Model performance for Training set
- Accuracy: 0.9443
- F1 score: 0.9443
- Precision: 0.9444
- Recall: 0.9443
----------------------------------
Model performance for Test set

Gradient Boost improves after SMOTE but didn't cross the SVM Classifier without SMOTE

So far SVM Classifier (without SMOTE) got maximum values i.e.: 

Model performance for Training set
- Accuracy: 0.4485
- F1 score: 0.3645
- Precision: 0.3481
- Recall: 0.4485
----------------------------------
Model performance for Test set
- Accuracy: 0.4399
- F1 score: 0.3503
- Precision: 0.3380
- Recall: 0.4399