In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import xgboost as xgb
import pickle
from imblearn.combine import SMOTETomek, SMOTEENN
from datetime import date

# Load the dataset

df = pd.read_csv('EasyVisa.csv')
current_year = date.today().year
df['company_age'] = current_year - df['yr_of_estab']

df.drop(['case_id', 'yr_of_estab'], axis=1, inplace=True)

x = df.drop('case_status', axis=1)
y = df['case_status']

y = y.map({'Certified': 1, 'Denied': 0}).astype(int)

ordinal_columns = ['education_of_employee', 'has_job_experience', 'requires_job_training', 'full_time_position']
onehot_columns = ['region_of_employment', 'continent', 'unit_of_wage']
transform_columns = ['no_of_employees', 'company_age']
num_features = [feature for feature in x.columns if x[feature].dtypes != 'O']

num_transformer = StandardScaler()
onehot_transformer = OneHotEncoder()
ordinal_encoder = OrdinalEncoder()

transformer_pipe = Pipeline(steps=[
    ('transformer', PowerTransformer(method='yeo-johnson'))
])

preprocessor = ColumnTransformer(transformers=[
    ("OneHotEncoder", onehot_transformer, onehot_columns),
    ("OrdinalEncoder", ordinal_encoder, ordinal_columns),
    ("Transformer", transformer_pipe, transform_columns),
    ("StandardScaler", num_transformer, num_features)
])

x = preprocessor.fit_transform(x)

smt = SMOTETomek(random_state=42, sampling_strategy='minority')
x_new, y_new = smt.fit_resample(x, y)

x_train, x_test, y_train, y_test = train_test_split(x_new, y_new, test_size=0.2, random_state=42)

def evaluate_clf(true, predicted):
  acc = accuracy_score(true, predicted)
  prec = precision_score(true, predicted)
  rec = recall_score(true, predicted)
  f1 = f1_score(true, predicted)
  roc_score = roc_auc_score(true, predicted)
  class_report = classification_report(true, predicted)
  conf_matrix = confusion_matrix(true, predicted)
  roc_cur = roc_curve(true, predicted)
  return acc, prec, rec, f1, roc_score, class_report, conf_matrix, roc_cur


models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "CatBoost": CatBoostClassifier(verbose=False),
    "XGBoost": XGBClassifier(),
    "SVC": SVC(),
    "AdaBoost": AdaBoostClassifier()
}

def evaluate_models(x_train, x_test, y_train, y_test, models):
  models_list = []
  accuracy_list = []
  precision_list = []
  recall_list = []
  f1_list = []
  roc_list = []

  for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    model_train_acc, model_train_prec, model_train_rec, model_train_f1, model_train_roc, model_train_class_report, model_train_conf_matrix, model_train_roc_cur = evaluate_clf(y_train, y_train_pred)
    model_test_acc, model_test_prec, model_test_rec, model_test_f1, model_test_roc, model_test_class_report, model_test_conf_matrix, model_test_roc_cur = evaluate_clf(y_test, y_test_pred)

    print(list(models.keys())[i])
    models_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_acc))
    print('- Precision: {:.4f}'.format(model_train_prec))
    print('- Recall: {:.4f}'.format(model_train_rec))
    print('- F1 Score: {:.4f}'.format(model_train_f1))
    print('- ROC AUC Score: {:.4f}'.format(model_train_roc))
    print(f'- Classification Report \n {model_train_class_report}')
    print(f'- Confusion Matrix \n {model_train_conf_matrix}')
    print(f'- ROC Curve \n {model_train_roc_cur}')
    print('----------------------------------')
    print()

    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(model_test_acc))
    accuracy_list.append(model_test_acc)
    print('- Precision: {:.4f}'.format(model_test_prec))
    precision_list.append(model_test_prec)
    print('- Recall: {:.4f}'.format(model_test_rec))
    recall_list.append(model_test_rec)
    print('- F1 Score: {:.4f}'.format(model_test_f1))
    f1_list.append(model_test_f1)
    print('- ROC AUC Score: {:.4f}'.format(model_test_roc))
    roc_list.append(model_test_roc)
    print(f'- Classification Report \n {model_test_class_report}')
    print(f'- Confusion Matrix \n {model_test_conf_matrix}')
    print(f'- ROC Curve \n {model_test_roc_cur}')
    print('='*35)
    print('\n')

  report = pd.DataFrame(list(zip(models_list, accuracy_list, precision_list, recall_list, f1_list, roc_list)), columns=['Model Name', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC Score'])

  return report

In [3]:
model_report = evaluate_models(x_train, x_test, y_train, y_test, models)

Random Forest
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
- ROC AUC Score: 1.0000
- Classification Report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     12552
           1       1.00      1.00      1.00     12438

    accuracy                           1.00     24990
   macro avg       1.00      1.00      1.00     24990
weighted avg       1.00      1.00      1.00     24990

- Confusion Matrix 
 [[12552     0]
 [    0 12438]]
- ROC Curve 
 (array([0., 0., 1.]), array([0., 1., 1.]), array([inf,  1.,  0.]))
----------------------------------

Model performance for Test set
- Accuracy: 0.8180
- Precision: 0.8373
- Recall: 0.7975
- F1 Score: 0.8169
- ROC AUC Score: 0.8184
- Classification Report 
               precision    recall  f1-score   support

           0       0.80      0.84      0.82      3067
           1       0.84      0.80      0.82      3181

    accura

In [4]:
model_report.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model Name,Accuracy,Precision,Recall,F1 Score,ROC AUC Score
0,Random Forest,0.818022,0.837294,0.797548,0.816938,0.818402
5,CatBoost,0.815141,0.804753,0.840931,0.822444,0.814662
6,XGBoost,0.806818,0.801835,0.824269,0.812897,0.806494
4,K-Nearest Neighbors,0.769046,0.817849,0.702924,0.756044,0.770275
2,Gradient Boosting,0.765685,0.774194,0.762025,0.768061,0.765753
1,Decision Tree,0.746159,0.757341,0.737818,0.747452,0.746314
7,SVC,0.731594,0.728155,0.75448,0.741084,0.731169
8,AdaBoost,0.729353,0.744102,0.713926,0.728702,0.72964
3,Logistic Regression,0.643566,0.644633,0.668343,0.656274,0.643105


In [5]:
xgboost_params = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}

rf_params = {
    "max_depth": [10, 12, None, 15, 20],
    "max_features": ['sqrt', 'log2', None],
    "n_estimators": [10, 50, 100, 200]
}

knn_params = {
    "algorithm": ['auto', 'ball_tree', 'kd_tree','brute'],
    "weights": ['uniform', 'distance'],
    "n_neighbors": [3, 4, 5, 7, 9],
}

catboost_params = {
    'depth': [6, 8, 10],
    'learning_rate': [0.1, 0.01, 0.001],
    'iterations': [100, 200, 500]
}

In [6]:
randomcv_models = [
    ('XGBoost', XGBClassifier(), xgboost_params),
    ("RF", RandomForestClassifier(), rf_params),
    ("KNN", KNeighborsClassifier(), knn_params),
    ("CatBoost", CatBoostClassifier(verbose=False), catboost_params)
]

In [7]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(x_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 12 candidates, totalling 36 fits




Fitting 3 folds for each of 60 candidates, totalling 180 fits




Fitting 3 folds for each of 40 candidates, totalling 120 fits




Fitting 3 folds for each of 27 candidates, totalling 81 fits
---------------- Best Params for XGBoost -------------------
{'min_child_weight': 1, 'max_depth': 7}
---------------- Best Params for RF -------------------
{'n_estimators': 200, 'max_features': 'log2', 'max_depth': None}
---------------- Best Params for KNN -------------------
{'weights': 'distance', 'n_neighbors': 4, 'algorithm': 'auto'}
---------------- Best Params for CatBoost -------------------
{'learning_rate': 0.1, 'iterations': 500, 'depth': 10}


In [8]:
model_param

{'XGBoost': {'min_child_weight': 1, 'max_depth': 7},
 'RF': {'n_estimators': 200, 'max_features': 'log2', 'max_depth': None},
 'KNN': {'weights': 'distance', 'n_neighbors': 4, 'algorithm': 'auto'},
 'CatBoost': {'learning_rate': 0.1, 'iterations': 500, 'depth': 10}}

In [9]:
from sklearn.metrics import roc_auc_score,roc_curve
best_models = {
    "Random Forest Classifier": RandomForestClassifier(**model_param['RF']),
    "KNeighborsClassifier": KNeighborsClassifier(**model_param['KNN']),
    "XGBClassifier": XGBClassifier(**model_param['XGBoost'],n_jobs=-1),
    "CatBoostClassifier": CatBoostClassifier(**model_param['CatBoost'],verbose=False)
}
tuned_report =evaluate_models(x_train, x_test, y_train, y_test, best_models)

Random Forest Classifier
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
- ROC AUC Score: 1.0000
- Classification Report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     12552
           1       1.00      1.00      1.00     12438

    accuracy                           1.00     24990
   macro avg       1.00      1.00      1.00     24990
weighted avg       1.00      1.00      1.00     24990

- Confusion Matrix 
 [[12552     0]
 [    0 12438]]
- ROC Curve 
 (array([0., 0., 1.]), array([0., 1., 1.]), array([inf,  1.,  0.]))
----------------------------------

Model performance for Test set
- Accuracy: 0.8161
- Precision: 0.8329
- Recall: 0.7991
- F1 Score: 0.8157
- ROC AUC Score: 0.8164
- Classification Report 
               precision    recall  f1-score   support

           0       0.80      0.83      0.82      3067
           1       0.83      0.80      0.82      3181


In [10]:
tuned_report.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model Name,Accuracy,Precision,Recall,F1 Score,ROC AUC Score
3,CatBoostClassifier,0.820262,0.818576,0.831185,0.824832,0.820059
1,KNeighborsClassifier,0.819622,0.888133,0.738761,0.80659,0.821125
0,Random Forest Classifier,0.816101,0.832896,0.79912,0.815659,0.816417
2,XGBClassifier,0.807778,0.804615,0.822069,0.813248,0.807513
