In [1]:
# Imports

import numpy as np

import pandas as pd

from sklearn.impute import SimpleImputer

import warnings
 
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.compose import ColumnTransformer

from sklearn import metrics

from sklearn.pipeline import Pipeline

import joblib
import warnings
# Model
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve 
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
pip install catboost


Note: you may need to restart the kernel to use updated packages.


In [3]:
# Apply setting
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns",None)

In [4]:
#Load data
data = pd.read_csv("clean_data.csv")
#data backup
df = data.copy()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25480 entries, 0 to 25479
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   continent              25480 non-null  object 
 1   education_of_employee  25480 non-null  object 
 2   has_job_experience     25480 non-null  object 
 3   requires_job_training  25480 non-null  object 
 4   no_of_employees        25480 non-null  int64  
 5   yr_of_estab            25480 non-null  int64  
 6   region_of_employment   25480 non-null  object 
 7   prevailing_wage        25480 non-null  float64
 8   unit_of_wage           25480 non-null  object 
 9   full_time_position     25480 non-null  object 
 10  case_status            25480 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 2.1+ MB


In [5]:
# Encode target column
df["case_status"] = df["case_status"].map({'Certified':1, 'Denied':0})
print(df["case_status"].value_counts())

case_status
1    17018
0     8462
Name: count, dtype: int64


In [6]:
df.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,0
1,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,1
2,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,0
3,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,0
4,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,1


In [7]:
# Sepearate target and feature
# Separate target and feature
X = df.drop(columns= ["case_status"], axis =1)
y = df["case_status"]

In [8]:

num_columns = X.select_dtypes(include= "number").columns.to_list()
cat_columns = X.select_dtypes(include= "object").columns.to_list()

print("\n THE NUMERIC COLUMNS ARE: \n", num_columns)
print("\n THE CATEGORICAL COLUMNS ARE: \n", cat_columns)


 THE NUMERIC COLUMNS ARE: 
 ['no_of_employees', 'yr_of_estab', 'prevailing_wage']

 THE CATEGORICAL COLUMNS ARE: 
 ['continent', 'education_of_employee', 'has_job_experience', 'requires_job_training', 'region_of_employment', 'unit_of_wage', 'full_time_position']


In [9]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [10]:
# Processing data
num_transformer = Pipeline(steps=[
                            ("imputer", SimpleImputer(strategy="median")),
                           ("scaler", StandardScaler())
])

cat_transformer = Pipeline(steps=[
                            ("imputer", SimpleImputer(strategy= "most_frequent")),
                           ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])


In [11]:

# Combine steps
preprocessor = ColumnTransformer(transformers = [("num",num_transformer,num_columns),
                                               ("cat",cat_transformer,cat_columns)])


In [12]:
# Models
model_dict = { "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
     "K-Neighbors Classifier": KNeighborsClassifier(),
      "XGBClassifier": XGBClassifier(), 
     "CatBoosting Classifier": CatBoostClassifier(verbose=False),
     "Support Vector Classifier": SVC(),
    "AdaBoost Classifier": AdaBoostClassifier()
}

In [13]:

# Function to filter hyperparameter
def filter_hyperparameters(model,space):
    valid_key = model.get_params().keys()
    return {k:v for k,v in space.items() if k in valid_key}

In [14]:
# Hyperparameter
# Generic hyperparameter
search_space = {
   "C": [0.1,1,10],
   "kernel": ["linear", "rgf"],
    "n_estimators": [50,100,200],
    "max_depth": [None,5,10],
    "learning_rate" : [0.5,1]

    }

In [15]:
# Grid Search for each model
result = []

best_pipeline = {}
 
for name, model in model_dict.items():

    print(f'Tuning {model}...')

    pipe = Pipeline(steps = [

        ('processor', preprocessor),

        ('model', model)

    ])
    

hyperparameter = filter_hyperparameters(model, search_space)


Tuning RandomForestClassifier()...
Tuning DecisionTreeClassifier()...
Tuning GradientBoostingClassifier()...
Tuning LogisticRegression()...
Tuning KNeighborsClassifier()...
Tuning XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, ...)...
Tuning <catboost.core.CatBoostClassifier object at 0x000

In [16]:
# Prefix model name

param_grid = {f'model__{k}':v for k,v in hyperparameter.items()}

grid = GridSearchCV(estimator = pipe, param_grid = param_grid, cv = 5, scoring = 'accuracy', n_jobs = -1)

grid.fit(X_train, y_train)
 
y_pred = grid.predict(X_test)

report = metrics.classification_report(y_test, y_pred, output_dict = True)
 
result.append({

        'model_name': name,

        'best_parameters': param_grid,

        'accuracy': round(metrics.accuracy_score(y_test, y_pred), 4),

        'f1-score': round(report['weighted avg']['f1-score'], 4)

})
 
best_pipeline[name] = grid.best_estimator_

print(result)

[{'model_name': 'AdaBoost Classifier', 'best_parameters': {'model__n_estimators': [50, 100, 200], 'model__learning_rate': [0.5, 1]}, 'accuracy': 0.7429, 'f1-score': 0.7253}]


In [17]:
# Compare results
result_df =pd.DataFrame(result)
print(result_df)
sorted_result_df= result_df.sort_values(by= "accuracy", ascending = False)
print("\nModel comparision:\n", sorted_result_df)

            model_name                                    best_parameters  \
0  AdaBoost Classifier  {'model__n_estimators': [50, 100, 200], 'model...   

   accuracy  f1-score  
0    0.7429    0.7253  

Model comparision:
             model_name                                    best_parameters  \
0  AdaBoost Classifier  {'model__n_estimators': [50, 100, 200], 'model...   

   accuracy  f1-score  
0    0.7429    0.7253  


In [19]:
# Best Model
best_row = sorted_result_df.iloc[0]
best_model = best_row["model_name"]
print("\nBest Model:",best_model)
print("\nBest hyperparameters: \n", best_row["best_parameters"])


Best Model: AdaBoost Classifier

Best hyperparameters: 
 {'model__n_estimators': [50, 100, 200], 'model__learning_rate': [0.5, 1]}


In [20]:
# Retraining best model on full data set
final_pipeline= best_pipeline[best_model]
final_pipeline.fit(X,y)

In [21]:
# Save pipeline
joblib.dump(final_pipeline, "visa_pipeline.pkl")
print("The deployment model is saved as: visa_pipeline.pkl")

The deployment model is saved as: visa_pipeline.pkl
