In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import xgboost as xgb
import pickle

# Load the dataset

df = pd.read_csv('EasyVisa.csv')

# Define target variable as 1 for 'Certified' and 0 otherwise

y = (df['case_status'] == 'Certified').astype(int)

# Define features (dropping case_status and case_id)
X = df.drop(['case_status', 'case_id'], axis=1)

# Split categorical and numerical features
categorical_features = ['continent', 'education_of_employee', 'has_job_experience',
                        'requires_job_training', 'region_of_employment',
                        'unit_of_wage', 'full_time_position']
numerical_features = ['no_of_employees', 'yr_of_estab', 'prevailing_wage']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create preprocessing pipelines
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply preprocessing
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

print("Transformed training data shape: " + str(X_train_transformed.shape))
print("Transformed test data shape: " + str(X_test_transformed.shape))

# Define evaluation function
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    try:
        y_pred_proba = model.predict_proba(X_test)[:,1]
    except:
        try:
            y_pred_proba = model.decision_function(X_test)
        except:
            y_pred_proba = y_pred

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print('Model: ' + model_name)
    print('Accuracy: ' + str(round(accuracy,4)))
    print('Precision: ' + str(round(precision,4)))
    print('Recall: ' + str(round(recall,4)))
    print('F1 Score: ' + str(round(f1,4)))
    print('ROC AUC: ' + str(round(roc_auc,4)))
    print('-------------------------------------')

    return {
        'model': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    }

# List to store model metrics
model_metrics = []

# 1. Logistic Regression
print('\
Training Logistic Regression...')
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(X_train_transformed, y_train)
metrics_log = evaluate_model(log_model, X_test_transformed, y_test, 'Logistic Regression')
model_metrics.append(metrics_log)

# 2. Random Forest
print('\
Training Random Forest...')
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_transformed, y_train)
metrics_rf = evaluate_model(rf_model, X_test_transformed, y_test, 'Random Forest')
model_metrics.append(metrics_rf)

# 3. Gradient Boosting
print('\
Training Gradient Boosting...')
gb_model = GradientBoostingClassifier(n_estimators=200, random_state=42)
gb_model.fit(X_train_transformed, y_train)
metrics_gb = evaluate_model(gb_model, X_test_transformed, y_test, 'Gradient Boosting')
model_metrics.append(metrics_gb)

# 4. SVM
print('\
Training SVM...')
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train_transformed, y_train)
metrics_svm = evaluate_model(svm_model, X_test_transformed, y_test, 'SVM')
model_metrics.append(metrics_svm)

# 5. XGBoost
print('\
Training XGBoost...')
xgb_model = xgb.XGBClassifier(tree_method='gpu_hist', predictor='gpu_predictor', use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_transformed, y_train)
metrics_xgb = evaluate_model(xgb_model, X_test_transformed, y_test, 'XGBoost')
model_metrics.append(metrics_xgb)

# Compare results
metrics_df = pd.DataFrame(model_metrics)
print('\
Summary of model performances:')
print(metrics_df)

# Find the best model based on accuracy
best_model_idx = metrics_df['accuracy'].idxmax()
best_model_name = metrics_df.loc[best_model_idx, 'model']
best_accuracy = metrics_df.loc[best_model_idx, 'accuracy']
print('\
Best model based on accuracy: ' + best_model_name + ' with accuracy ' + str(round(best_accuracy,4)))

# Save the best model and preprocessor
if best_model_name == 'Logistic Regression':
    best_model = log_model
elif best_model_name == 'Random Forest':
    best_model = rf_model
elif best_model_name == 'Gradient Boosting':
    best_model = gb_model
elif best_model_name == 'SVM':
    best_model = svm_model
else:
    best_model = xgb_model

model_name = best_model_name.replace(' ', '_')+"_best_model.pkl"

with open(model_name, 'wb') as f:
  pickle.dump(best_model, f)
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

print('\
Best model (' + best_model_name + ') saved as ' + model_name)
print('Preprocessor saved as preprocessor.pkl')

# For tree-based models, show feature importances if available
if best_model_name in ['Random Forest', 'Gradient Boosting', 'XGBoost']:
    # Get feature names from preprocessor
    cat_features = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features)
    all_features = np.concatenate([numerical_features, cat_features])

    # Get feature importances
    importances = best_model.feature_importances_

    # Create a DataFrame for feature importances
    feature_importances = pd.DataFrame({
        'Feature': all_features,
        'Importance': importances
    })



Transformed training data shape: (20384, 28)
Transformed test data shape: (5096, 28)
Training Logistic Regression...
Model: Logistic Regression
Accuracy: 0.7433
Precision: 0.7647
Recall: 0.8889
F1 Score: 0.8221
ROC AUC: 0.7798
-------------------------------------
Training Random Forest...
Model: Random Forest
Accuracy: 0.7265
Precision: 0.7732
Recall: 0.835
F1 Score: 0.8029
ROC AUC: 0.762
-------------------------------------
Training Gradient Boosting...
Model: Gradient Boosting
Accuracy: 0.7551
Precision: 0.7803
Recall: 0.8812
F1 Score: 0.8277
ROC AUC: 0.7913
-------------------------------------
Training SVM...
Model: SVM
Accuracy: 0.7571
Precision: 0.7813
Recall: 0.8833
F1 Score: 0.8291
ROC AUC: 0.7655
-------------------------------------
Training XGBoost...



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.



Model: XGBoost
Accuracy: 0.742
Precision: 0.7753
Recall: 0.8636
F1 Score: 0.8171
ROC AUC: 0.7803
-------------------------------------
Summary of model performances:
                 model  accuracy  precision    recall        f1   roc_auc
0  Logistic Regression  0.743328   0.764736  0.888856  0.822138  0.779775
1        Random Forest  0.726452   0.773210  0.835049  0.802940  0.762050
2    Gradient Boosting  0.755102   0.780266  0.881211  0.827672  0.791329
3                  SVM  0.757064   0.781274  0.883270  0.829147  0.765482
4              XGBoost  0.741954   0.775343  0.863570  0.817082  0.780282
Best model based on accuracy: SVM with accuracy 0.7571
Best model (SVM) saved as SVM_best_model.pkl
Preprocessor saved as preprocessor.pkl



    E.g. tree_method = "hist", device = "cuda"

