# Import libraries

In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTENC
from collections import Counter
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Normalize with Z-score

In [4]:
def Normalize_data(df):
  columns_to_normalize = [col for col in df.columns if df[col].nunique() > 10]
  # Standardization (Z-score normalization)
  scaler = StandardScaler()
  df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])
  return df

# Upsampling with SMOTE-NC

In [5]:
def Apply_SMOTE_NC(df):
  # Separate features and target
  X = df.drop('Attrition', axis=1)
  y = df['Attrition']

  # Identify the indices of categorical features
  categorical_features = [col for col in X.columns if X[col].nunique() < 10]

  # Apply SMOTENC to the training data
  smote_nc = SMOTENC(categorical_features=categorical_features, random_state=42)
  X_res_smote_nc, y_res_smote_nc = smote_nc.fit_resample(X, y)

  # Check the class distribution after SMOTENC
  print(f"Resampled class distribution (SMOTENC): {Counter(y_res_smote_nc)}")
  df = pd.concat([X_res_smote_nc, y_res_smote_nc], axis=1)
  return df

# Base Models & Meta Model for Ensemble Learning

In [19]:
# Define base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(random_state=42)),
    ('svc', SVC(probability=True, random_state=42))
]

# Define meta-model
meta_model = LogisticRegression(random_state=42)

# Load Dataset

In [20]:
small_train_df = pd.read_csv('/content/small_train_df_encoded_processed.csv')
small_test_df = pd.read_csv('/content/small_test_df_encoded_processed.csv')

In [21]:
small_train_df = Apply_SMOTE_NC(small_train_df)

Resampled class distribution (SMOTENC): Counter({0.0: 1033, 1.0: 1033})


In [22]:
small_train_df = Normalize_data(small_train_df)
small_test_df = Normalize_data(small_test_df)

# Train-Test Split

In [23]:
X_train = small_train_df.drop('Attrition', axis=1)
y_train = small_train_df['Attrition']
X_test = small_test_df.drop('Attrition', axis=1)
y_test = small_test_df['Attrition']

# Using Ensemble Learning (Stacking)

In [24]:
# Initialize and train the StackingClassifier

stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model)
stacking_clf.fit(X_train, y_train)

# Evaluation & Confusion Metrix

In [25]:
# Make predictions

y_pred = stacking_clf.predict(X_test)

In [26]:
y_pred_proba = stacking_clf.predict_proba(X_test)[:, 1]  # Probability of positive class

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Stacking Classifier Accuracy: {accuracy:.2f}")
print(f"Stacking Classifier Precision: {precision:.2f}")
print(f"Stacking Classifier Recall: {recall:.2f}")
print(f"Stacking Classifier F1-score: {f1:.2f}")
print(f"Stacking Classifier ROC AUC: {roc_auc:.2f}")
print(f"Stacking Classifier Confusion Matrix:\n{conf_matrix}")

Stacking Classifier Accuracy: 0.87
Stacking Classifier Precision: 0.46
Stacking Classifier Recall: 0.83
Stacking Classifier F1-score: 0.60
Stacking Classifier ROC AUC: 0.92
Stacking Classifier Confusion Matrix:
[[386  58]
 [ 10  50]]
