In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import LabelEncoder

file_path = 'WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(file_path)

label_encoder = LabelEncoder()
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = label_encoder.fit_transform(df[column])

# Apply Min-Max scaling to features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(df.drop('Attrition', axis=1))
y = df['Attrition']

# Apply SMOTE-ENN oversampling to the entire dataset
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_scaled, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Use RFE with RandomForestClassifier for feature selection
rf_classifier = RandomForestClassifier()
rfe_selector = RFECV(estimator=rf_classifier, step=1, cv=5)  # You can adjust cv and other parameters
X_train_rfe = rfe_selector.fit_transform(X_train, y_train)

# Train a RandomForestClassifier on the selected features
rf_classifier.fit(X_train_rfe, y_train)

# Get feature rankings and support from the trained RFECV
feature_rankings = rfe_selector.ranking_
feature_support = rfe_selector.support_

# Create a DataFrame to display feature names, rankings, and support
feature_rfe_df = pd.DataFrame({
    'Feature': df.drop('Attrition', axis=1).columns,
    'RFE Ranking': feature_rankings,
    'RFE Support': feature_support
})

# Sort features by ranking (ascending order)
sorted_rfe_df = feature_rfe_df.sort_values(by='RFE Ranking')

# Display the features with rankings and support
print(sorted_rfe_df)

# Select the top-k features based on ranking and support
k = 30  # You can choose the number of top features
selected_rfe_df = sorted_rfe_df[sorted_rfe_df['RFE Ranking'] <= k]
selected_features = selected_rfe_df['Feature'].index


                     Feature  RFE Ranking  RFE Support
0                        Age            1         True
31        YearsInCurrentRole            1         True
30            YearsAtCompany            1         True
29           WorkLifeBalance            1         True
28     TrainingTimesLastYear            1         True
27         TotalWorkingYears            1         True
26          StockOptionLevel            1         True
24  RelationshipSatisfaction            1         True
22         PercentSalaryHike            1         True
21                  OverTime            1         True
19        NumCompaniesWorked            1         True
18               MonthlyRate            1         True
17             MonthlyIncome            1         True
32   YearsSinceLastPromotion            1         True
15           JobSatisfaction            1         True
16             MaritalStatus            1         True
13                  JobLevel            1         True
1         

In [6]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]

# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"MCC: {mcc}")



Model: Random Forest
Random Forest Accuracy: 0.9646739130434783
Precision: 0.9702127659574468
Recall: 0.9743589743589743
MCC: 0.9236088744836418

Model: AdaBoost
AdaBoost Accuracy: 0.9157608695652174
Precision: 0.9109311740890689
Recall: 0.9615384615384616
MCC: 0.816720132312914

Model: CatBoost
CatBoost Accuracy: 0.9836956521739131
Precision: 0.9789915966386554
Recall: 0.9957264957264957
MCC: 0.9648346627804185
