In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import reliefF
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
# Load the dataset
file_path = 'WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(file_path)
# Handle categorical variables if needed (encode labels)
label_encoder = LabelEncoder()
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = label_encoder.fit_transform(df[column])
# Assume 'Attrition' is the target variable, and you want to predict it based on other features
X = df.drop('Attrition', axis=1)
y = df['Attrition']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate information gain for each feature
info_gain_selector = SelectKBest(score_func=reliefF, k='all')
info_gain_selector.fit(X_train, y_train)

# Get feature scores and indices
feature_scores = info_gain_selector.scores_
feature_indices = info_gain_selector.get_support(indices=True)

# Create a DataFrame to display feature names and their scores
feature_scores_df = pd.DataFrame({
    'Feature': X.columns,
    'Information Gain Score': feature_scores
})

# Sort features by information gain (descending order)
sorted_features_df = feature_scores_df.sort_values(by='Information Gain Score', ascending=False)

# Display the top-k features with scores
k = 30  # You can choose the number of top features
selected_features_df = sorted_features_df.head(k)
print(selected_features_df)

# Select the top-k features based on information gain
selected_features = sorted_features_df.head(k)['Feature'].tolist()

# Filter the dataset to include only the selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Train a Random Forest classifier using the selected features
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_selected, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test_selected)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy with Information Gain Feature Selection: {accuracy}")

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print metrics and confusion matrix
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"MCC: {mcc}")
print(f"AUC: {roc_auc}")
print("\nConfusion Matrix:")
print(conf_matrix)


ImportError: cannot import name 'reliefF' from 'sklearn.feature_selection' (C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_selection\__init__.py)

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

file_path = 'WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(file_path)

label_encoder = LabelEncoder()
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = label_encoder.fit_transform(df[column])

X = df.drop('Attrition', axis=1)
y = df['Attrition']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply Min-Max scaling to features
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE oversampling to the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Calculate information gain for each feature
info_gain_selector = SelectKBest(score_func=mutual_info_classif, k='all')
info_gain_selector.fit(X_train_resampled, y_train_resampled)

# Get feature scores and indices
feature_scores = info_gain_selector.scores_
feature_indices = info_gain_selector.get_support(indices=True)

# Create a DataFrame to display feature names and their scores
feature_scores_df = pd.DataFrame({
    'Feature': X.columns,
    'Information Gain Score': feature_scores
})

# Sort features by information gain (descending order)
sorted_features_df = feature_scores_df.sort_values(by='Information Gain Score', ascending=False)

# Display the top-k features with scores
k = 30  # You can choose the number of top features
selected_features_df = sorted_features_df.head(k)
print(selected_features_df)

# Select the top-k features based on information gain
selected_features = sorted_features_df.head(k)['Feature'].index

# Filter the dataset to include only the selected features
X_train_selected = X_train_resampled[:, selected_features]
X_test_selected = X_test_scaled[:, selected_features]

# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train_resampled)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train_resampled)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train_resampled)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train_resampled)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train_resampled)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train_resampled)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"MCC: {mcc}")


                     Feature  Information Gain Score
27         TotalWorkingYears                0.357522
4           DistanceFromHome                0.347504
0                        Age                0.339160
30            YearsAtCompany                0.335072
22         PercentSalaryHike                0.317540
33      YearsWithCurrManager                0.302047
31        YearsInCurrentRole                0.292292
11                HourlyRate                0.291171
19        NumCompaniesWorked                0.272384
28     TrainingTimesLastYear                0.246651
6             EducationField                0.246511
14                   JobRole                0.238231
9    EnvironmentSatisfaction                0.234667
32   YearsSinceLastPromotion                0.230036
5                  Education                0.226961
13                  JobLevel                0.220388
12            JobInvolvement                0.212297
24  RelationshipSatisfaction                0.

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

file_path = 'WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(file_path)

label_encoder = LabelEncoder()
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = label_encoder.fit_transform(df[column])

# Apply Min-Max scaling to features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(df.drop('Attrition', axis=1))
y = df['Attrition']

# Apply SMOTE oversampling to the entire dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Calculate information gain for each feature
info_gain_selector = SelectKBest(score_func=mutual_info_classif, k='all')
info_gain_selector.fit(X_train, y_train)

# Get feature scores and indices
feature_scores = info_gain_selector.scores_
feature_indices = info_gain_selector.get_support(indices=True)

# Create a DataFrame to display feature names and their scores
feature_scores_df = pd.DataFrame({
    'Feature': df.drop('Attrition', axis=1).columns,
    'Information Gain Score': feature_scores
})

# Sort features by information gain (descending order)
sorted_features_df = feature_scores_df.sort_values(by='Information Gain Score', ascending=False)

# Display the top-k features with scores
k = 30  # You can choose the number of top features
selected_features_df = sorted_features_df.head(k)
print(selected_features_df)

# Select the top-k features based on information gain
selected_features = sorted_features_df.head(k)['Feature'].index

# Filter the dataset to include only the selected features
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]

# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"MCC: {mcc}")


                     Feature  Information Gain Score
27         TotalWorkingYears                0.363907
4           DistanceFromHome                0.351502
30            YearsAtCompany                0.344413
0                        Age                0.341836
33      YearsWithCurrManager                0.317628
22         PercentSalaryHike                0.313778
11                HourlyRate                0.305346
31        YearsInCurrentRole                0.286843
19        NumCompaniesWorked                0.276335
32   YearsSinceLastPromotion                0.248169
5                  Education                0.241150
6             EducationField                0.237545
14                   JobRole                0.234822
28     TrainingTimesLastYear                0.229660
15           JobSatisfaction                0.225040
29           WorkLifeBalance                0.219446
24  RelationshipSatisfaction                0.204054
9    EnvironmentSatisfaction                0.

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import LabelEncoder

file_path = 'WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(file_path)

label_encoder = LabelEncoder()
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = label_encoder.fit_transform(df[column])

# Apply Min-Max scaling to features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(df.drop('Attrition', axis=1))
y = df['Attrition']

# Apply SMOTE-ENN oversampling to the entire dataset
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_scaled, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Calculate information gain for each feature
info_gain_selector = SelectKBest(score_func=mutual_info_classif, k='all')
info_gain_selector.fit(X_train, y_train)

# Get feature scores and indices
feature_scores = info_gain_selector.scores_
feature_indices = info_gain_selector.get_support(indices=True)

# Create a DataFrame to display feature names and their scores
feature_scores_df = pd.DataFrame({
    'Feature': df.drop('Attrition', axis=1).columns,
    'Information Gain Score': feature_scores
})

# Sort features by information gain (descending order)
sorted_features_df = feature_scores_df.sort_values(by='Information Gain Score', ascending=False)

# Display the top-k features with scores
k = 30  # You can choose the number of top features
selected_features_df = sorted_features_df.head(k)
print(selected_features_df)

# Select the top-k features based on information gain
selected_features = sorted_features_df.head(k)['Feature'].index

                     Feature  Information Gain Score
30            YearsAtCompany                0.300239
0                        Age                0.291174
27         TotalWorkingYears                0.286231
4           DistanceFromHome                0.278307
33      YearsWithCurrManager                0.276562
31        YearsInCurrentRole                0.251302
11                HourlyRate                0.239467
14                   JobRole                0.235874
22         PercentSalaryHike                0.234834
19        NumCompaniesWorked                0.230882
15           JobSatisfaction                0.197337
28     TrainingTimesLastYear                0.193484
9    EnvironmentSatisfaction                0.189542
32   YearsSinceLastPromotion                0.184983
13                  JobLevel                0.182612
6             EducationField                0.180033
26          StockOptionLevel                0.177734
5                  Education                0.

In [10]:
# Filter the dataset to include only the selected features
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]

# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"MCC: {mcc}")



Model: Random Forest
Random Forest Accuracy: 0.9728260869565217
Precision: 0.9745762711864406
Recall: 0.9829059829059829
MCC: 0.9411965398254964

Model: AdaBoost
AdaBoost Accuracy: 0.9266304347826086
Precision: 0.9224489795918367
Recall: 0.9658119658119658
MCC: 0.840548868965454

Model: CatBoost
CatBoost Accuracy: 0.9782608695652174
Precision: 0.9747899159663865
Recall: 0.9914529914529915
MCC: 0.9530198366156643
