In [None]:
import numpy as np

def normalize(data):
    norm_data = data / np.sqrt(np.sum(data**2, axis=0))
    return norm_data

def weighted_normalized_decision_matrix(data, weights):
    norm_data = normalize(data)
    weighted_data = norm_data * weights
    return weighted_data

def median_ideal_and_nadir(weighted_data):
    ideal_best = np.median(weighted_data, axis=0)
    ideal_worst = np.median(weighted_data, axis=0)
    return ideal_best, ideal_worst

def robust_distance(ideal_best, ideal_worst, weighted_data):
    # Manhattan Distance
    manhattan_to_best = np.sum(np.abs(weighted_data - ideal_best), axis=1)
    manhattan_to_worst = np.sum(np.abs(weighted_data - ideal_worst), axis=1)

    # Euclidean Distance
    euclidean_to_best = np.sqrt(np.sum((weighted_data - ideal_best)**2, axis=1))
    euclidean_to_worst = np.sqrt(np.sum((weighted_data - ideal_worst)**2, axis=1))

    # Chebyshev Distance
    chebyshev_to_best = np.max(np.abs(weighted_data - ideal_best), axis=1)
    chebyshev_to_worst = np.max(np.abs(weighted_data - ideal_worst), axis=1)

    # Combined robust distance calculation
    dist_to_best = np.sqrt((np.log(1 + manhattan_to_best))**2 +
                           (np.log(1 + euclidean_to_best))**2 +
                           (np.log(1 + chebyshev_to_best))**2)

    dist_to_worst = np.sqrt((np.log(1 + manhattan_to_worst))**2 +
                            (np.log(1 + euclidean_to_worst))**2 +
                            (np.log(1 + chebyshev_to_worst))**2)

    return dist_to_best, dist_to_worst

def topsis(data, weights):
    weighted_data = weighted_normalized_decision_matrix(data, weights)
    ideal_best, ideal_worst = median_ideal_and_nadir(weighted_data)
    dist_to_best, dist_to_worst = robust_distance(ideal_best, ideal_worst, weighted_data)

    # Calculate TOPSIS score
    topsis_score = 0.5 * (dist_to_worst / (dist_to_best + dist_to_worst)) + 0.5 * (1 / (1 + dist_to_best))

    return topsis_score

data = np.array([
    [ 0.258827, 302.967663, 1],
    [ 0.214318, 283.510830, 1],
    [ 0.264581, 338.326726, 1],
    [ 0.289202, 322.475929, 1],
    [ 0.299575, 364.942572, 1],
    [ 0.302290, 411.138744, 1],
    [ 0.317003, 417.127252, 1],
    [ 0.311528, 471.553455, 1],
    [ 0.180598, 6.611628, 1],
    [ 0.363767, 613.847322, 1],
    [ 0.028950, 25.355219, 1],
    [ 0.034966, 14.601259, 1],
    [0.026474, 8.855874, 1],
    [ 0.034622, 0.427003, 1],
    [ 0.011118, 10.985586, 1],
    [ 0.000978, 0.431122, 1]
])

# Weight matrix
weights = np.array([0.33, 0.33, 0.33])  # Example weights

# Perform TOPSIS ranking
topsis_ranks = topsis(data[:, :-1], weights[:-1])

# Sort the features based on the TOPSIS ranking
sorted_indices = np.argsort(topsis_ranks)[::-1]  # Reverse the order to get highest rank first

# Feature names
feature_names = [
    'work is meaningful', 'good relationship with peers', 'family supports', 'satisfied with career and opportunity',
    'mentally well and do not have anxiety', 'satisfied with work-life balance', 'satisfied compensation', 'satisfied with job profession',
    'working hour', 'satisfaction with workload', 'Age', 'monthly average expenditure', 'Gender', 'Job position', 'Work tenure', 'Education']
# Print the sorted features with names and TOPSIS ranks
features_list_sorted = []
print("Sorted Features:")
for rank, (index, name) in enumerate(zip(sorted_indices, [feature_names[i] for i in sorted_indices]), start=1):
    features_list_sorted.append(name)
    print(f"{rank}. {name}: {topsis_ranks[index]:.6f}")

Sorted Features:
1. good relationship with peers: 0.742164
2. work is meaningful: 0.742164
3. family supports: 0.735460
4. satisfied with career and opportunity: 0.731301
5. mentally well and do not have anxiety: 0.723457
6. satisfied with work-life balance: 0.715387
7. satisfied compensation: 0.711982
8. satisfied with job profession: 0.703404
9. working hour: 0.687354
10. satisfaction with workload: 0.674431
11. Age: 0.671531
12. monthly average expenditure: 0.671277
13. Gender: 0.669441
14. Job position: 0.669370
15. Work tenure: 0.667057
16. Education: 0.664353


In [None]:
!pip install python-docx

from docx import Document

# Create a Document object
doc = Document()

# Title of the document
doc.add_heading('Sorted Features Based on TOPSIS Ranking', 0)

# Create a table
table = doc.add_table(rows=1, cols=3)

# Add headers
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'Rank'
hdr_cells[1].text = 'Feature'
hdr_cells[2].text = 'TOPSIS Score'

# Data for the table
data = [
    (1, "Good relationship with peers", 0.742164),
    (2, "Work is meaningful", 0.742164),
    (3, "Family supports", 0.735460),
    (4, "Satisfied with career and opportunity", 0.731301),
    (5, "Mentally well and do not have anxiety", 0.723457),
    (6, "Satisfied with work-life balance", 0.715387),
    (7, "Satisfied compensation", 0.711982),
    (8, "Satisfied with job profession", 0.703404),
    (9, "Working hour", 0.687354),
    (10, "Satisfaction with workload", 0.674431),
    (11, "Age", 0.671531),
    (12, "Monthly average expenditure", 0.671277),
    (13, "Gender", 0.669441),
    (14, "Job position", 0.669370),
    (15, "Work tenure", 0.667057),
    (16, "Education", 0.664353)
]

# Add data rows to the table
for rank, feature, score in data:
    row_cells = table.add_row().cells
    row_cells[0].text = str(rank)
    row_cells[1].text = feature
    row_cells[2].text = str(score)

# Save the document
doc.save("TOPSIS_Features_Table.docx")


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m194.6/244.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [None]:
len(features_list_sorted)

16

In [None]:
from google.colab import files

# Upload the CSV file
uploaded = files.upload()

Saving feature slection new.csv to feature slection new.csv


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import LabelEncoder

file_path = 'feature slection new.csv'
df = pd.read_csv(file_path)

label_encoder = LabelEncoder()
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = label_encoder.fit_transform(df[column])

# Apply Min-Max scaling to features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(df.drop('TOI (turnover intention)', axis=1))
y = df['TOI (turnover intention)']

# Apply SMOTE-ENN oversampling to the entire dataset
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_scaled, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Use RFE with RandomForestClassifier for feature selection
rf_classifier = RandomForestClassifier()
rfe_selector = RFECV(estimator=rf_classifier, step=1, cv=5, scoring='accuracy')  # You can adjust cv and other parameters
X_train_rfe = rfe_selector.fit_transform(X_train, y_train)

# Train a RandomForestClassifier on the selected features
rf_classifier.fit(X_train_rfe, y_train)

# Get feature rankings, support, and scores from the trained RFECV
feature_rankings = rfe_selector.ranking_
feature_support = rfe_selector.support_
feature_scores = rfe_selector.cv_results_['mean_test_score']

# Create a DataFrame to display feature names, rankings, support, and scores
features = pd.DataFrame({'Feature': features_list_sorted})

# Select the top-k features based on ranking and support
k = 16  # You can choose the number of top features
# selected = sorted_rfe_df[sorted_rfe_df['RFE Ranking'] <= k]
# selected_features = selected['Feature'].index


In [None]:
sorted_features = [

'good relationship with peers', 'work is meaningful', 'family supports', 'satisfied with career and opportunity',
     'mentally well and do not have anxiety', 'satisfied with work-life balance', 'satisfied compensation', 'satisfied with job profession',
     'working hour', 'satisfaction with workload', 'Age', 'monthly average expenditure', 'Gender', 'Job position', 'Work tenure', 'Education']
# Select the highest 15 features
selected_feature_set = sorted_features[:15]


print(selected_feature_set)

['good relationship with peers', 'work is meaningful', 'family supports', 'satisfied with career and opportunity', 'mentally well and do not have anxiety', 'satisfied with work-life balance', 'satisfied compensation', 'satisfied with job profession', 'working hour', 'satisfaction with workload', 'Age', 'monthly average expenditure', 'Gender', 'Job position', 'Work tenure']


In [None]:
selected_feature_set

['good relationship with peers',
 'work is meaningful',
 'family supports',
 'satisfied with career and opportunity',
 'mentally well and do not have anxiety',
 'satisfied with work-life balance',
 'satisfied compensation',
 'satisfied with job profession',
 'working hour',
 'satisfaction with workload',
 'Age',
 'monthly average expenditure',
 'Gender',
 'Job position',
 'Work tenure']

In [None]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")


Model: Random Forest
Random Forest Accuracy: 0.9565217391304348
Precision: 1.0
Recall: 0.9183673469387755
F1 Score: 0.9574468085106383
MCC: 0.9166288364409283
ROC AUC: 0.9591836734693877
Confusion Matrix:
[[43  0]
 [ 4 45]]

Model: AdaBoost
AdaBoost Accuracy: 0.9565217391304348
Precision: 1.0
Recall: 0.9183673469387755
F1 Score: 0.9574468085106383
MCC: 0.9166288364409283
ROC AUC: 0.9591836734693877
Confusion Matrix:
[[43  0]
 [ 4 45]]

Model: CatBoost
CatBoost Accuracy: 0.9565217391304348
Precision: 1.0
Recall: 0.9183673469387755
F1 Score: 0.9574468085106383
MCC: 0.9166288364409283
ROC AUC: 0.9591836734693877
Confusion Matrix:
[[43  0]
 [ 4 45]]


In [None]:
# Sorted Features List
sorted_features = [

'good relationship with peers', 'work is meaningful', 'family supports', 'satisfied with career and opportunity',
     'mentally well and do not have anxiety', 'satisfied with work-life balance', 'satisfied compensation', 'satisfied with job profession',
     'working hour', 'satisfaction with workload', 'Age', 'monthly average expenditure', 'Gender', 'Job position', 'Work tenure', 'Education']

# Select the highest 10 features
selected_feature_set = sorted_features[:10]


print(selected_feature_set)

['good relationship with peers', 'work is meaningful', 'family supports', 'satisfied with career and opportunity', 'mentally well and do not have anxiety', 'satisfied with work-life balance', 'satisfied compensation', 'satisfied with job profession', 'working hour', 'satisfaction with workload']


In [None]:
selected_feature_set

['good relationship with peers',
 'work is meaningful',
 'family supports',
 'satisfied with career and opportunity',
 'mentally well and do not have anxiety',
 'satisfied with work-life balance',
 'satisfied compensation',
 'satisfied with job profession',
 'working hour',
 'satisfaction with workload']

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")


Model: Random Forest
Random Forest Accuracy: 0.9782608695652174
Precision: 1.0
Recall: 0.9591836734693877
F1 Score: 0.9791666666666666
MCC: 0.9573678958383028
ROC AUC: 0.9795918367346939
Confusion Matrix:
[[43  0]
 [ 2 47]]

Model: AdaBoost
AdaBoost Accuracy: 0.967391304347826
Precision: 0.9791666666666666
Recall: 0.9591836734693877
F1 Score: 0.9690721649484536
MCC: 0.9348193338876291
ROC AUC: 0.9679639297579496
Confusion Matrix:
[[42  1]
 [ 2 47]]

Model: CatBoost
CatBoost Accuracy: 0.967391304347826
Precision: 1.0
Recall: 0.9387755102040817
F1 Score: 0.968421052631579
MCC: 0.9367769320431428
ROC AUC: 0.9693877551020409
Confusion Matrix:
[[43  0]
 [ 3 46]]


In [None]:
# Sorted Features List
sorted_features = [

'good relationship with peers', 'work is meaningful', 'family supports', 'satisfied with career and opportunity',
     'mentally well and do not have anxiety', 'satisfied with work-life balance', 'satisfied compensation', 'satisfied with job profession',
     'working hour', 'satisfaction with workload', 'Age', 'monthly average expenditure', 'Gender', 'Job position', 'Work tenure', 'Education']

# Select the highest 5 features
selected_feature_set = sorted_features[:5]


print(selected_feature_set)

['good relationship with peers', 'work is meaningful', 'family supports', 'satisfied with career and opportunity', 'mentally well and do not have anxiety']


In [None]:
selected_feature_set

['good relationship with peers',
 'work is meaningful',
 'family supports',
 'satisfied with career and opportunity',
 'mentally well and do not have anxiety']

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")



Model: Random Forest
Random Forest Accuracy: 0.6847826086956522
Precision: 0.6923076923076923
Recall: 0.7346938775510204
F1 Score: 0.7128712871287128
MCC: 0.3649466851746568
ROC AUC: 0.6813004271476032
Confusion Matrix:
[[27 16]
 [13 36]]

Model: AdaBoost
AdaBoost Accuracy: 0.6739130434782609
Precision: 0.6792452830188679
Recall: 0.7346938775510204
F1 Score: 0.7058823529411764
MCC: 0.34261276175986216
ROC AUC: 0.6696725201708591
Confusion Matrix:
[[26 17]
 [13 36]]

Model: CatBoost
CatBoost Accuracy: 0.6956521739130435
Precision: 0.7058823529411765
Recall: 0.7346938775510204
F1 Score: 0.7200000000000001
MCC: 0.3873301119611609
ROC AUC: 0.6929283341243475
Confusion Matrix:
[[28 15]
 [13 36]]


In [None]:
#select k best features
# Sorted Features List
sorted_features = [
   'satisfaction with workload', 'satisfied with job profession', 'satisfied compensation', 'satisfied with work-life balance',
   'mentally well and do not have anxiety', 'family supports', 'satisfied with career and opportunity', 'work is meaningful',
   'good relationship with peers', 'Age', 'monthly average expenditure', 'Work tenure', 'Gender', 'working hour', 'Education', 'Job position'
]

# Select the highest 15 features
selected_feature_set = sorted_features[:15]


print(selected_feature_set)

['satisfaction with workload', 'satisfied with job profession', 'satisfied compensation', 'satisfied with work-life balance', 'mentally well and do not have anxiety', 'family supports', 'satisfied with career and opportunity', 'work is meaningful', 'good relationship with peers', 'Age', 'monthly average expenditure', 'Work tenure', 'Gender', 'working hour', 'Education']


In [None]:
selected_feature_set

['satisfaction with workload',
 'satisfied with job profession',
 'satisfied compensation',
 'satisfied with work-life balance',
 'mentally well and do not have anxiety',
 'family supports',
 'satisfied with career and opportunity',
 'work is meaningful',
 'good relationship with peers',
 'Age',
 'monthly average expenditure',
 'Work tenure',
 'Gender',
 'working hour',
 'Education']

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")


Model: Random Forest
Random Forest Accuracy: 0.9565217391304348
Precision: 1.0
Recall: 0.9183673469387755
F1 Score: 0.9574468085106383
MCC: 0.9166288364409283
ROC AUC: 0.9591836734693877
Confusion Matrix:
[[43  0]
 [ 4 45]]

Model: AdaBoost
AdaBoost Accuracy: 0.967391304347826
Precision: 1.0
Recall: 0.9387755102040817
F1 Score: 0.968421052631579
MCC: 0.9367769320431428
ROC AUC: 0.9693877551020409
Confusion Matrix:
[[43  0]
 [ 3 46]]

Model: CatBoost
CatBoost Accuracy: 0.9565217391304348
Precision: 1.0
Recall: 0.9183673469387755
F1 Score: 0.9574468085106383
MCC: 0.9166288364409283
ROC AUC: 0.9591836734693877
Confusion Matrix:
[[43  0]
 [ 4 45]]


In [None]:
#select k best features
# Sorted Features List
sorted_features = [
   'satisfaction with workload', 'satisfied with job profession', 'satisfied compensation', 'satisfied with work-life balance',
   'mentally well and do not have anxiety', 'family supports', 'satisfied with career and opportunity', 'work is meaningful',
   'good relationship with peers', 'Age', 'monthly average expenditure', 'Work tenure', 'Gender', 'working hour', 'Education', 'Job position'
]

# Select the highest 10 features
selected_feature_set = sorted_features[:10]


print(selected_feature_set)

['satisfaction with workload', 'satisfied with job profession', 'satisfied compensation', 'satisfied with work-life balance', 'mentally well and do not have anxiety', 'family supports', 'satisfied with career and opportunity', 'work is meaningful', 'good relationship with peers', 'Age']


In [None]:
selected_feature_set

['satisfaction with workload',
 'satisfied with job profession',
 'satisfied compensation',
 'satisfied with work-life balance',
 'mentally well and do not have anxiety',
 'family supports',
 'satisfied with career and opportunity',
 'work is meaningful',
 'good relationship with peers',
 'Age']

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")


Model: Random Forest
Random Forest Accuracy: 0.9782608695652174
Precision: 1.0
Recall: 0.9591836734693877
F1 Score: 0.9791666666666666
MCC: 0.9573678958383028
ROC AUC: 0.9795918367346939
Confusion Matrix:
[[43  0]
 [ 2 47]]

Model: AdaBoost
AdaBoost Accuracy: 0.967391304347826
Precision: 1.0
Recall: 0.9387755102040817
F1 Score: 0.968421052631579
MCC: 0.9367769320431428
ROC AUC: 0.9693877551020409
Confusion Matrix:
[[43  0]
 [ 3 46]]

Model: CatBoost
CatBoost Accuracy: 0.9565217391304348
Precision: 1.0
Recall: 0.9183673469387755
F1 Score: 0.9574468085106383
MCC: 0.9166288364409283
ROC AUC: 0.9591836734693877
Confusion Matrix:
[[43  0]
 [ 4 45]]


In [None]:
#select k best features
# Sorted Features List
sorted_features = [
   'satisfaction with workload', 'satisfied with job profession', 'satisfied compensation', 'satisfied with work-life balance',
   'mentally well and do not have anxiety', 'family supports', 'satisfied with career and opportunity', 'work is meaningful',
   'good relationship with peers', 'Age', 'monthly average expenditure', 'Work tenure', 'Gender', 'working hour', 'Education', 'Job position'
]

# Select the highest 5 features
selected_feature_set = sorted_features[:5]


print(selected_feature_set)

['satisfaction with workload', 'satisfied with job profession', 'satisfied compensation', 'satisfied with work-life balance', 'mentally well and do not have anxiety']


In [None]:
selected_feature_set

['satisfaction with workload',
 'satisfied with job profession',
 'satisfied compensation',
 'satisfied with work-life balance',
 'mentally well and do not have anxiety']

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")


Model: Random Forest
Random Forest Accuracy: 0.9239130434782609
Precision: 1.0
Recall: 0.8571428571428571
F1 Score: 0.923076923076923
MCC: 0.8585702400752412
ROC AUC: 0.9285714285714286
Confusion Matrix:
[[43  0]
 [ 7 42]]

Model: AdaBoost
AdaBoost Accuracy: 0.9565217391304348
Precision: 0.9787234042553191
Recall: 0.9387755102040817
F1 Score: 0.9583333333333333
MCC: 0.9137865764829719
ROC AUC: 0.9577598481252966
Confusion Matrix:
[[42  1]
 [ 3 46]]

Model: CatBoost
CatBoost Accuracy: 0.967391304347826
Precision: 1.0
Recall: 0.9387755102040817
F1 Score: 0.968421052631579
MCC: 0.9367769320431428
ROC AUC: 0.9693877551020409
Confusion Matrix:
[[43  0]
 [ 3 46]]


In [None]:
# Sorted Features List
#information gain
sorted_features = [
   'satisfaction with workload', 'satisfied compensation', 'satisfied with work-life balance',
   'satisfied with job profession', 'mentally well and do not have anxiety', 'satisfied with career and opportunity',
   'family supports', 'work is meaningful', 'good relationship with peers', 'working hour', 'monthly average expenditure',
   'Job position', 'Age', 'Gender', 'Work tenure', 'Education'
]

# Select the highest 15 features
selected_feature_set = sorted_features[:15]


print(selected_feature_set)

['satisfaction with workload', 'satisfied compensation', 'satisfied with work-life balance', 'satisfied with job profession', 'mentally well and do not have anxiety', 'satisfied with career and opportunity', 'family supports', 'work is meaningful', 'good relationship with peers', 'working hour', 'monthly average expenditure', 'Job position', 'Age', 'Gender', 'Work tenure']


In [None]:
selected_feature_set

['satisfaction with workload',
 'satisfied compensation',
 'satisfied with work-life balance',
 'satisfied with job profession',
 'mentally well and do not have anxiety',
 'satisfied with career and opportunity',
 'family supports',
 'work is meaningful',
 'good relationship with peers',
 'working hour',
 'monthly average expenditure',
 'Job position',
 'Age',
 'Gender',
 'Work tenure']

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")


Model: Random Forest
Random Forest Accuracy: 0.9456521739130435
Precision: 1.0
Recall: 0.8979591836734694
F1 Score: 0.9462365591397849
MCC: 0.896895628658922
ROC AUC: 0.9489795918367347
Confusion Matrix:
[[43  0]
 [ 5 44]]

Model: AdaBoost
AdaBoost Accuracy: 0.9565217391304348
Precision: 1.0
Recall: 0.9183673469387755
F1 Score: 0.9574468085106383
MCC: 0.9166288364409283
ROC AUC: 0.9591836734693877
Confusion Matrix:
[[43  0]
 [ 4 45]]

Model: CatBoost
CatBoost Accuracy: 0.967391304347826
Precision: 1.0
Recall: 0.9387755102040817
F1 Score: 0.968421052631579
MCC: 0.9367769320431428
ROC AUC: 0.9693877551020409
Confusion Matrix:
[[43  0]
 [ 3 46]]


In [None]:
# Sorted Features List
#information gain
sorted_features = [
   'satisfaction with workload', 'satisfied compensation', 'satisfied with work-life balance',
   'satisfied with job profession', 'mentally well and do not have anxiety', 'satisfied with career and opportunity',
   'family supports', 'work is meaningful', 'good relationship with peers', 'working hour', 'monthly average expenditure',
   'Job position', 'Age', 'Gender', 'Work tenure', 'Education'
]

# Select the highest 10 features
selected_feature_set = sorted_features[:10]


print(selected_feature_set)

['satisfaction with workload', 'satisfied compensation', 'satisfied with work-life balance', 'satisfied with job profession', 'mentally well and do not have anxiety', 'satisfied with career and opportunity', 'family supports', 'work is meaningful', 'good relationship with peers', 'working hour']


In [None]:
selected_feature_set

['satisfaction with workload',
 'satisfied compensation',
 'satisfied with work-life balance',
 'satisfied with job profession',
 'mentally well and do not have anxiety',
 'satisfied with career and opportunity',
 'family supports',
 'work is meaningful',
 'good relationship with peers',
 'working hour']

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")


Model: Random Forest
Random Forest Accuracy: 0.9565217391304348
Precision: 1.0
Recall: 0.9183673469387755
F1 Score: 0.9574468085106383
MCC: 0.9166288364409283
ROC AUC: 0.9591836734693877
Confusion Matrix:
[[43  0]
 [ 4 45]]

Model: AdaBoost
AdaBoost Accuracy: 0.967391304347826
Precision: 0.9791666666666666
Recall: 0.9591836734693877
F1 Score: 0.9690721649484536
MCC: 0.9348193338876291
ROC AUC: 0.9679639297579496
Confusion Matrix:
[[42  1]
 [ 2 47]]

Model: CatBoost
CatBoost Accuracy: 0.9565217391304348
Precision: 1.0
Recall: 0.9183673469387755
F1 Score: 0.9574468085106383
MCC: 0.9166288364409283
ROC AUC: 0.9591836734693877
Confusion Matrix:
[[43  0]
 [ 4 45]]


In [None]:
# Sorted Features List
#information gain
sorted_features = [
   'satisfaction with workload', 'satisfied compensation', 'satisfied with work-life balance',
   'satisfied with job profession', 'mentally well and do not have anxiety', 'satisfied with career and opportunity',
   'family supports', 'work is meaningful', 'good relationship with peers', 'working hour', 'monthly average expenditure',
   'Job position', 'Age', 'Gender', 'Work tenure', 'Education'
]

# Select the highest 5 features
selected_feature_set = sorted_features[:5]


print(selected_feature_set)

['satisfaction with workload', 'satisfied compensation', 'satisfied with work-life balance', 'satisfied with job profession', 'mentally well and do not have anxiety']


In [None]:
selected_feature_set

['satisfaction with workload',
 'satisfied compensation',
 'satisfied with work-life balance',
 'satisfied with job profession',
 'mentally well and do not have anxiety']

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")


Model: Random Forest
Random Forest Accuracy: 0.9565217391304348
Precision: 1.0
Recall: 0.9183673469387755
F1 Score: 0.9574468085106383
MCC: 0.9166288364409283
ROC AUC: 0.9591836734693877
Confusion Matrix:
[[43  0]
 [ 4 45]]

Model: AdaBoost
AdaBoost Accuracy: 0.9565217391304348
Precision: 0.9787234042553191
Recall: 0.9387755102040817
F1 Score: 0.9583333333333333
MCC: 0.9137865764829719
ROC AUC: 0.9577598481252966
Confusion Matrix:
[[42  1]
 [ 3 46]]

Model: CatBoost
CatBoost Accuracy: 0.9565217391304348
Precision: 1.0
Recall: 0.9183673469387755
F1 Score: 0.9574468085106383
MCC: 0.9166288364409283
ROC AUC: 0.9591836734693877
Confusion Matrix:
[[43  0]
 [ 4 45]]


In [None]:
# Sorted Features List
sorted_features = [
    "Over18", "PercentSalaryHike", "JobSatisfaction", "YearsSinceLastPromotion",
    "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked", "OverTime",
    "RelationshipSatisfaction", "EmployeeCount", "StockOptionLevel",
    "TotalWorkingYears", "TrainingTimesLastYear", "WorkLifeBalance",
    "YearsAtCompany", "YearsInCurrentRole", "MaritalStatus", "JobLevel",
    "BusinessTravel", "DailyRate", "Department", "DistanceFromHome",
    "Education", "EducationField", "JobRole"
]

# Select the highest 10 features
selected_feature_set = sorted_features[:10]


print(selected_feature_set)

['Over18', 'PercentSalaryHike', 'JobSatisfaction', 'YearsSinceLastPromotion', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime', 'RelationshipSatisfaction', 'EmployeeCount']


In [None]:
selected_feature_set

['Over18',
 'PercentSalaryHike',
 'JobSatisfaction',
 'YearsSinceLastPromotion',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'OverTime',
 'RelationshipSatisfaction',
 'EmployeeCount']

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")



Model: Random Forest
Random Forest Accuracy: 0.9293478260869565
Precision: 0.9333333333333333
Recall: 0.9572649572649573
F1 Score: 0.9451476793248945
MCC: 0.846490721036133
ROC AUC: 0.9189309860951652
Confusion Matrix:
[[118  16]
 [ 10 224]]

Model: AdaBoost
AdaBoost Accuracy: 0.8831521739130435
Precision: 0.8835341365461847
Recall: 0.9401709401709402
F1 Score: 0.9109730848861283
MCC: 0.744522056709943
ROC AUC: 0.8618765148615896
Confusion Matrix:
[[105  29]
 [ 14 220]]

Model: CatBoost
CatBoost Accuracy: 0.904891304347826
Precision: 0.902834008097166
Recall: 0.9529914529914529
F1 Score: 0.9272349272349272
MCC: 0.7926778150030066
ROC AUC: 0.8869434876897563
Confusion Matrix:
[[110  24]
 [ 11 223]]


In [None]:
# Sorted Features List
sorted_features = [
    "Over18", "PercentSalaryHike", "JobSatisfaction", "YearsSinceLastPromotion",
    "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked", "OverTime",
    "RelationshipSatisfaction", "EmployeeCount", "StockOptionLevel",
    "TotalWorkingYears", "TrainingTimesLastYear", "WorkLifeBalance",
    "YearsAtCompany", "YearsInCurrentRole", "MaritalStatus", "JobLevel",
    "BusinessTravel", "DailyRate", "Department", "DistanceFromHome",
    "Education", "EducationField", "JobRole"
]

# Select the highest 5 features
selected_feature_set = sorted_features[:5]


print(selected_feature_set)

['Over18', 'PercentSalaryHike', 'JobSatisfaction', 'YearsSinceLastPromotion', 'MonthlyIncome']


In [None]:
selected_feature_set

['Over18',
 'PercentSalaryHike',
 'JobSatisfaction',
 'YearsSinceLastPromotion',
 'MonthlyIncome']

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")



Model: Random Forest
Random Forest Accuracy: 0.8695652173913043
Precision: 0.8842975206611571
Recall: 0.9145299145299145
F1 Score: 0.8991596638655462
MCC: 0.7155011998170971
ROC AUC: 0.8527873453246588
Confusion Matrix:
[[106  28]
 [ 20 214]]

Model: AdaBoost
AdaBoost Accuracy: 0.842391304347826
Precision: 0.8577235772357723
Recall: 0.9017094017094017
F1 Score: 0.8791666666666667
MCC: 0.6547001130683833
ROC AUC: 0.8202576859293277
Confusion Matrix:
[[ 99  35]
 [ 23 211]]

Model: CatBoost
CatBoost Accuracy: 0.8586956521739131
Precision: 0.864
Recall: 0.9230769230769231
F1 Score: 0.8925619834710743
MCC: 0.6900804430804596
ROC AUC: 0.8346727898966705
Confusion Matrix:
[[100  34]
 [ 18 216]]


In [None]:
import numpy as np

def normalize(data):
    norm_data = data / np.sqrt(np.sum(data**2, axis=0))
    return norm_data

def weighted_normalized_decision_matrix(data, weights):
    norm_data = normalize(data)
    weighted_data = norm_data * weights
    return weighted_data

def ideal_and_nadir(weighted_data):
    # Classical TOPSIS ideal best and worst (max and min of each column)
    ideal_best = np.max(weighted_data, axis=0)
    ideal_worst = np.min(weighted_data, axis=0)
    return ideal_best, ideal_worst

def distance_to_ideal_solutions(ideal_best, ideal_worst, weighted_data):
    # Euclidean distances to the ideal best and worst
    dist_to_best = np.sqrt(np.sum((weighted_data - ideal_best) ** 2, axis=1))
    dist_to_worst = np.sqrt(np.sum((weighted_data - ideal_worst) ** 2, axis=1))
    return dist_to_best, dist_to_worst

def topsis(data, weights):
    weighted_data = weighted_normalized_decision_matrix(data, weights)
    ideal_best, ideal_worst = ideal_and_nadir(weighted_data)
    dist_to_best, dist_to_worst = distance_to_ideal_solutions(ideal_best, ideal_worst, weighted_data)

    # Calculate TOPSIS score (Closeness to the ideal solution)
    topsis_score = dist_to_worst / (dist_to_best + dist_to_worst)

    return topsis_score

# Data matrix
data = np.array([
    [0.258827, 302.967663, 1],
    [0.214318, 283.510830, 1],
    [0.264581, 338.326726, 1],
    [0.289202, 322.475929, 1],
    [0.299575, 364.942572, 1],
    [0.302290, 411.138744, 1],
    [0.317003, 417.127252, 1],
    [0.311528, 471.553455, 1],
    [0.180598, 6.611628, 1],
    [0.363767, 613.847322, 1],
    [0.028950, 25.355219, 1],
    [0.034966, 14.601259, 1],
    [0.026474, 8.855874, 1],
    [0.034622, 0.427003, 1],
    [0.011118, 10.985586, 1],
    [0.000978, 0.431122, 1]
])

# Weight matrix (all weights are equal for this example)
weights = np.array([0.33, 0.33, 0.33])  # Example weights

# Perform TOPSIS ranking
topsis_ranks = topsis(data[:, :-1], weights[:-1])

# Sort the features based on the TOPSIS ranking
sorted_indices = np.argsort(topsis_ranks)[::-1]  # Reverse the order to get highest rank first

# Feature names
feature_names = [
    'work is meaningful', 'good relationship with peers', 'family supports', 'satisfied with career and opportunity',
    'mentally well and do not have anxiety', 'satisfied with work-life balance', 'satisfied compensation', 'satisfied with job profession',
    'working hour', 'satisfaction with workload', 'Age', 'monthly average expenditure', 'Gender', 'Job position', 'Work tenure', 'Education'
]

# Print the sorted features with names and TOPSIS ranks
features_list_sorted = []
print("Sorted Features:")
for rank, (index, name) in enumerate(zip(sorted_indices, [feature_names[i] for i in sorted_indices]), start=1):
    features_list_sorted.append(name)
    print(f"{rank}. {name}: {topsis_ranks[index]:.6f}")


Sorted Features:
1. satisfaction with workload: 1.000000
2. satisfied with job profession: 0.798612
3. satisfied compensation: 0.742247
4. satisfied with work-life balance: 0.724801
5. mentally well and do not have anxiety: 0.672868
6. satisfied with career and opportunity: 0.620338
7. family supports: 0.615219
8. work is meaningful: 0.573905
9. good relationship with peers: 0.510259
10. working hour: 0.269086
11. monthly average expenditure: 0.060319
12. Age: 0.057372
13. Job position: 0.056371
14. Gender: 0.044565
15. Work tenure: 0.021980
16. Education: 0.000005


In [None]:
from google.colab import files

# Upload the CSV file
uploaded = files.upload()

Saving feature slection new.csv to feature slection new (1).csv


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import LabelEncoder

file_path = 'feature slection new.csv'
df = pd.read_csv(file_path)

label_encoder = LabelEncoder()
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = label_encoder.fit_transform(df[column])

# Apply Min-Max scaling to features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(df.drop('TOI (turnover intention)', axis=1))
y = df['TOI (turnover intention)']

# Apply SMOTE-ENN oversampling to the entire dataset
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_scaled, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Use RFE with RandomForestClassifier for feature selection
rf_classifier = RandomForestClassifier()
rfe_selector = RFECV(estimator=rf_classifier, step=1, cv=5, scoring='accuracy')  # You can adjust cv and other parameters
X_train_rfe = rfe_selector.fit_transform(X_train, y_train)

# Train a RandomForestClassifier on the selected features
rf_classifier.fit(X_train_rfe, y_train)

# Get feature rankings, support, and scores from the trained RFECV
feature_rankings = rfe_selector.ranking_
feature_support = rfe_selector.support_
feature_scores = rfe_selector.cv_results_['mean_test_score']

# Create a DataFrame to display feature names, rankings, support, and scores
features = pd.DataFrame({'Feature': features_list_sorted})

# Select the top-k features based on ranking and support
k = 16  # You can choose the number of top features
# selected = sorted_rfe_df[sorted_rfe_df['RFE Ranking'] <= k]
# selected_features = selected['Feature'].index

In [None]:
sorted_features = [

'satisfaction with workload', 'satisfied with job profession', 'satisfied compensation', 'satisfied with work-life balance', 'mentally well and do not have anxiety', 'satisfied with career and opportunity', 'family supports', 'work is meaningful', 'good relationship with peers', 'working hour',
 'monthly average expenditure', 'Age', 'Job position', 'Gender', 'Work tenure', 'Education']
# Select the highest 15 features
selected_feature_set = sorted_features[:15]


print(selected_feature_set)

['satisfaction with workload', 'satisfied with job profession', 'satisfied compensation', 'satisfied with work-life balance', 'mentally well and do not have anxiety', 'satisfied with career and opportunity', 'family supports', 'work is meaningful', 'good relationship with peers', 'working hour', 'monthly average expenditure', 'Age', 'Job position', 'Gender', 'Work tenure']


In [None]:
selected_feature_set

['satisfaction with workload',
 'satisfied with job profession',
 'satisfied compensation',
 'satisfied with work-life balance',
 'mentally well and do not have anxiety',
 'satisfied with career and opportunity',
 'family supports',
 'work is meaningful',
 'good relationship with peers',
 'working hour',
 'monthly average expenditure',
 'Age',
 'Job position',
 'Gender',
 'Work tenure']

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")


Model: Random Forest
Random Forest Accuracy: 0.9565217391304348
Precision: 1.0
Recall: 0.9183673469387755
F1 Score: 0.9574468085106383
MCC: 0.9166288364409283
ROC AUC: 0.9591836734693877
Confusion Matrix:
[[43  0]
 [ 4 45]]

Model: AdaBoost
AdaBoost Accuracy: 0.9565217391304348
Precision: 1.0
Recall: 0.9183673469387755
F1 Score: 0.9574468085106383
MCC: 0.9166288364409283
ROC AUC: 0.9591836734693877
Confusion Matrix:
[[43  0]
 [ 4 45]]

Model: CatBoost
CatBoost Accuracy: 0.967391304347826
Precision: 1.0
Recall: 0.9387755102040817
F1 Score: 0.968421052631579
MCC: 0.9367769320431428
ROC AUC: 0.9693877551020409
Confusion Matrix:
[[43  0]
 [ 3 46]]


In [None]:
sorted_features = [

'satisfaction with workload', 'satisfied with job profession', 'satisfied compensation', 'satisfied with work-life balance', 'mentally well and do not have anxiety', 'satisfied with career and opportunity', 'family supports', 'work is meaningful', 'good relationship with peers', 'working hour',
 'monthly average expenditure', 'Age', 'Job position', 'Gender', 'Work tenure', 'Education']
# Select the highest 10 features
selected_feature_set = sorted_features[:10]


print(selected_feature_set)

['satisfaction with workload', 'satisfied with job profession', 'satisfied compensation', 'satisfied with work-life balance', 'mentally well and do not have anxiety', 'satisfied with career and opportunity', 'family supports', 'work is meaningful', 'good relationship with peers', 'working hour']


In [None]:
selected_feature_set

['satisfaction with workload',
 'satisfied with job profession',
 'satisfied compensation',
 'satisfied with work-life balance',
 'mentally well and do not have anxiety',
 'satisfied with career and opportunity',
 'family supports',
 'work is meaningful',
 'good relationship with peers',
 'working hour']

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")


Model: Random Forest
Random Forest Accuracy: 0.9782608695652174
Precision: 1.0
Recall: 0.9591836734693877
F1 Score: 0.9791666666666666
MCC: 0.9573678958383028
ROC AUC: 0.9795918367346939
Confusion Matrix:
[[43  0]
 [ 2 47]]

Model: AdaBoost
AdaBoost Accuracy: 0.967391304347826
Precision: 0.9791666666666666
Recall: 0.9591836734693877
F1 Score: 0.9690721649484536
MCC: 0.9348193338876291
ROC AUC: 0.9679639297579496
Confusion Matrix:
[[42  1]
 [ 2 47]]

Model: CatBoost
CatBoost Accuracy: 0.9456521739130435
Precision: 1.0
Recall: 0.8979591836734694
F1 Score: 0.9462365591397849
MCC: 0.896895628658922
ROC AUC: 0.9489795918367347
Confusion Matrix:
[[43  0]
 [ 5 44]]


In [None]:
sorted_features = [

'satisfaction with workload', 'satisfied with job profession', 'satisfied compensation', 'satisfied with work-life balance', 'mentally well and do not have anxiety', 'satisfied with career and opportunity', 'family supports', 'work is meaningful', 'good relationship with peers', 'working hour',
 'monthly average expenditure', 'Age', 'Job position', 'Gender', 'Work tenure', 'Education']
# Select the highest 5 features
selected_feature_set = sorted_features[:5]


print(selected_feature_set)

['satisfaction with workload', 'satisfied with job profession', 'satisfied compensation', 'satisfied with work-life balance', 'mentally well and do not have anxiety']


In [None]:
selected_feature_set

['satisfaction with workload',
 'satisfied with job profession',
 'satisfied compensation',
 'satisfied with work-life balance',
 'mentally well and do not have anxiety']

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier
# Filter the dataset to include only the selected features
# Assuming selected_features is a list of strings (column names) and X_train, X_test are DataFrames

# Assuming you know the column names of the original dataset
df_train = pd.DataFrame(X_train, columns=feature_names)
df_test = pd.DataFrame(X_test, columns=feature_names)

X_train_selected = df_train[selected_feature_set]
X_test_selected = df_test[selected_feature_set]


# Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_selected, y_train)
best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params_rf)
best_rf_classifier.fit(X_train_selected, y_train)
y_pred_rf = best_rf_classifier.predict(X_test_selected)

# AdaBoost
adaboost_classifier = AdaBoostClassifier(random_state=42)
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5, scoring='accuracy')
grid_search_adaboost.fit(X_train_selected, y_train)
best_params_adaboost = grid_search_adaboost.best_params_
best_adaboost_classifier = AdaBoostClassifier(random_state=42, **best_params_adaboost)
best_adaboost_classifier.fit(X_train_selected, y_train)
y_pred_adaboost = best_adaboost_classifier.predict(X_test_selected)

# CatBoost
catboost_classifier = CatBoostClassifier(random_state=42, silent=True)
param_grid_catboost = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'depth': [4, 6, 8]
}
grid_search_catboost = GridSearchCV(catboost_classifier, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train_selected, y_train)
best_params_catboost = grid_search_catboost.best_params_
best_catboost_classifier = CatBoostClassifier(random_state=42, **best_params_catboost, silent=True)
best_catboost_classifier.fit(X_train_selected, y_train)
y_pred_catboost = best_catboost_classifier.predict(X_test_selected)

# Evaluate models
models = {
    'Random Forest': (best_rf_classifier, y_pred_rf),
    'AdaBoost': (best_adaboost_classifier, y_pred_adaboost),
    'CatBoost': (best_catboost_classifier, y_pred_catboost)
}

for model_name, (model, y_pred) in models.items():
    print(f"\nModel: {model_name}")

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    # Calculate metrics for the model
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)  # F1 score added here
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    # Confusion matrix for the model
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics and confusion matrix for the model
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")  # Print F1 score
    print(f"MCC: {mcc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Confusion Matrix:\n{conf_matrix}")


Model: Random Forest
Random Forest Accuracy: 0.9239130434782609
Precision: 1.0
Recall: 0.8571428571428571
F1 Score: 0.923076923076923
MCC: 0.8585702400752412
ROC AUC: 0.9285714285714286
Confusion Matrix:
[[43  0]
 [ 7 42]]

Model: AdaBoost
AdaBoost Accuracy: 0.9565217391304348
Precision: 0.9787234042553191
Recall: 0.9387755102040817
F1 Score: 0.9583333333333333
MCC: 0.9137865764829719
ROC AUC: 0.9577598481252966
Confusion Matrix:
[[42  1]
 [ 3 46]]

Model: CatBoost
CatBoost Accuracy: 0.967391304347826
Precision: 1.0
Recall: 0.9387755102040817
F1 Score: 0.968421052631579
MCC: 0.9367769320431428
ROC AUC: 0.9693877551020409
Confusion Matrix:
[[43  0]
 [ 3 46]]
