In [4]:
# ==============================================================================
# Autor(es): Juan Felipe Agudelo Rios                                         |
# Titulo: Past Models                                                         |
# Fecha creación: 23/12/2023                                                  |
# Fecha última modificación: 24/12/2023                                       |
# ==============================================================================

In [5]:
import pandas as pd 
import os 
import regex as re
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.metrics import make_scorer, fbeta_score

# Project root. Defaults to the author's local checkout; set the CASOFI_DIR
# environment variable to run the notebook elsewhere without editing the code.
original_directory = os.environ.get("CASOFI_DIR", "/Users/j.agudelo/Desktop/CasoFi/")

# Make the relative path below resolve against the project root.
os.chdir(original_directory)

# Load the workbook that the modelling cells read their features and target from.
# NOTE(review): the preview shows `Unnamed: 0.1` / `Unnamed: 0` columns, which look
# like row indexes saved by earlier exports -- confirm and drop them upstream.
data=pd.read_excel("Output/DataFrames/CleanData.xlsx")

data

Unnamed: 0.1,Unnamed: 0,clientnum,attrition_flag,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,...,naive_bayes_classifier_attrition_flag_card_category_contacts_count_12_mon_dependent_count_education_level_months_inactive_12_mon_1,education_level_num,income_category_num,card_category_num,marital_status_num,marital_status_divorced,marital_status_married,marital_status_single,attrition_num,male
0,0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,...,0.000093,1,2,0,1,0,1,0,0,1
1,1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,...,0.000057,2,0,0,2,0,0,1,0,0
2,2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,...,0.000021,2,3,0,1,0,1,0,0,1
3,3,769911858,Existing Customer,40,F,4,High School,Single,Less than $40K,Blue,...,0.000134,1,0,0,2,0,0,1,0,0
4,4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,...,0.000022,0,2,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,10122,772366833,Existing Customer,50,M,2,Graduate,Single,$40K - $60K,Blue,...,0.000191,2,1,0,2,0,0,1,0,1
10123,10123,710638233,Attrited Customer,41,M,2,High School,Divorced,$40K - $60K,Blue,...,0.995270,1,1,0,0,1,0,0,1,1
10124,10124,716506083,Attrited Customer,44,F,1,High School,Married,Less than $40K,Blue,...,0.997880,1,0,0,1,0,1,0,1,0
10125,10125,717406983,Attrited Customer,30,M,2,Graduate,Married,$40K - $60K,Blue,...,0.996710,2,1,0,1,0,1,0,1,1


In [2]:



# Target column: in the data preview it is 1 for "Attrited Customer" rows
# and 0 for "Existing Customer" rows.
Dependent_var="attrition_num"

# Predictor columns fed to every model.
selected_feature_names=['customer_age',
                        'dependent_count',
                        'total_relationship_count',
                        'months_inactive_12_mon',
                        'contacts_count_12_mon',
                        'total_amt_chng_q4_q1',
                        'total_trans_ct',
                        'total_ct_chng_q4_q1',
                        'avg_utilization_ratio',
                        'education_level_num',
                        'income_category_num',
                        'card_category_num',
                        'male',
                        'marital_status_married',
                        'marital_status_single',
                        'marital_status_divorced',
                        "months_on_book",
                        "credit_limit",
                        "total_revolving_bal",
                        "avg_open_to_buy","total_trans_amt"]


# F-beta with beta=2 (recall weighted above precision), wrapped as a scorer
# so it can be passed as the `scoring` argument of a hyper-parameter search.
f2_scorer = make_scorer(fbeta_score, beta=2)

X = data[selected_feature_names]
y = data[Dependent_var]

# Hold out 20% of the rows as a test set, stratified so that both splits keep
# the class proportions of `y`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

# Oversample the training split only, so the test set stays untouched.
# The sampler is deliberately NOT called `os`: that name belongs to the `os`
# module imported at the top of the notebook, and rebinding it would break any
# later `os.chdir(...)` / `os.path...` call in the same kernel.
# NOTE(review): the resampled data is later handed to cross-validated searches, so
# synthetic points can sit in a different fold than the real rows they were built
# from; wrapping SMOTE and the model in an imblearn Pipeline would avoid that.
smote = SMOTE(random_state=0)
OVERSAMPLE_X, OVERSAMPLE_y = smote.fit_resample(X_train, y_train)

# Candidate base models, keyed by the name used to look up their search space
# in `param_grids` below.
classifiers = [
    ('KNN', KNeighborsClassifier()),
    # The weak learner is passed positionally on purpose: the keyword was
    # `base_estimator` before scikit-learn 1.2 and is `estimator` afterwards
    # (the old keyword was removed in 1.4); the first positional slot works on both.
    # NOTE(review): an unconstrained DecisionTreeClassifier can fit the training
    # data perfectly, in which case boosting may stop after the first round --
    # consider limiting `max_depth` on the weak learner.
    ('BoostedDecisionTree', AdaBoostClassifier(DecisionTreeClassifier(), random_state=42)),
    ('RandomForest', RandomForestClassifier(random_state=42)),
    ('XGBoost', XGBClassifier())
]

# Not referenced by `param_grids`; kept in case other cells still read it.
param_grid_naive_bayes = {}


# 4 * 2 * 4 = 32 combinations.
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}


# 4 * 4 = 16 combinations (AdaBoost-level parameters only).
param_grid_boosting = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2]
}

param_grid_random_forest = {
    'n_estimators': [50, 100, 150, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}


param_grid_xgboost = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Search space per model name; keys must match the names in `classifiers`.
param_grids = {
    'KNN': param_grid_knn,
    'BoostedDecisionTree': param_grid_boosting,
    'RandomForest': param_grid_random_forest,
    'XGBoost': param_grid_xgboost
}

# Settings shared by every hyper-parameter search: n_iter=50 sampled
# configurations, 5-fold cross-validation, candidates ranked by the F2 scorer.
search_settings = dict(
    n_iter=50,
    scoring=f2_scorer,
    cv=5,
    random_state=42,
    n_jobs=-1,
)

# Wrap each base model in a randomized search over its own parameter grid.
tuned_classifiers = []
for name, base_model in classifiers:
    param_grid = param_grids[name]
    random_search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_grid,
        **search_settings,
    )
    tuned_classifiers.append((name, random_search))

# Majority-vote ('hard') ensemble whose members are the searches themselves,
# so fitting the ensemble runs every search on the oversampled training data.
ensemble_classifier = VotingClassifier(estimators=tuned_classifiers, voting='hard')
ensemble_classifier.fit(OVERSAMPLE_X, OVERSAMPLE_y)

# Score the ensemble on the untouched hold-out split.
y_pred = ensemble_classifier.predict(X_test)
print(classification_report(y_test, y_pred))



# Metrics per model name; "Voting" holds the ensemble's hold-out scores.
results = {}

# At this point `y_pred` still holds the ensemble predictions on X_test.
fbeta = fbeta_score(y_test, y_pred, beta=2)


results["Voting"] = {'F2 Score': fbeta,
        'Classification Report': classification_report(y_test, y_pred, output_dict=True)}


# Fitting the VotingClassifier already ran every randomized search on clones of
# the estimators it was given, so the fitted searches are available through
# `named_estimators_`. Reuse them instead of re-running each search from scratch
# (same data, same seeds), which would only repeat the most expensive step.
# Rebinding `tuned_classifiers` keeps it pointing at fitted searches afterwards.
tuned_classifiers = [
    (name, ensemble_classifier.named_estimators_[name])
    for name, _ in tuned_classifiers
]

for name, model in tuned_classifiers:
    # Ensemble members are trained on label-encoded targets; map their
    # predictions back to the original labels before scoring.
    y_pred = ensemble_classifier.le_.inverse_transform(model.predict(X_test))
    fbeta = fbeta_score(y_test, y_pred, beta=2)
    
    # Store results in the dictionary
    results[name] = {
        'F2 Score': fbeta,
        'Classification Report': classification_report(y_test, y_pred, output_dict=True)
    }

    # Print results
    print(f"{name}")
    print(f"F2 Score: {fbeta:.4f}\n")
    print(classification_report(y_test, y_pred))





              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1701
           1       0.91      0.84      0.88       325

    accuracy                           0.96      2026
   macro avg       0.94      0.91      0.93      2026
weighted avg       0.96      0.96      0.96      2026





KNN
F2 Score: 0.7151

              precision    recall  f1-score   support

           0       0.96      0.86      0.90      1701
           1       0.52      0.79      0.63       325

    accuracy                           0.85      2026
   macro avg       0.74      0.82      0.76      2026
weighted avg       0.89      0.85      0.86      2026





BoostedDecisionTree
F2 Score: 0.7959

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      1701
           1       0.76      0.81      0.78       325

    accuracy                           0.93      2026
   macro avg       0.86      0.88      0.87      2026
weighted avg       0.93      0.93      0.93      2026

RandomForest
F2 Score: 0.8702

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1701
           1       0.87      0.87      0.87       325

    accuracy                           0.96      2026
   macro avg       0.92      0.92      0.92      2026
weighted avg       0.96      0.96      0.96      2026

XGBoost
F2 Score: 0.8827

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1701
           1       0.89      0.88      0.89       325

    accuracy                           0.96      2026
   macro avg       0.94      0.93

{'Voting': {'F2 Score': 0.8562499999999998,
  'Classification Report': {'0': {'precision': 0.97045191193511,
    'recall': 0.9847148736037625,
    'f1-score': 0.977531368543916,
    'support': 1701},
   '1': {'precision': 0.9133333333333333,
    'recall': 0.8430769230769231,
    'f1-score': 0.8768,
    'support': 325},
   'accuracy': 0.9619940769990128,
   'macro avg': {'precision': 0.9418926226342217,
    'recall': 0.9138958983403428,
    'f1-score': 0.927165684271958,
    'support': 2026},
   'weighted avg': {'precision': 0.9612892574210046,
    'recall': 0.9619940769990128,
    'f1-score': 0.9613725853372167,
    'support': 2026}}},
 'KNN': {'F2 Score': 0.7150806900389537,
  'Classification Report': {'0': {'precision': 0.9555264879005886,
    'recall': 0.8589065255731922,
    'f1-score': 0.9046439628482972,
    'support': 1701},
   '1': {'precision': 0.5171026156941649,
    'recall': 0.7907692307692308,
    'f1-score': 0.6253041362530413,
    'support': 325},
   'accuracy': 0.847976

In [3]:
# Display the collected metrics: one entry per model name (e.g. "Voting", "KNN"),
# each holding the F2 score and the classification report as a nested dict.
results

{'Voting': {'F2 Score': 0.8562499999999998,
  'Classification Report': {'0': {'precision': 0.97045191193511,
    'recall': 0.9847148736037625,
    'f1-score': 0.977531368543916,
    'support': 1701},
   '1': {'precision': 0.9133333333333333,
    'recall': 0.8430769230769231,
    'f1-score': 0.8768,
    'support': 325},
   'accuracy': 0.9619940769990128,
   'macro avg': {'precision': 0.9418926226342217,
    'recall': 0.9138958983403428,
    'f1-score': 0.927165684271958,
    'support': 2026},
   'weighted avg': {'precision': 0.9612892574210046,
    'recall': 0.9619940769990128,
    'f1-score': 0.9613725853372167,
    'support': 2026}}},
 'KNN': {'F2 Score': 0.7150806900389537,
  'Classification Report': {'0': {'precision': 0.9555264879005886,
    'recall': 0.8589065255731922,
    'f1-score': 0.9046439628482972,
    'support': 1701},
   '1': {'precision': 0.5171026156941649,
    'recall': 0.7907692307692308,
    'f1-score': 0.6253041362530413,
    'support': 325},
   'accuracy': 0.847976