In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pickle



In [35]:
# Load the preprocessed dataset, then strip any column whose name contains
# 'Unnamed' (presumably an index column written out by an earlier CSV export).
df = pd.read_csv('/Users/abhishekwaghchaure/Desktop/CreditCardAnalysis/Datasets/Preprocessed.csv')
unnamed_columns = list(filter(lambda col: 'Unnamed' in col, df.columns))
df = df.drop(columns=unnamed_columns) if unnamed_columns else df

In [36]:
# Preview the first rows to confirm the load and the column clean-up.
df.head()

Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Education_Level_Doctorate,Gender_M,Attrition_Flag
0,45,3,39,5,1,3,12691.0,777,1.335,1144,42,1.625,0.061,0.0,1.0,Existing Customer
1,49,5,44,6,1,2,8256.0,864,1.541,1291,33,3.714,0.105,0.0,0.0,Existing Customer
2,51,3,36,4,1,0,3418.0,0,2.594,1887,20,2.333,0.0,0.0,1.0,Existing Customer
3,40,4,34,3,4,1,3313.0,2517,1.405,1171,20,2.333,0.76,0.0,0.0,Existing Customer
4,40,3,21,5,1,0,4716.0,0,2.175,816,28,2.5,0.0,0.0,1.0,Existing Customer


In [37]:
# Turn the string target into integer codes. LabelEncoder numbers classes in
# sorted order; the fitted encoder is kept in `le` so codes can be mapped
# back to labels through le.classes_ / le.inverse_transform.
le = LabelEncoder()
le.fit(df['Attrition_Flag'])
df['Attrition_Flag'] = le.transform(df['Attrition_Flag'])

In [38]:
y = df["Attrition_Flag"]                 # target
X = df.drop(columns=["Attrition_Flag"])  # every remaining column is a feature

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# balance the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [39]:
# Standardize the features. The scaling statistics are learned from the
# training split only and then reused, unchanged, on the test split.
# After this cell X_train and X_test are NumPy arrays, not DataFrames.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [40]:
# This cell used to repeat the scaling cell above line for line. Running it
# re-standardized the already standardized arrays and rebound `scaler` to a
# transformer fitted on scaled data, which can no longer be applied to raw
# feature values. Scaling must happen exactly once, so this cell now only
# verifies the result instead of refitting.
X_train_means = np.asarray(X_train, dtype=float).mean(axis=0)
assert np.allclose(X_train_means, 0.0, atol=1e-6), \
    "X_train is not standardized; run the scaling cell first"
assert X_train.shape[1] == X_test.shape[1], \
    "train and test sets have different numbers of features"

In [41]:
# Inspect the standardized training matrix (a NumPy array after scaling).
X_train

array([[ 0.96087863,  0.50080961,  1.64154853, ..., -0.99798094,
        -0.21938646, -0.94907893],
       [ 0.58518711, -1.81266234,  1.13884339, ..., -0.13883846,
        -0.21938646,  1.05365314],
       [-0.16619592,  1.27196692, -0.87197715, ...,  0.41450754,
        -0.21938646, -0.94907893],
       ...,
       [ 0.45995661, -1.04150502,  0.00775684, ..., -0.56476927,
        -0.21938646, -0.94907893],
       [-0.41665693,  2.04312424,  0.00775684, ..., -0.33906234,
        -0.21938646,  1.05365314],
       [ 0.96087863, -0.27034771,  0.00775684, ..., -0.71038664,
        -0.21938646,  1.05365314]])

In [42]:
# Peek at the encoded training labels; the index still carries the row labels from df.
y_train.head()

1602    0
7791    1
7177    1
97      1
4820    1
Name: Attrition_Flag, dtype: int64

In [48]:
# Candidate classifiers to compare, keyed by the name used when reporting
# results. Estimators that take a seed get a fixed one; Logistic Regression
# gets max_iter=500 (scikit-learn's default is 100).
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=500)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
])

In [49]:
# Mean 5-fold cross-validated accuracy of each candidate on the training split.
cv_scores = {
    name: cross_val_score(estimator, X_train, y_train, cv=5, scoring='accuracy').mean()
    for name, estimator in models.items()
}

In [51]:
# Show the mean cross-validated accuracy recorded for each candidate model.
cv_scores

{'Logistic Regression': 0.9042103258924152,
 'Random Forest': 0.9602514832331817,
 'Decision Tree': 0.9384031347819134,
 'SVM': 0.9312430979200463}

In [50]:
# Select the candidate with the highest mean CV accuracy (the first one wins a tie).
best_model_name = max(cv_scores.items(), key=lambda entry: entry[1])[0]
print("Best Model based on CV Accuracy:", best_model_name)

Best Model based on CV Accuracy: Random Forest


In [56]:
## Hyperparameter Tuning
random_forest = RandomForestClassifier(random_state = 42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    }

In [58]:
# Perform hyperparameter tuning
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [59]:
# Predict the held-out labels with the best estimator found by the search.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

In [60]:
# Print evaluation metrics
print("Best Model Parameters:", grid_search.best_params_)
print("\nAccuracy on Test Set:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Best Model Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}

Accuracy on Test Set: 0.9619940769990128

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.84      0.88       325
           1       0.97      0.99      0.98      1701

    accuracy                           0.96      2026
   macro avg       0.94      0.91      0.93      2026
weighted avg       0.96      0.96      0.96      2026


Confusion Matrix:
 [[ 273   52]
 [  25 1676]]


In [64]:
# Save the trained model
with open('/Users/abhishekwaghchaure/Desktop/CreditCardAnalysis/Models/best_model.pkl', 'wb') as file:
    pickle.dump(random_forest, file)
print("Model saved as 'best_model.pkl'")

Model saved as 'best_model.pkl'
