### Name: Shrinivas Kakkeri

### BITS ID: 2025AA05057

### Assignment: Machine Learning Assignment 2


In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
import xgboost as xgb

# Read the raw credit card churn dataset (path is relative to the working directory)
churn_csv_path = 'Credit_Card_Churn.csv'
df = pd.read_csv(churn_csv_path)

def preprocess_data(df):
    """Label-encode the churn dataset without modifying the input frame.

    Operates on a copy of ``df``: the ``CLIENTNUM`` identifier column is
    removed when present, ``Attrition_Flag`` is label-encoded, and every
    known categorical column that exists in the frame is cast to ``str``
    and label-encoded with its own encoder.

    Returns:
        A tuple ``(data, le_target, label_encoders)``: the encoded copy,
        the fitted target encoder, and a dict mapping each encoded
        categorical column name to its fitted encoder.
    """
    # errors='ignore' turns the ID removal into a no-op when the column is absent
    data = df.copy().drop(columns=['CLIENTNUM'], errors='ignore')

    # Encode the target column
    le_target = LabelEncoder()
    encoded_target = le_target.fit_transform(data['Attrition_Flag'])
    data['Attrition_Flag'] = encoded_target

    # Only the categorical columns actually present in this frame are encoded
    categorical_columns = [
        'Gender',
        'Education_Level',
        'Marital_Status',
        'Income_Category',
        'Card_Category',
    ]
    present_columns = [name for name in categorical_columns if name in data.columns]

    # One independent encoder per column, kept so the mapping can be reused later
    label_encoders = {name: LabelEncoder() for name in present_columns}
    for name, encoder in label_encoders.items():
        as_text = data[name].astype(str)
        data[name] = encoder.fit_transform(as_text)

    return data, le_target, label_encoders

# Encode the raw frame; the fitted encoders are kept so they can be persisted later
processed_df, target_encoder, feature_encoders = preprocess_data(df)

# Separate the label column from the predictor columns
y = processed_df['Attrition_Flag']
X = processed_df.drop(columns=['Attrition_Flag'])

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics on the training split only, then apply them to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate classifiers keyed by display name, in the order they are trained.
# Estimators that accept a seed are seeded with 42.
models_config = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Naive Bayes', GaussianNB()),
    ('Random Forest', RandomForestClassifier(random_state=42, n_estimators=100)),
    ('XGBoost', xgb.XGBClassifier(eval_metric='logloss', random_state=42)),
])

# Train, evaluate, and persist every configured model, then persist the
# preprocessing artefacts and the metric table.

# Single source of truth for which models are fit on standardized features.
# The same list drives the training branch below and is stored in the saved
# preprocessing metadata, so the two cannot drift apart.
models_requiring_scaling = ['Logistic Regression', 'KNN', 'Naive Bayes']

trained_models = {}
model_performance = {}

for name, model in models_config.items():
    print(f"Training {name}...")

    # Pick the feature representation once; the fit/predict calls are then shared
    if name in models_requiring_scaling:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)
    y_pred_proba = model.predict_proba(test_features)

    # Calculate metrics.
    # zero_division=0 yields the same 0.0 score the default setting would,
    # but without emitting UndefinedMetricWarning when a model never
    # predicts one of the classes.
    # NOTE(review): weighted averages are dominated by the majority class;
    # read them together with AUC and MCC when classes are imbalanced.
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        # AUC is computed from the probability of the class encoded as 1
        'AUC': roc_auc_score(y_test, y_pred_proba[:, 1]),
        'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'F1': f1_score(y_test, y_pred, average='weighted', zero_division=0),
        'MCC': matthews_corrcoef(y_test, y_pred)
    }

    trained_models[name] = model
    model_performance[name] = metrics

    # Save individual model, e.g. 'Random Forest' -> 'random_forest_model.pkl'
    with open(f'{name.replace(" ", "_").lower()}_model.pkl', 'wb') as f:
        pickle.dump(model, f)

# Save preprocessing components needed to reproduce the feature pipeline
preprocessing_components = {
    'scaler': scaler,
    'target_encoder': target_encoder,
    'feature_encoders': feature_encoders,
    'feature_names': list(X.columns),
    'models_requiring_scaling': list(models_requiring_scaling)
}

with open('preprocessing_components.pkl', 'wb') as f:
    pickle.dump(preprocessing_components, f)

# Save model performance results
with open('model_performance.pkl', 'wb') as f:
    pickle.dump(model_performance, f)

print("All models and components saved successfully!")

# Display results: one row per model, one column per metric
results_df = pd.DataFrame(model_performance).T
print("\nModel Performance Comparison:")
print(results_df)

Training Logistic Regression...
Training Decision Tree...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training KNN...
Training Naive Bayes...
Training Random Forest...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training XGBoost...
All models and components saved successfully!

Model Performance Comparison:
                     Accuracy       AUC  Precision  Recall        F1       MCC
Logistic Regression     0.846  0.549104   0.715716   0.846  0.775424  0.000000
Decision Tree           0.720  0.515827   0.747034   0.720  0.732702  0.028894
KNN                     0.826  0.508115   0.739642   0.826  0.772559  0.000561
Naive Bayes             0.846  0.546558   0.715716   0.846  0.775424  0.000000
Random Forest           0.846  0.532427   0.715716   0.846  0.775424  0.000000
XGBoost                 0.840  0.513405   0.741480   0.840  0.775149  0.003344
