In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc, confusion_matrix, classification_report, make_scorer
import pickle
import os
import plotly.graph_objects as go
import plotly.express as px
from typing import Dict, List, Tuple, Any, Optional

# Load the data
data = pd.read_csv("../data/cell2celltrain.csv")

# Preprocessing: drop the customer identifier so it is not used as a feature
data = data.drop('CustomerID', axis=1)

# Handle missing categorical values and convert categorical features to
# numerical codes. Missing entries in object columns become their own
# 'Unknown' category before label encoding. This also encodes the target
# column if it is stored as text.
for col in data.columns:
    if data[col].dtype == 'object':
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col].fillna('Unknown'))

# Define features and target
X = data.drop('Churn', axis=1)
y = data['Churn']

# Split data into training and testing with stratification.
# The split happens BEFORE imputation and scaling so that the statistics
# learned by those transformers come from the training rows only. Fitting
# them on the full dataset would leak test-set information into training and
# make the held-out metrics optimistic. The rows selected are the same as
# before because the split depends only on the sample count, y, and the seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Impute missing values: column means are learned from the training rows and
# then re-used unchanged for the test rows
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Scale numerical features: mean/variance are learned from the training rows
# and then re-used unchanged for the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep X as a fully transformed feature matrix (as it was previously), but
# built with the train-fitted transformers so it carries no test statistics
X = scaler.transform(imputer.transform(X))

# Define models
models = {
    'Logistic Regression': LogisticRegression(random_state=42, solver='liblinear'),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
}

# Define hyperparameter grids (keys must match the keys of `models`)
param_grids = {
    'Logistic Regression': {'C': [0.001, 0.01, 0.1, 1, 10]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]},
    'XGBoost': {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.3]}
}

# Define scoring metrics evaluated during cross-validation
scoring = ['roc_auc', 'f1', 'precision']

# Create a function to perform cross-validation and hyperparameter tuning
def tune_and_evaluate(model, param_grid, X_train, y_train, X_test, y_test, scoring, model_name):
    """Grid-search a model with cross-validation and report held-out results.

    The search runs over ``param_grid`` using 5-fold stratified, shuffled
    cross-validation (seed 42), scores every candidate with the metrics in
    ``scoring``, and refits the winner on the full training data according
    to ``'roc_auc'``. The refitted estimator is then evaluated on the test
    split and a short report is printed.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Hyperparameter grid handed to ``GridSearchCV``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.
        scoring: Metrics passed to ``GridSearchCV``; must include
            ``'roc_auc'`` because that metric is used for refitting.
        model_name: Label used in the printed report.

    Returns:
        A tuple ``(best_model, y_prob)``: the refitted best estimator and
        the second column of its ``predict_proba`` output on ``X_test``.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    search = GridSearchCV(
        model,
        param_grid,
        scoring=scoring,
        refit='roc_auc',
        cv=folds,
        return_train_score=True,
    )
    search.fit(X_train, y_train)

    # Evaluate the refitted winner on the held-out split.
    winner = search.best_estimator_
    test_labels = winner.predict(X_test)
    test_scores = winner.predict_proba(X_test)[:, 1]

    print(f"Model: {model_name}")
    print(f"Best parameters: {search.best_params_}")
    held_out_auc = roc_auc_score(y_test, test_scores)
    print(f"ROC AUC: {held_out_auc}")
    report = classification_report(y_test, test_labels)
    print(f"Classification Report:\n{report}")

    return winner, test_scores

# Train, evaluate, and save models.
# Each entry of `models` is tuned with its matching grid from `param_grids`;
# the refitted best estimator is kept under the same name.
trained_models = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")
    best_model, y_prob = tune_and_evaluate(model, param_grids[model_name], X_train, y_train, X_test, y_test, scoring, model_name)
    trained_models[model_name] = best_model

# Create the models directory if it doesn't exist. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('models', exist_ok=True)

# NOTE(review): only the estimators are pickled below; the fitted imputer and
# scaler are not saved in this section, so inference code has to reproduce the
# same preprocessing -- confirm they are persisted elsewhere.

# Save the trained XGBoost model. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
filename = 'models/xgboost_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(trained_models['XGBoost'], model_file)
print(f"XGBoost model saved to {filename}")

# Save the trained Logistic Regression model
filename = 'models/logistic_regression_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(trained_models['Logistic Regression'], model_file)
print(f"Logistic Regression model saved to {filename}")