# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, f1_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neural_network import MLPClassifier

# Read the raw training table and discard the row identifier column.
train_data = pd.read_csv("../data/cell2celltrain.csv").drop(columns='CustomerID')

# Integer-code every object-typed column. Missing entries are replaced with
# the placeholder 'Unknown' before encoding, so they end up as a category of
# their own rather than reaching the encoder as NaN.
object_columns = [
    name for name in train_data.columns
    if train_data[name].dtype == 'object'
]
for name in object_columns:
    filled = train_data[name].fillna('Unknown')
    train_data[name] = LabelEncoder().fit_transform(filled)

# Separate the target from the features and hold out 20% for validation.
y = train_data['Churn']
X = train_data.drop(columns='Churn')
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Mean-impute the remaining gaps, then standardise. Both transformers are
# fitted on the training split only and merely applied to the validation
# split.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(X_train))
X_val = scaler.transform(imputer.transform(X_val))

# Encode the target with an encoder fitted on the training labels.
# NOTE(review): if 'Churn' is an object column it was already integer-coded
# by the loop above, in which case this pass is presumably a no-op — confirm.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_val = le.transform(y_val)

# Three base learners, all seeded identically for reproducibility.
logistic_regression = LogisticRegression(random_state=42)
random_forest = RandomForestClassifier(random_state=42)
xgboost = XGBClassifier(random_state=42)
base_models = (logistic_regression, random_forest, xgboost)

# Fit each learner on the training split, in the order listed above.
for model in base_models:
    model.fit(X_train, y_train)

# Keep only column 1 of predict_proba, i.e. the probability assigned to the
# second class, for every validation row.
y_pred_lr, y_pred_rf, y_pred_xgb = (
    model.predict_proba(X_val)[:, 1] for model in base_models
)

# Explore different blending weights.
#
# The logistic-regression and random-forest weights are enumerated over
# multiples of 0.1 and the XGBoost weight is derived as the remainder, so
# every evaluated triple sums to 1 by construction. The grid must contain
# even tenths as well as odd ones: three weights drawn only from odd tenths
# (0.1, 0.3, 0.5, ...) always add up to an odd number of tenths and can
# therefore never total 1.0, which would leave nothing to evaluate.
weights = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
results = []

for lr_weight in weights:
    for rf_weight in weights:
        # Round away float noise such as 1 - 0.8 - 0.1 == 0.0999...
        xgb_weight = round(1 - lr_weight - rf_weight, 1)
        if xgb_weight <= 0:
            # Every model must keep a strictly positive share of the blend.
            continue

        y_pred_weighted = (
            (lr_weight * y_pred_lr)
            + (rf_weight * y_pred_rf)
            + (xgb_weight * y_pred_xgb)
        )
        results.append({
            'lr': lr_weight,
            'rf': rf_weight,
            'xgb': xgb_weight,
            # ROC AUC is computed on the blended probabilities themselves.
            'roc_auc': roc_auc_score(y_val, y_pred_weighted),
            # F1 needs hard labels; blended probabilities are cut at 0.5.
            'f1': f1_score(y_val, y_pred_weighted > 0.5),
        })

# Convert results to a dataframe. The columns are given explicitly so the
# sort below has a 'roc_auc' column to work with even if no rows were added.
results_df = pd.DataFrame(results, columns=['lr', 'rf', 'xgb', 'roc_auc', 'f1'])

# Sort the results by ROC AUC, best first
results_df = results_df.sort_values(by='roc_auc', ascending=False)

# Print the top 5 combinations
print("\nTop 5 Blending Combinations:")
print(results_df.head(5))