# In [None]:
# Import necessary libraries
import os  # For operating system functionalities

import matplotlib.pyplot as plt  # For plotting visualizations
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
import seaborn as sns  # For enhanced visualizations
from imblearn.over_sampling import SMOTE  # For handling class imbalance through oversampling
from imblearn.under_sampling import RandomUnderSampler  # For handling class imbalance through undersampling
from river import linear_model  # For linear models
from river import metrics  # For performance metrics
from river.drift import ADWIN  # For drift detection
from river.ensemble import BaggingClassifier  # For ensemble learning using bagging
from river.preprocessing import OneHotEncoder  # For one-hot encoding categorical variables
from sklearn.ensemble import VotingClassifier  # For ensemble learning with voting
from sklearn.linear_model import LogisticRegression  # For logistic regression model
from sklearn.metrics import classification_report, roc_auc_score  # For model evaluation metrics
from sklearn.model_selection import train_test_split  # For splitting the dataset into training and testing sets
from sklearn.pipeline import make_pipeline  # For creating pipelines
from sklearn.preprocessing import StandardScaler  # For feature standardization
from sklearn.tree import DecisionTreeClassifier  # For decision tree model

# Set environment variable to limit CPU usage (optional)
os.environ["LOKY_MAX_CPU_COUNT"] = "4"  # Adjust as needed for parallel processing

# Standalone feature scaler. It is created here but not wired into any
# estimator below; StandardScaler comes from sklearn.preprocessing.
scaler = StandardScaler()

# Baseline logistic regression with a raised iteration cap (default is 100).
# The original cell assigned `model` three times in a row (plain LR, a
# scaler+LR pipeline, then this SAGA LR); only this last assignment was ever
# observable, so the overwritten ones were dropped.
model = LogisticRegression(solver='saga', max_iter=200)

# Step 1: Read the churn dataset from disk
df = pd.read_csv('customer_dataset.csv')

# Step 2: Quick look at the schema and the first rows
print(df.columns)
print(df.head())

# Step 3: Synthesize a monthly 'date' column, but only for small frames that
# do not already carry one.
# NOTE(review): the 'M' alias is deprecated in pandas >= 2.2 in favor of 'ME';
# left unchanged here because 'ME' is rejected by older pandas releases.
row_count = len(df)
if row_count <= 1000 and 'date' not in df.columns:
    df['date'] = pd.date_range(start='2020-01-01', periods=row_count, freq='M')

# Step 4: Every object-typed column except the target is treated as categorical
categorical_cols = [
    name
    for name in df.select_dtypes(include=['object']).columns
    if name != 'churn'
]

# Replace the categorical columns with dummy/indicator columns, dropping the
# first level of each one
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# 1. Per-column count of missing entries; only columns with at least one
#    missing value are printed
missing_values = df.isna().sum()
has_missing = missing_values > 0
print("\nMissing values in each column:")
print(missing_values.loc[has_missing])

# 2. Summary statistics as a first pass at spotting anomalies
summary = df.describe()
print("\nSummary statistics:")
print(summary)

# 3. Check for anomalies (e.g., outliers) in each column
# Defining a function to identify anomalies based on IQR (Interquartile Range)
def identify_anomalies(df: pd.DataFrame, iqr_multiplier: float = 1.5) -> dict:
    """Flag outliers in every numeric column using the IQR rule.

    For each column selected by ``select_dtypes(include=[np.number])`` the
    25th and 75th percentiles are computed, and a value is flagged when it
    lies strictly below ``Q1 - iqr_multiplier * IQR`` or strictly above
    ``Q3 + iqr_multiplier * IQR``.

    Args:
        df: Frame to scan. Non-numeric columns are ignored.
        iqr_multiplier: Width of the fences in IQR units. The default of 1.5
            reproduces the previously hard-coded behavior.

    Returns:
        A dict mapping each numeric column name to a Series holding the
        flagged values (original index preserved). The Series is empty when
        nothing in the column is flagged.
    """
    anomalies = {}
    for column in df.select_dtypes(include=[np.number]).columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        fence = iqr_multiplier * (q3 - q1)
        lower_bound = q1 - fence
        upper_bound = q3 + fence
        # Explicit comparisons (rather than ~between) so NaN entries, which
        # compare False on both sides, are never reported as anomalies.
        anomalies[column] = values[(values < lower_bound) | (values > upper_bound)]

    return anomalies

# Run the IQR-based scan over the current frame
anomalies = identify_anomalies(df)

# Report, column by column, either the flagged values or the absence of any
print("\nAnomalies detected in each numerical column:")
for column, anomaly_values in anomalies.items():
    if anomaly_values.empty:
        print(f"{column}: No anomalies detected.")
        continue
    print(f"{column}: {anomaly_values.tolist()}")

# 3. Visualize the distribution of key features to detect outliers
# NOTE(review): assumes the dataset has these four columns — confirm against
# the CSV schema.
plt.figure(figsize=(12, 6))  # Set figure size for the plot
sns.boxplot(data=df[['age', 'income', 'monthly_bill', 'outstanding_balance']])  # Create boxplot for key features
plt.title('Boxplot of Key Features')  # Title for the plot
plt.show()  # Display the plot

# Optional: Visualize correlations between features
# Correlate numeric/bool columns only. At this point the frame may still hold
# a datetime 'date' column and an object-typed 'churn' column, and calling
# DataFrame.corr() on those raises in pandas >= 2.0.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10, 8))  # Set figure size for the heatmap
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')  # Create a heatmap for correlations
plt.title('Correlation Matrix')  # Title for the heatmap
plt.show()  # Display the heatmap

# Attempt to convert remaining object columns to numeric.
# The target is skipped on purpose: if 'churn' holds string labels, coercing
# it would turn every label into NaN and the median fill below could not
# recover them.
for col in df.columns:
    if col != 'churn' and df[col].dtype == 'object':
        df[col] = pd.to_numeric(df[col], errors='coerce')  # Unparseable entries become NaN

# Fill NaN values with the per-column median. The medians are computed over
# numeric columns only so that datetime/object columns cannot break the
# reduction; fillna with a Series only touches the columns it names.
numeric_medians = df.select_dtypes(include=['number']).median()
df.fillna(numeric_medians, inplace=True)

# Calculate and visualize correlations again (numeric/bool columns only, for
# the same reason as the median above)
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f')  # Create heatmap for correlations
plt.title('Feature Correlation Matrix')  # Title for the heatmap
plt.show()  # Display the heatmap

# Step 5: Separate the target from the features ('date', when present, is
# not used as a feature)
y = df['churn']
X = df.drop(columns=['churn', 'date'], errors='ignore')

# Step 6: Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Two alternative rebalancing strategies, both applied to the training split
# only: SMOTE oversampling and random undersampling
smote = SMOTE(random_state=42)
undersampler = RandomUnderSampler(random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)

def _fit_logreg(train_features, train_labels):
    """Fit a logistic regression and score it on the held-out test features.

    Returns the fitted estimator, its hard predictions for ``X_test`` and the
    predicted probability of the second class (column 1) for ``X_test``.
    """
    estimator = LogisticRegression(max_iter=1000)
    estimator.fit(train_features, train_labels)
    test_proba = estimator.predict_proba(X_test)
    return estimator, estimator.predict(X_test), test_proba[:, 1]


# One model per rebalancing strategy: SMOTE-oversampled training data ...
model_smote, y_pred_smote, y_pred_proba_smote = _fit_logreg(
    X_train_resampled, y_train_resampled
)

# ... and randomly undersampled training data
model_undersample, y_pred_undersample, y_pred_proba_undersample = _fit_logreg(
    X_train_undersampled, y_train_undersampled
)

# Evaluate both models against the same untouched test split
smote_report = classification_report(y_test, y_pred_smote)
smote_auc = roc_auc_score(y_test, y_pred_proba_smote)
print("\nClassification Report for SMOTE:")
print(smote_report)
print("\nAUC-ROC Score for SMOTE:", smote_auc)

undersample_report = classification_report(y_test, y_pred_undersample)
undersample_auc = roc_auc_score(y_test, y_pred_proba_undersample)
print("\nClassification Report for Undersampling:")
print(undersample_report)
print("\nAUC-ROC Score for Undersampling:", undersample_auc)

# Visualizations: class balance of the training labels after each
# rebalancing strategy, one count plot per strategy
for resampled_labels, plot_title in (
    (y_train_resampled, 'Class Distribution After SMOTE'),
    (y_train_undersampled, 'Class Distribution After Undersampling'),
):
    sns.countplot(x=resampled_labels)
    plt.title(plot_title)
    plt.xlabel('Churn')
    plt.ylabel('Count')
    plt.show()

# 4. Create a 'date' column if necessary (optional)
# Remove the following block if you don't need a date column
# Only small frames without an existing 'date' column get one.
if 'date' not in df.columns and len(df) <= 1000:  # Adjust threshold as needed
    # 'MS' (month start) is the valid offset alias; the previous 'Ms'
    # spelling is not a recognized frequency string in current pandas.
    df['date'] = pd.date_range(start='2020-01-01', periods=len(df), freq='MS')

# 5. Identify categorical columns for one-hot encoding
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()  # Get categorical columns
if 'churn' in categorical_cols:  # Exclude target variable from categorical columns
    categorical_cols.remove('churn')

# 6. One-hot encode categorical variables
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)  # Convert categorical variables into dummy/indicator variables

def _new_online_model():
    """Return a fresh, untrained bagging ensemble of river logistic regressions."""
    # river's BaggingClassifier takes the wrapped estimator as `model`
    # (`base_estimator` is scikit-learn's keyword and is rejected here).
    return BaggingClassifier(
        model=linear_model.LogisticRegression(),
        n_models=10,
        seed=42,
    )


# Initialize model and drift detector
# NOTE(review): the features are fed to the SGD-based logistic regression
# unscaled; consider putting a river scaler in front of it.
model = _new_online_model()
drift_detector = ADWIN()  # Monitors the stream of 0/1 prediction errors
metric = metrics.ROCAUC()  # river's streaming ROC AUC (there is no metrics.AUC)

# Step 7: Train the model
# Materialize the training stream once instead of calling .iloc per row.
train_stream = list(zip(X_train.to_dict(orient='records'), y_train))

for i, (x, y_true) in enumerate(train_stream):
    # Test-then-train (prequential) evaluation: score the instance before
    # learning from it. The previous version indexed X_test with the training
    # index, which runs past the end of the (smaller) test set and also leaks
    # test labels into the training loop.
    y_proba = model.predict_proba_one(x)
    y_pred = model.predict_one(x)

    # update() is called for its side effect; its return value is not the
    # metric, so it must not be assigned back.
    metric.update(y_true, y_proba)

    # ADWIN looks for a change in the mean of a numeric stream, so feed it
    # the error indicator rather than the raw predicted label.
    drift_detector.update(int(y_pred != y_true))

    model.learn_one(x, y_true)

    # Check if drift is detected
    if drift_detector.drift_detected:
        print(f"Drift detected at index {i}. Retraining model...")
        # Rebuild the ensemble from the instances seen so far, so that later
        # prequential scores are not contaminated by unseen future rows.
        model = _new_online_model()
        for x_seen, y_seen in train_stream[:i + 1]:
            model.learn_one(x_seen, y_seen)
        # A fresh detector is the version-independent way to reset ADWIN.
        drift_detector = ADWIN()

# Final evaluation on the test set after training
# river classifiers predict one instance at a time; there is no batch predict().
test_records = X_test.to_dict(orient='records')
# NOTE(review): assumes 'churn' is encoded as 0/1 (or bool) — confirm.
y_pred = [int(bool(model.predict_one(x))) for x in test_records]
y_score = [model.predict_proba_one(x).get(True, 0.0) for x in test_records]

print("Final Classification Report:")  # Print classification report header
print(classification_report(y_test, y_pred))  # Display the classification report

# Calculate and print final AUC-ROC score from probabilities, not hard labels,
# so the ranking quality is actually measured
auc = roc_auc_score(y_test, y_score)
print("Final AUC-ROC Score:", auc)  # Print the final AUC-ROC score