# In [None]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import os
import inspect
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session so the
# cell output below stays readable.
warnings.filterwarnings(action='ignore')

print("Libraries imported successfully")

# Cell 2: Load and Explore Data
# Read the raw training table; every later cell works from this frame.
df = pd.read_csv('/workspace/data/train.csv')

# Structural overview: size, column names, a preview and the dtypes.
print(f"Dataset shape: {df.shape}")
print(f"\nColumn names: {df.columns.tolist()}")
print("\nFirst few rows:", df.head(), sep="\n")

print("\nData types:", df.dtypes, sep="\n")

# Target balance: raw label counts plus the share of 'Yes' labels.
print("\nTarget distribution:", df['Churn'].value_counts(), sep="\n")
churn_rate = df['Churn'].eq('Yes').mean()
print(f"Churn rate: {churn_rate:.2%}")

# Cell 3: Data Quality Analysis
print("="*60)
print("DATA QUALITY ANALYSIS")
print("="*60)

# Missing values: report only the columns that actually contain nulls.
print("\nMissing values per column:")
missing = df.isnull().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("No missing values found")

# TotalCharges issue: the column may be loaded as text when some entries
# cannot be parsed as numbers.
print("\n--- TotalCharges Analysis ---")
print(f"Original data type: {df['TotalCharges'].dtype}")

# Parse once and reuse both the numeric values and the "failed to parse" mask.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
unparsed_mask = total_charges_numeric.isna()

# Check for non-numeric values. Testing "not numeric" (rather than
# dtype == 'object') also covers pandas string dtypes.
if not pd.api.types.is_numeric_dtype(df['TotalCharges']):
    non_numeric = df[unparsed_mask]
    print(f"Non-numeric TotalCharges entries: {len(non_numeric)}")
    if len(non_numeric) > 0:
        print("\nSample of problematic rows:")
        print(non_numeric[['customerID', 'tenure', 'MonthlyCharges', 'TotalCharges']].head())

# Convert to numeric (unparseable entries become NaN)
df['TotalCharges'] = total_charges_numeric
nan_count = int(unparsed_mask.sum())
print(f"NaN values after conversion: {nan_count}")

# Analyze patterns in NaN values
if nan_count > 0:
    print("\nCustomers with NaN TotalCharges:")
    nan_customers = df[unparsed_mask]
    print(f"Average tenure: {nan_customers['tenure'].mean():.2f} months")
    # Only state the "new customer" explanation when the data supports it.
    if (nan_customers['tenure'] == 0).all():
        print("Likely new customers with tenure = 0")

# Cell 4: Exploratory Data Analysis
print("\n" + "="*60)
print("EXPLORATORY DATA ANALYSIS")
print("="*60)

# Categorical features: object columns minus the identifier and the target.
# Filtering (instead of list.remove) does not raise when 'customerID' is
# absent or was not loaded as an object column.
non_feature_cols = {'customerID', 'Churn'}
categorical_cols = [
    col
    for col in df.select_dtypes(include=['object']).columns
    if col not in non_feature_cols
]

print(f"\nCategorical features ({len(categorical_cols)}):")
for col in categorical_cols:
    unique_count = df[col].nunique()
    print(f"  {col}: {unique_count} unique values - {df[col].unique()[:3]}")

# Numerical features
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"\nNumerical features ({len(numerical_cols)}):")
print(df[numerical_cols].describe())

# Churn rate by key features
print("\n--- Churn Rate Analysis ---")
# Loop-invariant: compare the target against 'Yes' once, then average the
# boolean flag within each group.
is_churned = df['Churn'] == 'Yes'
for col in ['Contract', 'InternetService', 'PaymentMethod']:
    if col not in df.columns:
        continue
    churn_by_feature = (
        is_churned.groupby(df[col]).mean().sort_values(ascending=False)
    )
    print(f"\nChurn rate by {col}:")
    print(churn_by_feature)

# Cell 5: Define ChurnPredictor Class
class ChurnPredictor:
    """
    Churn classifier bundling preprocessing and a Random Forest model.

    The estimator is trained from a raw customer DataFrame that contains a
    'Churn' target column. The 'customerID' column, when present, is dropped,
    'TotalCharges' is coerced to numeric (unparseable text becomes NaN), and
    the remaining columns go through a scikit-learn pipeline:

    - numeric columns: median imputation, then standard scaling
    - categorical columns (object, string, category, bool): most-frequent
      imputation, then one-hot encoding that ignores unseen levels
    - RandomForestClassifier with class_weight='balanced'

    The class only uses names imported at module level (pd, np and the
    sklearn classes) and keeps all helpers as methods, because its source is
    exported on its own via inspect.getsource further down in this file.

    Attributes:
        random_state: Seed forwarded to the Random Forest.
        pipeline: Fitted sklearn Pipeline, or None before fit().
        feature_columns: Ordered training feature names, or None before fit().
    """

    def __init__(self, random_state=42):
        self.random_state = random_state
        self.pipeline = None
        self.feature_columns = None

    @staticmethod
    def _is_categorical_dtype(dtype):
        """Return True when a column dtype should be one-hot encoded."""
        types = pd.api.types
        if types.is_bool_dtype(dtype):
            return True
        if types.is_numeric_dtype(dtype):
            return False
        # Covers object columns, pandas string dtypes and categoricals.
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
        )

    @staticmethod
    def _coerce_total_charges(frame):
        """Convert 'TotalCharges' to numeric in place; bad text becomes NaN."""
        if 'TotalCharges' in frame.columns:
            frame['TotalCharges'] = pd.to_numeric(
                frame['TotalCharges'], errors='coerce'
            )
        return frame

    def _require_fitted(self, method_name):
        """Raise RuntimeError when `method_name` is called before fit()."""
        if self.pipeline is None:
            raise RuntimeError(f"Call fit() before {method_name}()")

    def _encode_target(self, churn_series):
        """
        Encode the Churn target as a binary 0/1 numpy array.

        Accepted encodings:
        - text labels 'Yes'/'No', 'True'/'False' or '1'/'0' (case and
          surrounding whitespace ignored) stored as object, string or
          category dtype
        - numeric 0/1 (integer or float)
        - boolean True/False

        Raises:
            ValueError: if the series is empty, contains NaN, or uses labels
                outside the encodings listed above.
        """
        if len(churn_series) == 0:
            raise ValueError("Churn column is empty")

        missing_mask = churn_series.isna()
        if missing_mask.any():
            raise ValueError(
                f"Churn column contains {missing_mask.sum()} NaN values"
            )

        # Booleans first: pandas also reports bool dtype as numeric.
        if pd.api.types.is_bool_dtype(churn_series):
            return churn_series.astype(int).to_numpy()

        if pd.api.types.is_numeric_dtype(churn_series):
            if set(churn_series.unique()).issubset({0, 1}):
                return churn_series.astype(int).to_numpy()
        else:
            # Any non-numeric dtype is compared through its normalized text
            # form, so object, string and category columns behave the same.
            labels = churn_series.astype(str).str.strip().str.lower()
            observed = set(labels.unique())
            label_pairs = (('yes', 'no'), ('true', 'false'), ('1', '0'))
            for positive, negative in label_pairs:
                if observed.issubset({positive, negative}):
                    return (labels == positive).astype(int).to_numpy()

        # If we reach here, format is unrecognized
        raise ValueError(
            f"Unrecognized Churn format. Got unique values: {churn_series.unique()}\n"
            f"Expected: 'Yes'/'No', 0/1, or True/False"
        )

    def _build_pipeline(self, X):
        """
        Build the unfitted preprocessing + model pipeline for the columns of X.

        Columns that are neither numeric nor categorical (for example
        datetimes) are dropped by the ColumnTransformer.
        """
        numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        already_numeric = set(numerical_cols)

        # Dtype predicates are used instead of select_dtypes(include='object')
        # so that pandas string-dtype columns are not silently dropped.
        categorical_cols = [
            name
            for name, dtype in X.dtypes.items()
            if name not in already_numeric and self._is_categorical_dtype(dtype)
        ]

        # Numeric preprocessing: impute with median, then scale
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Categorical preprocessing: impute most frequent, then one-hot encode
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

        # Combine preprocessing
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numerical_cols),
                ('cat', categorical_transformer, categorical_cols)
            ],
            remainder='drop'
        )

        # Full pipeline: preprocessing + model
        return Pipeline(steps=[
            ('preprocess', preprocessor),
            ('model', RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                min_samples_split=20,
                min_samples_leaf=10,
                class_weight='balanced',
                random_state=self.random_state,
                n_jobs=-1
            ))
        ])

    def fit(self, train_df):
        """
        Train the churn prediction model.

        Args:
            train_df: DataFrame with feature columns and a 'Churn' target.
                The input frame is not modified.

        Returns:
            self, so calls can be chained.

        Raises:
            TypeError: if train_df is not a pandas DataFrame.
            ValueError: if the frame is empty, lacks 'Churn', has no feature
                columns, or the target cannot be encoded.
        """
        if not isinstance(train_df, pd.DataFrame):
            raise TypeError("fit() expects a pandas DataFrame")

        if len(train_df) == 0:
            raise ValueError("Training DataFrame is empty")

        if 'Churn' not in train_df.columns:
            raise ValueError("Training data must contain 'Churn' column")

        # Work on a copy so the caller's frame is left untouched.
        df = train_df.copy()

        # Drop customerID (identifier, not a feature)
        if 'customerID' in df.columns:
            df = df.drop(columns=['customerID'])

        self._coerce_total_charges(df)

        # Separate features and target
        X = df.drop(columns=['Churn'])
        if X.shape[1] == 0:
            raise ValueError("Training data has no feature columns")
        y = self._encode_target(df['Churn'])

        # Store feature columns for prediction alignment
        self.feature_columns = X.columns.tolist()

        # Build and train pipeline
        self.pipeline = self._build_pipeline(X)
        self.pipeline.fit(X, y)

        return self

    def _prepare_X(self, X):
        """
        Return a copy of X shaped like the training features.

        'customerID' is dropped, columns are reordered to the training
        order, columns missing from X are added as NaN, columns unknown to
        the model are discarded, and 'TotalCharges' is coerced to numeric.

        Raises:
            TypeError: if X is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input features must be a pandas DataFrame")

        X_df = X.copy()

        # Drop customerID if present
        if 'customerID' in X_df.columns:
            X_df = X_df.drop(columns=['customerID'])

        # Align with training: reindex reorders, adds missing columns as NaN
        # and drops extras in one step.
        if self.feature_columns is not None:
            X_df = X_df.reindex(columns=self.feature_columns)

        return self._coerce_total_charges(X_df)

    def predict(self, X):
        """
        Predict churn class (0 or 1) for each row of X.

        Raises:
            RuntimeError: if called before fit().
            TypeError: if X is not a pandas DataFrame.
        """
        self._require_fitted('predict')
        return self.pipeline.predict(self._prepare_X(X))

    def predict_proba(self, X):
        """
        Predict class probabilities for each row of X.

        Returns the array produced by the underlying pipeline's
        predict_proba; callers in this file read column 1 as the churn
        probability.

        Raises:
            RuntimeError: if called before fit().
            TypeError: if X is not a pandas DataFrame.
        """
        self._require_fitted('predict_proba')
        return self.pipeline.predict_proba(self._prepare_X(X))

print("✓ ChurnPredictor class defined successfully")

# Cell 6: Train-Validation Split
# Hold out 20% for validation; stratifying on the target keeps the churn
# ratio the same in both partitions.
train_df, val_df = train_test_split(
    df,
    stratify=df['Churn'],
    test_size=0.2,
    random_state=42,
)

total_rows = len(df)
for split_name, split_frame in (("Training", train_df), ("Validation", val_df)):
    print(f"{split_name} set: {len(split_frame)} samples ({len(split_frame)/total_rows:.1%})")

print("\nClass distribution:")
for split_name, split_frame in (("Training", train_df), ("Validation", val_df)):
    # Left-pad the label to 10 characters so both rows line up.
    print(f"{split_name:<10} - Churn: {split_frame['Churn'].eq('Yes').mean():.2%}")

# Cell 7: Train the Model
print("Training model...")
# fit() returns the estimator itself, so construction and training chain.
model = ChurnPredictor(random_state=42).fit(train_df)
print("✓ Model training complete")

# Cell 8: Evaluate on Validation Set
# Binary ground truth (1 = 'Yes') and the feature frame without the target.
y_val = val_df['Churn'].eq('Yes').astype(int)
X_val = val_df.drop(columns=['Churn'])

# Hard class labels and per-class probabilities from the fitted model.
y_pred_proba = model.predict_proba(X_val)
y_pred = model.predict(X_val)

# ROC AUC is computed on the probability column at index 1.
roc_auc = roc_auc_score(y_val, y_pred_proba[:, 1])

divider = "=" * 60
status = '✓ PASS' if roc_auc >= 0.83 else '✗ FAIL'
print("\n" + divider)
print("VALIDATION PERFORMANCE")
print(divider)
print(f"ROC AUC Score: {roc_auc:.4f}")
print("Target: >= 0.83")
print(f"Status: {status}")
print(divider)

# Per-class precision / recall / F1
print(
    "\nClassification Report:",
    classification_report(y_val, y_pred, target_names=['No Churn', 'Churn']),
    sep="\n",
)

# Confusion matrix as returned by sklearn for (y_val, y_pred)
cm = confusion_matrix(y_val, y_pred)
print("\nConfusion Matrix:", cm, sep="\n")

# Cell 9: Feature Importance Analysis
divider = "=" * 60
print("\n" + divider)
print("FEATURE IMPORTANCE")
print(divider)

# Pull both fitted stages out of the trained pipeline.
fitted_steps = model.pipeline.named_steps
preprocessor = fitted_steps['preprocess']
rf_model = fitted_steps['model']

# Output column names of the preprocessor, paired with the forest's
# importance score for each of those columns.
feature_names = preprocessor.get_feature_names_out()
importances = rf_model.feature_importances_

# Rank features from most to least important.
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values('importance', ascending=False)

print("\nTop 15 Most Important Features:")
print(importance_df.head(15).to_string(index=False))

# Cell 10: Test Edge Cases
divider = "=" * 60
print("\n" + divider)
print("EDGE CASE TESTING")
print(divider)

# Test 1: a one-row DataFrame goes through both prediction paths.
single_row = X_val.iloc[[0]]
pred_single = model.predict(single_row)
proba_single = model.predict_proba(single_row)
no_churn_p, churn_p = proba_single[0]

print(f"✓ Single row prediction: {pred_single[0]}")
print(f"  Probabilities: [No Churn: {no_churn_p:.3f}, Churn: {churn_p:.3f}]")

# Test 2: a small batch of the first five validation rows.
batch = X_val.head(5)
pred_batch = model.predict(batch)
print(f"\n✓ Batch prediction (5 rows): {pred_batch}")

# Test 3: features taken straight from val_df, so the identifier column
# is still present when the frame reaches predict().
test_with_id = val_df.drop(columns=['Churn']).head(3)
pred_with_id = model.predict(test_with_id)
print(f"\n✓ Prediction with customerID: {pred_with_id}")

print("\n✓ All edge cases passed")

# Cell 11: Export to /results/utils.py (CRITICAL STEP)
print("\n" + "="*60)
print("EXPORTING SOLUTION")
print("="*60)

import importlib.util
import sys

export_dir = '/results'
export_path = os.path.join(export_dir, 'utils.py')

# Create /results directory
os.makedirs(export_dir, exist_ok=True)

# Get the source code of the ChurnPredictor class
class_source = inspect.getsource(ChurnPredictor)

# Header plus every import the class body needs, so the exported module is
# importable on its own.
export_header = (
    "# ChurnPredictor - Exported for automated testing\n"
    "# This file is automatically generated - do not edit manually\n\n"
    "import numpy as np\n"
    "import pandas as pd\n"
    "from sklearn.compose import ColumnTransformer\n"
    "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n"
    "from sklearn.pipeline import Pipeline\n"
    "from sklearn.impute import SimpleImputer\n"
    "from sklearn.ensemble import RandomForestClassifier\n\n"
)

# Explicit encoding keeps the written bytes independent of the locale.
with open(export_path, 'w', encoding='utf-8') as f:
    f.write(export_header)
    f.write(class_source)

print("✓ Class definition written to /results/utils.py")

# Keep /results importable for later `import utils` calls, but do not add a
# duplicate entry every time this cell is re-run.
if export_dir not in sys.path:
    sys.path.insert(0, export_dir)

# Verify the export by loading the file that was just written. Loading by
# explicit path avoids picking up an unrelated or previously cached module
# that also happens to be called `utils`.
export_spec = importlib.util.spec_from_file_location(
    'exported_churn_utils', export_path
)
exported_module = importlib.util.module_from_spec(export_spec)
export_spec.loader.exec_module(exported_module)
ExportedPredictor = exported_module.ChurnPredictor

# Test the imported class on a small sample
test_model = ExportedPredictor()
test_model.fit(train_df.head(50))
verification_rows = X_val.head(5)
test_pred = test_model.predict(verification_rows)

# One prediction per input row is the minimum sign the export works.
if len(test_pred) != len(verification_rows):
    raise RuntimeError(
        f"Exported class returned {len(test_pred)} predictions "
        f"for {len(verification_rows)} rows"
    )

print("✓ Successfully imported and tested exported class")
print("✓ Export verification complete")

# Cell 12: Final Summary
divider = "=" * 60
# The closing verdict uses the same 0.83 ROC AUC threshold as the
# validation report.
verdict = (
    '✓ SOLUTION COMPLETE - READY FOR TESTING'
    if roc_auc >= 0.83
    else '✗ PERFORMANCE BELOW TARGET'
)
summary_lines = (
    f"✓ Dataset: {len(df)} total samples",
    f"✓ Training: {len(train_df)} samples",
    f"✓ Validation: {len(val_df)} samples",
    f"✓ ROC AUC Score: {roc_auc:.4f}",
    "✓ Target: >= 0.83",
    "✓ Class exported to /results/utils.py",
    "✓ Export verified by re-import",
)

print("\n" + divider)
print("SOLUTION SUMMARY")
print(divider)
for summary_line in summary_lines:
    print(summary_line)
print(f"\n{verdict}")
print(divider)