In [37]:
import warnings

import joblib
import numpy as np
import pandas as pd

# Restore the persisted estimators and the feature scaler in one pass.
# File names are resolved relative to the current working directory.
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
log_reg, dt_model, best_rf_model, scaler = map(
    joblib.load,
    (
        "logistic_churn_model.pkl",
        "decision_tree_churn_model.pkl",
        "random_forest_churn_model.pkl",
        "scaler.pkl",
    ),
)

# Column layout the models were trained on, in training order.
# It should mirror X.columns from the 03_modeling.ipynb notebook.
TRAINING_COLUMNS = [
    # Base columns
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'tenure',
    'PhoneService',
    'PaperlessBilling',
    'MonthlyCharges',
    'TotalCharges',
    # One-hot indicator columns, named <feature>_<category>
    'MultipleLines_No phone service',
    'MultipleLines_Yes',
    'InternetService_Fiber optic',
    'InternetService_No',
    'OnlineSecurity_No internet service',
    'OnlineSecurity_Yes',
    'OnlineBackup_No internet service',
    'OnlineBackup_Yes',
    'DeviceProtection_No internet service',
    'DeviceProtection_Yes',
    'TechSupport_No internet service',
    'TechSupport_Yes',
    'StreamingTV_No internet service',
    'StreamingTV_Yes',
    'StreamingMovies_No internet service',
    'StreamingMovies_Yes',
    'Contract_One year',
    'Contract_Two year',
    'PaymentMethod_Credit card (automatic)',
    'PaymentMethod_Electronic check',
    'PaymentMethod_Mailed check',
]


def predict_churn(customer_data, model_name='Random Forest'):
    """
    Predict churn for a single new customer.

    The input is aligned to ``TRAINING_COLUMNS`` before scoring: keys that are
    not training columns are dropped, and training columns missing from
    ``customer_data`` are filled with 0.

    Args:
        customer_data (dict): Feature values for one customer, keyed by column name.
        model_name (str): The name of the model to use ('Logistic Regression',
            'Decision Tree', 'Random Forest'). Any other value falls back to
            'Random Forest' and emits a ``UserWarning``.

    Returns:
        dict: ``model_used`` (name of the model that actually produced the
        prediction), ``prediction`` ('Churn' or 'No Churn') and
        ``probability_of_churn`` (built-in float rounded to 2 decimals).
    """
    default_model_name = 'Random Forest'
    available_models = {
        'Logistic Regression': log_reg,
        'Decision Tree': dt_model,
        default_model_name: best_rf_model,
    }

    # 1. Resolve the model first so the result reports the model that really ran.
    #    Unknown names keep the historical fallback, but no longer silently.
    if isinstance(model_name, str) and model_name in available_models:
        resolved_name = model_name
    else:
        warnings.warn(
            f"Unknown model_name {model_name!r}; falling back to {default_model_name!r}. "
            f"Valid names: {list(available_models)}",
            UserWarning,
            stacklevel=2,
        )
        resolved_name = default_model_name
    model = available_models[resolved_name]

    # 2. Build a one-row frame and align it to the training layout
    #    (extra keys dropped, absent columns created with 0).
    features = pd.DataFrame([customer_data]).reindex(columns=TRAINING_COLUMNS, fill_value=0)

    # 3. Scale numerical features using the saved scaler.
    #    NOTE(review): assumes the scaler was fitted on exactly these three
    #    columns in this order — confirm against the training notebook.
    numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
    features[numeric_cols] = scaler.transform(features[numeric_cols])

    # 4. Score the single row; column 1 of predict_proba is read as P(churn).
    prediction = model.predict(features)[0]
    probability_of_churn = model.predict_proba(features)[0][1]

    # 5. Return a user-friendly result. float() strips the NumPy scalar type so
    #    the value prints as 0.78 (not np.float64(0.78)) and serialises cleanly.
    return {
        'model_used': resolved_name,
        'prediction': 'Churn' if prediction == 1 else 'No Churn',
        'probability_of_churn': round(float(probability_of_churn), 2),
    }

In [38]:
# Sample input: a new customer who is likely to churn.
customer_to_predict = {
    # Base columns
    'gender': 0,
    'SeniorCitizen': 0,
    'Partner': 0,
    'Dependents': 0,
    'tenure': 5,
    'PhoneService': 1,
    'PaperlessBilling': 1,
    'MonthlyCharges': 80.0,
    'TotalCharges': 400.0,
    # Indicator columns
    'MultipleLines_Yes': 1,
    'InternetService_Fiber optic': 1,
    'OnlineSecurity_No internet service': 0,
    'OnlineSecurity_Yes': 0,
    'OnlineBackup_No internet service': 0,
    'OnlineBackup_Yes': 0,
    'DeviceProtection_No internet service': 0,
    'DeviceProtection_Yes': 0,
    'TechSupport_No internet service': 0,
    'TechSupport_Yes': 0,
    'StreamingTV_No internet service': 0,
    'StreamingTV_Yes': 0,
    'StreamingMovies_No internet service': 0,
    'StreamingMovies_Yes': 0,
    'Contract_One year': 0,
    'Contract_Two year': 0,
    'PaymentMethod_Electronic check': 1,
    # NOTE: these three keys are not listed in TRAINING_COLUMNS, so the
    # reindex step inside predict_churn drops them before scoring.
    'TotalServices': 5,
    'AvgCharges': 16,
    'TotalRevenue': 400,
}

# Score the sample with the tuned Random Forest model and show the result.
prediction = predict_churn(
    customer_to_predict,
    model_name='Random Forest',
)
print(f"Random Forest Prediction: {prediction}")

Random Forest Prediction: {'model_used': 'Random Forest', 'prediction': 'Churn', 'probability_of_churn': np.float64(0.78)}
