In [None]:
import pandas as pd
# Load the Telco customer-churn CSV (path is relative to the notebook's working directory).
raw_df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [None]:
# Inspect column dtypes and non-null counts of the freshly loaded data.
raw_df.info()

In [None]:
# Parse TotalCharges as numbers; errors='coerce' turns entries that cannot be
# parsed into NaN instead of raising.
total_charges_numeric = pd.to_numeric(raw_df['TotalCharges'], errors='coerce')
raw_df['TotalCharges'] = total_charges_numeric

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing, stratified on the target.
train_val_df, test_df = train_test_split(
    raw_df,
    test_size=0.2,
    random_state=42,
    stratify=raw_df['Churn'],
)
# 25% of the remaining 80% is 20% of the full data, giving a 60/20/20 split.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.25,
    random_state=42,
    stratify=train_val_df['Churn'],
)

In [None]:
# Inspect column dtypes and non-null counts of the training split.
train_df.info()

In [None]:
target_column = 'Churn'
# Every column except the first and the last is used as a model input.
# NOTE(review): presumably the first column is a row identifier and the last is
# the target — confirm against the .info() output above.
input_column = train_df.columns[1:-1].tolist()

In [None]:
# Separate features from the target for each split.
# The inputs are explicit copies: later cells assign imputed, scaled and encoded
# columns into these frames, and writing into a slice of another DataFrame
# triggers pandas' SettingWithCopyWarning.
train_input = train_df[input_column].copy()
val_input = val_df[input_column].copy()
test_input = test_df[input_column].copy()

# The targets are only read, never modified.
train_target = train_df[target_column]
val_target = val_df[target_column]
test_target = test_df[target_column]

In [None]:
import numpy as np

In [None]:
# Partition the input columns by dtype: numeric columns vs. object (string) columns.
numerical_cols = list(train_input.select_dtypes(include=np.number).columns)
categorical_cols = list(train_input.select_dtypes(include='object').columns)
print(numerical_cols, categorical_cols, sep='\n')

In [None]:
# Preview only the first rows of the categorical inputs rather than the whole frame.
train_input[categorical_cols].head()

In [None]:
from sklearn.impute import SimpleImputer
# Fit both imputers on the training split only: column mean for numeric
# features, most frequent value for categorical features.
imputer_num = SimpleImputer(strategy='mean').fit(train_df[numerical_cols])
imputer_cat = SimpleImputer(strategy='most_frequent').fit(train_df[categorical_cols])

# Apply the fitted imputers to every split.
for split_input in (train_input, val_input, test_input):
    split_input[numerical_cols] = imputer_num.transform(split_input[numerical_cols])
    split_input[categorical_cols] = imputer_cat.transform(split_input[categorical_cols])

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training inputs only, so that min/max statistics from
# validation and test rows do not leak into preprocessing. Validation/test values
# outside the training range will therefore scale slightly outside [0, 1].
scaler = MinMaxScaler().fit(train_input[numerical_cols])

# Apply the training-fitted scaling to every split.
for split_input in (train_input, val_input, test_input):
    split_input[numerical_cols] = scaler.transform(split_input[numerical_cols])

In [None]:
# Summary statistics of the training inputs after imputation and scaling.
train_input.describe()

In [None]:
# Encode categorical data
from sklearn.preprocessing import OneHotEncoder
# Learn the categories from the (already imputed) training inputs only.
# handle_unknown='ignore' encodes any category seen only in validation/test data
# as all zeros, so the encoder does not need to see those rows when fitting.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(train_input[categorical_cols])

# Names of the generated one-hot columns, derived from the categorical column names.
encoded_cols = encoder.get_feature_names_out(categorical_cols)

# Append the one-hot columns to every split.
for split_input in (train_input, val_input, test_input):
    split_input[encoded_cols] = encoder.transform(split_input[categorical_cols])

In [None]:
# Final model features: scaled numeric columns followed by the one-hot columns.
feature_cols = numerical_cols + list(encoded_cols)
X_train, X_val, X_test = (
    split_input[feature_cols]
    for split_input in (train_input, val_input, test_input)
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest; random_state is fixed so reruns give the same model.
base_model = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    max_depth=7,
    n_estimators=300,
    max_features='log2',
    min_samples_split=30,
    min_samples_leaf=1,
)

In [None]:
# Train the baseline model on the training split.
base_model.fit(X=X_train, y=train_target)

In [None]:
def base_acc():
    """Score ``base_model`` on the training and validation splits.

    Returns:
        tuple: (training score, validation score) as returned by
        ``base_model.score``.
    """
    train_score = base_model.score(X_train, train_target)
    val_score = base_model.score(X_val, val_target)
    return train_score, val_score

In [None]:
def max_depth_error(md):
    """Fit a random forest with ``max_depth=md`` and report its error rates.

    Args:
        md: Value passed as ``max_depth`` to ``RandomForestClassifier``.

    Returns:
        dict: ``'Max Depth'`` plus the training and validation error, each
        computed as ``1 - score`` on the respective split.
    """
    forest = RandomForestClassifier(max_depth=md, random_state=42)
    forest.fit(X_train, train_target)
    train_error = 1 - forest.score(X_train, train_target)
    val_error = 1 - forest.score(X_val, val_target)
    return {
        'Max Depth': md,
        'Training Error': train_error,
        'Validation Error': val_error,
    }
# Sweep max_depth over 1..20 and tabulate the training/validation error per depth.
errors_df = pd.DataFrame(list(map(max_depth_error, range(1, 21))))
errors_df

In [None]:
def test_params(**params):
    model = RandomForestClassifier(random_state=42, n_jobs=-1, **params).fit(X_train, train_target)
    return base_acc(),model.score(X_train, train_target), model.score(X_val, val_target)

In [207]:
# Same hyperparameters as base_model, so the candidate's scores should match the baseline's.
test_params(
    max_depth=7,
    n_estimators=300,
    max_features='log2',
    min_samples_split=30,
    min_samples_leaf=1,
)

((0.8191715976331361, 0.8034066713981547),
 0.8191715976331361,
 0.8034066713981547)

In [None]:
# Look at one preprocessed validation row to see the final feature layout.
X_val.iloc[0]

In [None]:
def predict_input(single_input):
    """Predict the target class for one raw customer record with ``base_model``.

    The record goes through the same preprocessing steps as the training data:
    numeric imputation, categorical imputation, scaling and one-hot encoding.

    Args:
        single_input (dict): Maps each raw input column name (every entry of
            ``numerical_cols`` and ``categorical_cols``) to its value.

    Returns:
        tuple: ``(pred, prob)`` where ``pred`` is the class with the highest
        predicted probability and ``prob`` is that probability.
    """
    input_df = pd.DataFrame([single_input])

    # Mirror the training pipeline, including the categorical imputer that was
    # fitted alongside the numeric one.
    input_df[numerical_cols] = imputer_num.transform(input_df[numerical_cols])
    input_df[categorical_cols] = imputer_cat.transform(input_df[categorical_cols])
    input_df[numerical_cols] = scaler.transform(input_df[numerical_cols])
    input_df[encoded_cols] = encoder.transform(input_df[categorical_cols])

    feature_cols = numerical_cols + list(encoded_cols)
    X_input = input_df[feature_cols]

    # Run the forest once: the predicted class is the arg-max of the class
    # probabilities, so a separate predict() call is not needed.
    class_probs = base_model.predict_proba(X_input)[0]
    best_idx = class_probs.argmax()
    pred = base_model.classes_[best_idx]
    prob = class_probs[best_idx]
    return pred, prob

In [204]:
# Hand-written example customer: one value for each raw input column read by predict_input.
single_customer = {
    'SeniorCitizen': 0,
    'tenure': 24,
    'MonthlyCharges': 65.5,
    'TotalCharges': 1572.0,
    'gender': 'Male',
    'Partner': 'Yes',
    'Dependents': 'No',
    'PhoneService': 'Yes',
    'MultipleLines': 'No',
    'InternetService': 'Fiber optic',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'Yes',
    'StreamingMovies': 'No',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check'
}

# Run the record through preprocessing and the baseline model.
prediction, probability = predict_input(single_customer)

# Human-readable labels used in the report below.
senior_label = 'Yes' if single_customer['SeniorCitizen'] == 1 else 'No'
churn_label = 'Yes - Customer likely to churn' if prediction == 'Yes' else 'No - Customer likely to stay'

report_lines = [
    f"Prediction: {prediction}",
    f"Probability: {probability:.4f}",
    "\nCustomer Profile:",
    f"  - Senior Citizen: {senior_label}",
    f"  - Tenure: {single_customer['tenure']} months",
    f"  - Monthly Charges: ${single_customer['MonthlyCharges']:.2f}",
    f"  - Total Charges: ${single_customer['TotalCharges']:.2f}",
    f"  - Internet Service: {single_customer['InternetService']}",
    f"  - Contract Type: {single_customer['Contract']}",
    f"\nChurn Prediction: {churn_label}",
    f"Confidence: {probability*100:.2f}%",
]
print("\n".join(report_lines))

Prediction: No
Probability: 0.5757

Customer Profile:
  - Senior Citizen: No
  - Tenure: 24 months
  - Monthly Charges: $65.50
  - Total Charges: $1572.00
  - Internet Service: Fiber optic
  - Contract Type: Month-to-month

Churn Prediction: No - Customer likely to stay
Confidence: 57.57%


In [205]:
import joblib
# Bundle the fitted model with every preprocessing artifact and column list
# needed to score new records.
model_package = dict(
    model=base_model,
    imputer_num=imputer_num,
    imputer_cat=imputer_cat,
    scaler=scaler,
    encoder=encoder,
    numerical_cols=numerical_cols,
    categorical_cols=categorical_cols,
    encoded_cols=list(encoded_cols),
)

# Persist the whole bundle as a single file; the cell output lists the written path.
joblib.dump(model_package, 'customer_churn_prediction.joblib')

['customer_churn_prediction.joblib']

In [206]:
import joblib
import pandas as pd

# Load the bundle saved by the previous cell.
# NOTE: joblib files are pickle-based — only load files from a trusted source.
model_package = joblib.load('customer_churn_prediction.joblib')

# Unpack the bundle into the module-level names that predict_churn reads.
(
    model,
    imputer_num,
    imputer_cat,
    scaler,
    encoder,
    numerical_cols,
    categorical_cols,
    encoded_cols,
) = (
    model_package[key]
    for key in (
        'model',
        'imputer_num',
        'imputer_cat',
        'scaler',
        'encoder',
        'numerical_cols',
        'categorical_cols',
        'encoded_cols',
    )
)

def predict_churn(customer_data):
    """
    Predict churn for a single customer using the loaded model bundle.

    The record goes through the same preprocessing steps as the training data:
    numeric imputation, categorical imputation, scaling and one-hot encoding.

    Args:
        customer_data (dict): Maps each raw input column name (every entry of
            ``numerical_cols`` and ``categorical_cols``) to its value.

    Returns:
        tuple: (prediction, probability) where ``prediction`` is the class with
        the highest predicted probability and ``probability`` is that value.
    """
    # Convert to a one-row DataFrame
    input_df = pd.DataFrame([customer_data])

    # Impute both column groups, mirroring the training pipeline (the bundle
    # ships a categorical imputer as well as a numeric one).
    input_df[numerical_cols] = imputer_num.transform(input_df[numerical_cols])
    input_df[categorical_cols] = imputer_cat.transform(input_df[categorical_cols])

    # Scale numeric columns and one-hot encode categorical columns
    input_df[numerical_cols] = scaler.transform(input_df[numerical_cols])
    input_df[encoded_cols] = encoder.transform(input_df[categorical_cols])

    # Prepare features in the column order used for training
    feature_cols = numerical_cols + list(encoded_cols)
    X_input = input_df[feature_cols]

    # Run the forest once: the predicted class is the arg-max of the class
    # probabilities, so a separate predict() call is not needed.
    class_probs = model.predict_proba(X_input)[0]
    best_idx = class_probs.argmax()
    pred = model.classes_[best_idx]
    prob = class_probs[best_idx]

    return pred, prob

# ===== USAGE EXAMPLE =====

# Example customer record: one value for each raw input column read by predict_churn.
customer = {
    'SeniorCitizen': 0,
    'tenure': 24,
    'MonthlyCharges': 65.5,
    'TotalCharges': 1572.0,
    'gender': 'Male',
    'Partner': 'Yes',
    'Dependents': 'No',
    'PhoneService': 'Yes',
    'MultipleLines': 'No',
    'InternetService': 'Fiber optic',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'Yes',
    'StreamingMovies': 'No',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check'
}

# Score the record with the reloaded pipeline.
prediction, confidence = predict_churn(customer)

# Print a small framed report.
banner = '=' * 50
print(f"\n{banner}")
print("CHURN PREDICTION RESULT")
print(banner)
print(f"Prediction: {prediction}")
print(f"Confidence: {confidence*100:.2f}%")
print(banner)

risk_message = (
    "⚠️  HIGH RISK - Customer is likely to churn"
    if prediction == 'Yes'
    else "✓ LOW RISK - Customer is likely to stay"
)
print(risk_message)


CHURN PREDICTION RESULT
Prediction: No
Confidence: 57.57%
✓ LOW RISK - Customer is likely to stay
