In [2]:
!pip install pandas scikit-learn numpy joblib




[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os
import shutil

# Define the file path
DATA_PATH = "Telco-Customer-Churn.csv"

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Initial data cleaning (TotalCharges is NOT used but still good practice)
df['TotalCharges'] = df['TotalCharges'].replace(' ', np.nan)
df.dropna(subset=['TotalCharges'], inplace=True)
df['TotalCharges'] = df['TotalCharges'].astype(float)

# ====================================================================
# --- NEW: Define the EXACT six features required ---
# ====================================================================
REQUIRED_FEATURES = [
    'Contract', 
    'tenure', 
    'InternetService', 
    'MonthlyCharges', 
    'OnlineSecurity', 
    'TechSupport'
]
TARGET_COLUMN = 'Churn'

# Separate features (X) using only the required list, and target (y)
X = df[REQUIRED_FEATURES]
y = df[TARGET_COLUMN].apply(lambda x: 1 if x == 'Yes' else 0) # Convert 'Yes'/'No' to 1/0

print(f"Data Loaded. Shape: {df.shape}")
print(f"Features included: {REQUIRED_FEATURES}")

Data Loaded. Shape: (7032, 21)
Features included: ['Contract', 'tenure', 'InternetService', 'MonthlyCharges', 'OnlineSecurity', 'TechSupport']


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os
import shutil

# Define the file path
DATA_PATH = "Telco-Customer-Churn.csv"

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Initial data cleaning (TotalCharges is NOT used but still good practice)
df['TotalCharges'] = df['TotalCharges'].replace(' ', np.nan)
df.dropna(subset=['TotalCharges'], inplace=True)
df['TotalCharges'] = df['TotalCharges'].astype(float)

# ====================================================================
# --- NEW: Define the EXACT six features required ---
# ====================================================================
REQUIRED_FEATURES = [
    'Contract', 
    'tenure', 
    'InternetService', 
    'MonthlyCharges', 
    'OnlineSecurity', 
    'TechSupport'
]
TARGET_COLUMN = 'Churn'

# Separate features (X) using only the required list, and target (y)
X = df[REQUIRED_FEATURES]
y = df[TARGET_COLUMN].apply(lambda x: 1 if x == 'Yes' else 0) # Convert 'Yes'/'No' to 1/0

print(f"Data Loaded. Shape: {df.shape}")
print(f"Features included: {REQUIRED_FEATURES}")

Data Loaded. Shape: (7032, 21)
Features included: ['Contract', 'tenure', 'InternetService', 'MonthlyCharges', 'OnlineSecurity', 'TechSupport']


In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Identify column types based on the 6 features
# 'tenure' and 'MonthlyCharges' are numerical
numerical_features = ['tenure', 'MonthlyCharges']
# The remaining 4 are categorical
categorical_features = ['Contract', 'InternetService', 'OnlineSecurity', 'TechSupport']

# Create Preprocessing Pipeline
preprocessor = ColumnTransformer(
    transformers=[
        # Apply Standardization to numerical features
        ('num', StandardScaler(), numerical_features),
        # Apply One-Hot Encoding to categorical features
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)
    ]
)

print("\nPreprocessing pipeline created using only 6 features.")


Training set size: (4922, 6)
Test set size: (2110, 6)

Preprocessing pipeline created using only 6 features.


In [9]:
# Define the Model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Create the full pipeline: Preprocessing -> Model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier)
])

print("Starting Random Forest Model Training on 6 features...")

# Train the model
model_pipeline.fit(X_train, y_train)

print("Model Training Complete! ")

Starting Random Forest Model Training on 6 features...
Model Training Complete! 


In [10]:
# 1. Make predictions on the test set
y_pred = model_pipeline.predict(X_test)

# 2. Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"\nModel Accuracy (6 features): {accuracy:.4f}")
print("\nClassification Report:")
print(report)

# 3. Save the model 
MODEL_PATH = "sklearn_churn_model_6_features.joblib" # Changed name to reflect 6 features

# Remove existing file if it exists
if os.path.exists(MODEL_PATH):
    os.remove(MODEL_PATH)

# Save the model
joblib.dump(model_pipeline, MODEL_PATH)

print(f"\nModel saved successfully to: {MODEL_PATH}")


Model Accuracy (6 features): 0.7654

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.87      0.84      1549
           1       0.57      0.49      0.53       561

    accuracy                           0.77      2110
   macro avg       0.70      0.68      0.69      2110
weighted avg       0.76      0.77      0.76      2110


Model saved successfully to: sklearn_churn_model_6_features.joblib


In [11]:
import joblib
import pandas as pd
import numpy as np

# Define the path to the saved model
MODEL_PATH = "sklearn_churn_model_6_features.joblib"

# Load the saved model pipeline
try:
    loaded_pipeline = joblib.load(MODEL_PATH)
    print(f"Model loaded successfully from: {MODEL_PATH}")
except FileNotFoundError:
    print(f"Error: Model file not found at {MODEL_PATH}. Run Cell 4 first!")
    raise

# --------------------------------------------------------------------------
# 1. Define the NEW CUSTOMER's data (must only contain the 6 features)
# --------------------------------------------------------------------------

new_customer_data = {
    'Contract': ['Month-to-month'], # High risk
    'tenure': [48],                  # Mid-range tenure
    'InternetService': ['Fiber optic'], # High risk
    'MonthlyCharges': [95.50],        # High
    'OnlineSecurity': ['No'],         # High risk
    'TechSupport': ['No']             # High risk
}

# Convert the dictionary into a Pandas DataFrame
new_customer_df = pd.DataFrame(new_customer_data)

print("\nNew Customer Data:")
print(new_customer_df)

# --------------------------------------------------------------------------
# 2. Make the Prediction
# --------------------------------------------------------------------------

prediction = loaded_pipeline.predict(new_customer_df)
prediction_proba = loaded_pipeline.predict_proba(new_customer_df)

# --------------------------------------------------------------------------
# 3. Interpret and Display the Result
# --------------------------------------------------------------------------

churn_status = "WILL CHURN (Prediction = 1)" if prediction[0] == 1 else "WILL NOT CHURN (Prediction = 0)"
churn_probability = prediction_proba[0][1] * 100

print("-" * 50)
print(f"Prediction: {churn_status}")
print(f"Probability of Churn (P=1): {churn_probability:.2f}%")
print("-" * 50)

Model loaded successfully from: sklearn_churn_model_6_features.joblib

New Customer Data:
         Contract  tenure InternetService  MonthlyCharges OnlineSecurity  \
0  Month-to-month      48     Fiber optic            95.5             No   

  TechSupport  
0          No  
--------------------------------------------------
Prediction: WILL NOT CHURN (Prediction = 0)
Probability of Churn (P=1): 49.00%
--------------------------------------------------


In [2]:
import joblib
import pandas as pd
import numpy as np

# Define the path to the saved model
MODEL_PATH = "sklearn_churn_model_6_features.joblib"

# Load the saved model pipeline
try:
    loaded_pipeline = joblib.load(MODEL_PATH)
    print(f"Model loaded successfully from: {MODEL_PATH}")
except FileNotFoundError:
    print(f"Error: Model file not found at {MODEL_PATH}. Run Cell 4 first!")
    raise

# --------------------------------------------------------------------------
# 1. Define the HIGH-RISK CUSTOMER's data
# --------------------------------------------------------------------------

high_risk_customer_data = {
    'Contract': ['Month-to-month'], # Highest risk
    'tenure': [2],                   # Low tenure = high risk
    'InternetService': ['Fiber optic'], # High cost/risk service
    'MonthlyCharges': [105.99],       # High charges
    'OnlineSecurity': ['No'],         # Lacking security
    'TechSupport': ['No']             # Lacking tech support
}

# Convert the dictionary into a Pandas DataFrame
new_customer_df = pd.DataFrame(high_risk_customer_data)

print("\nNew Customer Data (High Risk Profile):")
print(new_customer_df)

# --------------------------------------------------------------------------
# 2. Make the Prediction
# --------------------------------------------------------------------------

prediction = loaded_pipeline.predict(new_customer_df)
prediction_proba = loaded_pipeline.predict_proba(new_customer_df)

# --------------------------------------------------------------------------
# 3. Interpret and Display the Result
# --------------------------------------------------------------------------

churn_status = "WILL CHURN (Prediction = 1)" if prediction[0] == 1 else "WILL NOT CHURN (Prediction = 0)"
churn_probability = prediction_proba[0][1] * 100

print("-" * 50)
print(f"Prediction: {churn_status}")
print(f"Probability of Churn (P=1): {churn_probability:.2f}%")
print("-" * 50)

Model loaded successfully from: sklearn_churn_model_6_features.joblib

New Customer Data (High Risk Profile):
         Contract  tenure InternetService  MonthlyCharges OnlineSecurity  \
0  Month-to-month       2     Fiber optic          105.99             No   

  TechSupport  
0          No  
--------------------------------------------------
Prediction: WILL CHURN (Prediction = 1)
Probability of Churn (P=1): 81.00%
--------------------------------------------------
