<a href="https://colab.research.google.com/github/Burka-Developer/Machine-Learning/blob/main/Python_Code_for_Customer_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd # For data manipulation and analysis
from sklearn.model_selection import train_test_split # For splitting data into training and testing sets
from sklearn.linear_model import LogisticRegression # Our chosen ML model
from sklearn.preprocessing import StandardScaler, OneHotEncoder # For data preprocessing
from sklearn.compose import ColumnTransformer # For applying different transformations to different columns
from sklearn.pipeline import Pipeline # For chaining preprocessing and modeling steps
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # For evaluating model performance
import numpy as np # For numerical operations, especially for handling missing values

print("Libraries imported successfully!")

# --- 1. Load the Dataset ---
# We'll simulate loading a dataset. In a real scenario, you'd load from a CSV:
# df = pd.read_csv('Telco-Customer-Churn.csv')
# For demonstration, let's create a dummy dataset that mimics the structure of the Telco churn data.
# In a real scenario, you would download the 'Telco-Customer-Churn.csv' file and place it
# in the same directory as your Python script, then uncomment the line above.

# If you don't have the CSV, you can download it from Kaggle:
# https://www.kaggle.com/datasets/blastchar/telco-customer-churn/download?datasetVersionNumber=1

# For now, let's create a simplified dummy DataFrame for demonstration purposes.
# This will allow the code to run even without the actual CSV.
data = {
    'gender': ['Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male'],
    'SeniorCitizen': [0, 0, 1, 0, 0, 1, 0, 0, 1, 0],
    'Partner': ['Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No'],
    'Dependents': ['No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No'],
    'tenure': [1, 34, 2, 45, 2, 8, 22, 10, 28, 62],
    'PhoneService': ['No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes'],
    'MultipleLines': ['No phone service', 'No', 'No', 'No phone service', 'No', 'Yes', 'Yes', 'No phone service', 'No', 'No'],
    'InternetService': ['DSL', 'DSL', 'DSL', 'DSL', 'Fiber optic', 'Fiber optic', 'Fiber optic', 'DSL', 'Fiber optic', 'DSL'],
    'OnlineSecurity': ['No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'Yes'],
    'OnlineBackup': ['Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes'],
    'DeviceProtection': ['No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'No'],
    'TechSupport': ['No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'Yes'],
    'StreamingTV': ['No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes'],
    'StreamingMovies': ['No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes'],
    'Contract': ['Month-to-month', 'One year', 'Month-to-month', 'One year', 'Month-to-month', 'Month-to-month', 'Month-to-month', 'Month-to-month', 'One year', 'Two year'],
    'PaperlessBilling': ['Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No'],
    'PaymentMethod': ['Electronic check', 'Mailed check', 'Mailed check', 'Bank transfer (automatic)', 'Electronic check', 'Electronic check', 'Bank transfer (automatic)', 'Mailed check', 'Electronic check', 'Bank transfer (automatic)'],
    'MonthlyCharges': [29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.75, 104.80, 56.05],
    'TotalCharges': ['29.85', '1889.5', '108.15', '1840.75', '151.65', '820.5', '1949.4', '301.9', '3006.7', '3487.95'],
    'Churn': ['No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No']
}
df = pd.DataFrame(data)

print("\n--- Original Data Head ---")
print(df.head())
print("\n--- Data Info ---")
df.info()

# --- 2. Data Preprocessing ---

# Convert 'TotalCharges' to numeric. It's often loaded as object/string because of empty strings.
# Errors='coerce' will turn non-convertible values into NaN (Not a Number)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Handle missing values in 'TotalCharges' by filling with the median
# The median is often preferred over the mean for skewed data or when outliers are present.
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
print(f"\nMissing values in 'TotalCharges' after filling: {df['TotalCharges'].isnull().sum()}")

# Separate features (X) and target (y)
# X contains all columns except 'Churn'
X = df.drop('Churn', axis=1)
# y is the 'Churn' column, converted to binary (0 for No, 1 for Yes)
y = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

# Identify categorical and numerical features
# 'SeniorCitizen' is numerical (0 or 1) but often treated as categorical for encoding
# 'tenure', 'MonthlyCharges', 'TotalCharges' are numerical
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
# All other columns are considered categorical for one-hot encoding
categorical_features = X.columns.drop(numerical_features).tolist()

print(f"\nNumerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")

# Create preprocessing pipelines for numerical and categorical features
# Numerical pipeline: Just scale the features
numerical_transformer = StandardScaler()
# Categorical pipeline: One-hot encode the features
categorical_transformer = OneHotEncoder(handle_unknown='ignore') # 'ignore' handles categories not seen during training

# Create a preprocessor using ColumnTransformer
# This allows us to apply different transformations to different columns in parallel
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

print("\nData preprocessing setup complete!")

# --- 3. Split Data into Training and Testing Sets ---
# We split the data to evaluate the model's performance on unseen data.
# test_size=0.20 means 20% of the data will be used for testing, 80% for training.
# random_state ensures reproducibility of the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

print(f"\nData split into training and testing sets:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"Churn distribution in training set:\n{y_train.value_counts(normalize=True)}")
print(f"Churn distribution in test set:\n{y_test.value_counts(normalize=True)}")


# --- 4. Create and Train the Machine Learning Pipeline ---
# A pipeline chains together multiple processing steps.
# Here, it first applies the preprocessor (scaling and encoding)
# then trains the Logistic Regression model.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, solver='liblinear')) # liblinear is good for small datasets
])

print("\nTraining the Logistic Regression model...")
model_pipeline.fit(X_train, y_train)
print("Model training complete!")

# --- 5. Model Evaluation (Accuracy First Priority!) ---
# Make predictions on the test set
y_pred = model_pipeline.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\n--- Model Evaluation ---")
print(f"Accuracy: {accuracy:.4f}") # Display accuracy with 4 decimal places

# Display Confusion Matrix
# This helps us understand where the model made mistakes.
# Rows are actual classes, columns are predicted classes.
# [[True Negatives (TN), False Positives (FP)],
#  [False Negatives (FN), True Positives (TP)]]
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# Display Classification Report
# Provides Precision, Recall, F1-score for each class (0: No Churn, 1: Churn)
# Precision: Of all predicted 'Yes' (churn), how many were actually 'Yes'?
# Recall: Of all actual 'Yes' (churn), how many did the model correctly identify?
# F1-score: Harmonic mean of Precision and Recall.
class_report = classification_report(y_test, y_pred, target_names=['No Churn', 'Churn'])
print("\nClassification Report:")
print(class_report)

# --- 6. Make Predictions on New Data ---
# Let's simulate a new customer's data to see how our model predicts.
# This customer is male, senior, no partner/dependents, 1 month tenure, fiber optic, high monthly charges, etc.
new_customer_data = pd.DataFrame([{
    'gender': 'Male',
    'SeniorCitizen': 1,
    'Partner': 'No',
    'Dependents': 'No',
    'tenure': 1,
    'PhoneService': 'Yes',
    'MultipleLines': 'No',
    'InternetService': 'Fiber optic',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'No',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'No',
    'StreamingMovies': 'No',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 80.00,
    'TotalCharges': 80.00
}])

print("\n--- Predicting for a New Customer ---")
print("New Customer Data:")
print(new_customer_data)

# Predict churn probability for the new customer
# predict_proba returns the probability of each class (0: No Churn, 1: Churn)
churn_probability = model_pipeline.predict_proba(new_customer_data)[:, 1][0]
print(f"\nProbability of Churn for New Customer: {churn_probability:.4f}")

# Predict the class (0 or 1) for the new customer
predicted_churn_class = model_pipeline.predict(new_customer_data)[0]
predicted_churn_label = "Yes (Churn)" if predicted_churn_class == 1 else "No (No Churn)"
print(f"Predicted Churn Status for New Customer: {predicted_churn_label}")

print("\n--- Model Building Complete ---")
print("This model can now automatically predict customer churn, saving businesses time and helping them retain customers proactively.")

Libraries imported successfully!

--- Original Data Head ---
   gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female              0     Yes         No       1           No   
1    Male              0      No         No      34          Yes   
2  Female              1      No         No       2          Yes   
3    Male              0      No         No      45           No   
4  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   
3  No phone service             DSL            Yes           No   
4                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No          No       

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
