In [None]:
# Install the notebook's dependencies into the environment of the running kernel.
import subprocess
import sys

# Packages this notebook imports in the cells below.
REQUIRED_PACKAGES = ["pandas", "numpy", "matplotlib", "seaborn", "scikit-learn", "tensorflow"]

print("Installing required packages...")
# Run pip through the kernel's own interpreter so packages land in the right
# environment. Unlike the `!` shell escape, subprocess.run exposes the exit
# code, so a failed install is not followed by a misleading success message.
pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", *REQUIRED_PACKAGES],
    capture_output=True,
    text=True,
)
if pip_result.returncode != 0:
    # Show pip's own diagnostics in the notebook before stopping.
    print(pip_result.stderr)
    raise RuntimeError(
        f"pip install failed with exit code {pip_result.returncode}; see the output above."
    )
print("✓ All packages installed successfully!")

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Set visual style
# Use seaborn's "whitegrid" style for the plots in this notebook, then print a
# confirmation so it is obvious the import cell ran to the end.
sns.set(style="whitegrid")
print("Libraries imported successfully.")

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Block 2: Load and Clean Data
# Load dataset (Make sure to upload the CSV to Colab first!)
try:
    df_raw = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
except FileNotFoundError as exc:
    # Stop here with a clear message. Printing and carrying on would either hit
    # a NameError on the next statement or silently reuse a stale frame left in
    # the kernel by an earlier run.
    raise FileNotFoundError(
        "Error: Please upload the CSV file and ensure the name is correct."
    ) from exc
print("Dataset loaded.")

# Build the cleaned frame from the raw one without mutating it, so this cell is
# idempotent and `df_raw` stays available for inspection.
df = df_raw.assign(
    # 1. Handle Missing Values in TotalCharges
    # Values that cannot be parsed as numbers (the column is noted to contain
    # blank strings ' ') are coerced to NaN and then filled with 0.
    TotalCharges=pd.to_numeric(df_raw['TotalCharges'], errors='coerce').fillna(0),
    # 3. Convert Target 'Churn' to Binary (0 and 1)
    # 'Yes' -> 1, anything else -> 0 (vectorized instead of a row-wise lambda).
    Churn=df_raw['Churn'].eq('Yes').astype(int),
    # 2. Remove CustomerID (Not useful for prediction)
).drop(columns=['customerID'])

print("Data cleaning complete.")
print(f"Dataset Shape: {df.shape}")
df.head()

In [None]:
# Block 3: Exploratory Data Analysis (EDA)

# 1. Target Distribution (Class Imbalance)
# One bar per Churn value, drawn on an explicitly created Axes.
fig_target, ax_target = plt.subplots(figsize=(6, 4))
sns.countplot(x='Churn', data=df, palette='coolwarm', ax=ax_target)
ax_target.set_title('Class Distribution: Churn vs Non-Churn')
plt.show()

# 2. Numerical Features Distribution
# One stacked histogram per numeric column, split by Churn, side by side.
num_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
fig_num, axes_num = plt.subplots(1, len(num_features), figsize=(15, 5))
for ax_num, col in zip(axes_num, num_features):
    sns.histplot(df, x=col, hue='Churn', multiple="stack", palette='coolwarm', ax=ax_num)
    ax_num.set_title(f'{col} Distribution')
fig_num.tight_layout()
plt.show()

# 3. Categorical Correlations (e.g., Contract Type)
# Customer counts per contract type, split by Churn.
fig_contract, ax_contract = plt.subplots(figsize=(8, 5))
sns.countplot(x='Contract', hue='Churn', data=df, palette='viridis', ax=ax_contract)
ax_contract.set_title('Churn Rate by Contract Type')
plt.show()

print("EDA Insight: 'Month-to-month' contracts have a significantly higher churn rate.")

In [None]:
# Block 4: Preprocessing

# 1. One-Hot Encoding for Categorical Variables
# Text columns become indicator columns; the first level of each is dropped.
df_encoded = pd.get_dummies(df, drop_first=True)

# 2. Split into X (Features) and y (Target)
y = df_encoded['Churn']
X = df_encoded.drop(columns='Churn')

# 3. Train-Test Split (80% Train, 20% Test)
# Stratified on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 4. Feature Scaling (Crucial for Neural Networks)
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test split are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Preprocessing complete.")
print(f"Training features shape: {X_train_scaled.shape}")
print(f"Testing features shape: {X_test_scaled.shape}")

In [None]:
# Block 5: Decision Tree Implementation

# Base estimator whose hyperparameters are tuned below
dt = DecisionTreeClassifier(random_state=42)

# Hyperparameter Tuning using GridSearchCV
# Every combination of these values is scored with 5-fold cross-validation.
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    criterion=['gini', 'entropy'],
)

grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_scaled, y_train)

# Estimator for the best-scoring parameter combination
best_dt = grid_search.best_estimator_

# Hard labels, plus the second predict_proba column (index 1) as the score for ROC analysis
y_pred_dt = best_dt.predict(X_test_scaled)
test_probabilities_dt = best_dt.predict_proba(X_test_scaled)
y_prob_dt = test_probabilities_dt[:, 1]

print(f"Best Parameters for Decision Tree: {grid_search.best_params_}")
print("Decision Tree Training Complete.")

In [None]:
# Block 6: Neural Network Implementation

# Seed the random number generators before building the model, so weight
# initialisation, dropout masks and batch shuffling are repeatable across runs.
keras.utils.set_random_seed(42)

n_features = X_train_scaled.shape[1]

# Build the model (Sequential API)
# The input shape is declared with an explicit Input object rather than the
# `input_shape=` argument on the first Dense layer.
model = keras.Sequential([
    keras.Input(shape=(n_features,)),      # Input: one value per scaled feature
    layers.Dense(32, activation='relu'),   # First hidden layer
    layers.Dropout(0.2),                   # Dropout to prevent overfitting
    layers.Dense(16, activation='relu'),   # Second hidden layer
    layers.Dense(1, activation='sigmoid'), # Output Layer (Sigmoid for binary classification)
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
# A fraction (0.2) of the training data is held out for validation.
history = model.fit(X_train_scaled, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.2,
                    verbose=0) # Set verbose=1 to see training logs

# Predict
# model.predict returns one column per output unit, i.e. shape (n_samples, 1).
# Flatten to 1-D so the scores and labels have the same shape as y_test and as
# the decision-tree outputs they are compared against.
y_prob_nn = model.predict(X_test_scaled, verbose=0).ravel()
y_pred_nn = (y_prob_nn > 0.5).astype("int32")

print("Neural Network Training Complete.")

In [None]:
# Block 7: Evaluation and Comparison

def evaluate_model(y_test, y_pred, y_prob, name):
    """Print a classification summary for one model and plot its confusion matrix.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, compared against ``y_test``.
        y_prob: Scores passed to ``roc_auc_score`` together with ``y_test``.
        name: Model name used in the printed header and in the plot title.

    Returns:
        None. The results are printed and the figure is shown.
    """
    print(f"--- {name} Results ---")
    report = classification_report(y_test, y_pred)
    print(report)
    auc = roc_auc_score(y_test, y_prob)
    print(f"ROC-AUC Score: {auc:.4f}")

    # Confusion matrix rendered as an annotated heatmap on its own Axes
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'{name} Confusion Matrix')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()

# Per-model report: Decision Tree first, then Neural Network
evaluate_model(y_test=y_test, y_pred=y_pred_dt, y_prob=y_prob_dt, name="Decision Tree")
evaluate_model(y_test=y_test, y_pred=y_pred_nn, y_prob=y_prob_nn, name="Neural Network")

# ROC Curve Comparison
# Curve points and AUC for each model, computed before any drawing
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_prob_dt)
fpr_nn, tpr_nn, _ = roc_curve(y_test, y_prob_nn)
auc_dt = roc_auc_score(y_test, y_prob_dt)
auc_nn = roc_auc_score(y_test, y_prob_nn)

# Both curves on one Axes, with the diagonal as the chance-level reference
fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
ax_roc.plot(fpr_dt, tpr_dt, label=f"Decision Tree (AUC = {auc_dt:.2f})")
ax_roc.plot(fpr_nn, tpr_nn, label=f"Neural Network (AUC = {auc_nn:.2f})")
ax_roc.plot([0, 1], [0, 1], 'k--')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve Comparison')
ax_roc.legend()
plt.show()