# WEEKS 1–8

In [None]:
# ===========================
# Top Cell: Import Libraries
# ===========================

# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

# Scikit-learn modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Metrics
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    auc
)

# ===========================
# Load Dataset
# ===========================
# Relative path: resolved against the current working directory.
DATA_PATH = 'telecom_customer_churn_cleaned.csv'
df = pd.read_csv(DATA_PATH)


In [None]:
# View summary.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.head())

# Drop irrelevant columns (example: ID, customer name, etc. — adjust as needed)
irrelevant_cols = ['Customer ID', 'Name', 'Unnamed: 0']  # change according to your dataset
# errors='ignore' already skips names that are not present in the frame.
df = df.drop(columns=irrelevant_cols, errors='ignore')

# Remove duplicate rows
df = df.drop_duplicates()

# Handle NaN values
# Step 1: drop rows with more than 2 missing values
# (thresh = minimum number of non-NaN values a row needs to be kept).
df = df.dropna(thresh=len(df.columns) - 2)

# Step 2: fill whatever is still missing.
df = df.fillna(df.median(numeric_only=True))  # numeric columns -> column median
column_modes = df.mode()
if not column_modes.empty:  # mode() has no rows for an empty frame; .iloc[0] would raise
    df = df.fillna(column_modes.iloc[0])  # remaining (categorical) columns -> most frequent value

# Verify cleaning
print("Remaining NaN values per column:\n", df.isna().sum())



# Week 5: Supervised Learning – Regression

In [None]:
# Week 5 regression: fit a linear model that predicts a charges column from
# the other numeric columns of `df`, then report MAE / RMSE on a 20% hold-out.

# Show columns for reference
print("Columns in dataset:\n", df.columns.tolist())

# Regression uses numeric columns only; rows that still contain a NaN are dropped.
df_numeric = df.select_dtypes(include=[np.number]).dropna()

# Pick the first candidate name that exists among the numeric columns.
target_candidates = ['Monthly Charges', 'MonthlyCharges', 'Total Charges', 'TotalCharges', 'Monthly_Fee']
target_col = next((col for col in target_candidates if col in df_numeric.columns), None)

if target_col is None:
    raise KeyError("⚠️ Could not find a numeric target column (e.g., Monthly Charges or Total Charges). "
                   "Please check your dataset column names.")

print(f"\n✅ Using '{target_col}' as the target variable.\n")

# Define X (features) and y (target)
# NOTE(review): every other numeric column is used as a feature, including any
# other charges column from the candidate list — confirm that is intended.
X = df_numeric.drop(columns=[target_col])
y = df_numeric[target_col]

# Split data (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict
y_pred = lr.predict(X_test)

# Evaluate with MAE and RMSE (RMSE = square root of the mean squared error)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

# Optional: compare actual vs predicted
comparison = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred})
print("\nSample predictions:\n", comparison.head())


# Week 6: Supervised Learning – Classification

In [None]:
# Week 6 classification: encode the churn/status target, one-hot encode the
# features, and compare Logistic Regression against Random Forest on a 20% hold-out.

# Local import: only this cell needs it.
from sklearn.pipeline import make_pipeline

# Drop irrelevant columns (if they exist); errors='ignore' skips absent names.
irrelevant_cols = ['Customer ID', 'Name', 'Unnamed: 0']
df = df.drop(columns=irrelevant_cols, errors='ignore')

# Identify target column automatically: first candidate present in df.
target_candidates = ['Customer Status', 'Churn', 'Exited', 'Target']
target_col = next((col for col in target_candidates if col in df.columns), None)

if target_col is None:
    raise KeyError("⚠️ Could not find churn/target column. Please verify your dataset.")

print(f"✅ Using '{target_col}' as target column.\n")

# Encode target variable as integers. LabelEncoder numbers the classes in
# sorted label order (le.classes_), so the codes are NOT assigned by meaning;
# print the mapping so "class 0/1/..." can be interpreted later.
le = LabelEncoder()
df[target_col] = le.fit_transform(df[target_col])
print("Class encoding:", {str(label): code for code, label in enumerate(le.classes_)})

# Convert categorical columns into numeric using one-hot encoding.
# A 'Cluster' column (added to df by the clustering cell) is excluded so that
# re-running this cell afterwards does not feed cluster labels in as a feature.
X = df.drop(columns=[target_col, 'Cluster'], errors='ignore')
X = pd.get_dummies(X, drop_first=True)
y = df[target_col]

# Handle missing values: median for numeric columns, most frequent value otherwise.
X = X.fillna(X.median(numeric_only=True))
feature_modes = X.mode()
if not feature_modes.empty:  # mode() has no rows for an empty frame
    X = X.fillna(feature_modes.iloc[0])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression.
# The 'saga' solver only converges quickly when features share a similar scale,
# so the model is wrapped in a pipeline that standardizes first. The pipeline
# still accepts the raw (unscaled) feature frame in fit/predict.
log_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='saga', max_iter=1000),
)
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

# Random Forest (tree splits are scale-invariant, so no scaling is applied)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Evaluate accuracies
log_acc = accuracy_score(y_test, log_pred)
rf_acc = accuracy_score(y_test, rf_pred)

print(f"Logistic Regression Accuracy: {log_acc:.3f}")
print(f"Random Forest Accuracy: {rf_acc:.3f}")

# Ties go to Logistic Regression (strict '>' comparison).
best_model = "Random Forest" if rf_acc > log_acc else "Logistic Regression"
print(f"\n🏆 Best Model: {best_model}")


# **Week 7: Model Evaluation**

In [None]:

# Week 7 evaluation: classification report, confusion matrix and one-vs-rest
# ROC curves for the Random Forest trained in Week 6.
# (rf_model, X_test, y_test must already be defined)

# === Classification Report ===
print("=== Classification Report ===")
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

# === Confusion Matrix ===
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)

# === ROC & AUC (one-vs-rest; works for binary and multiclass targets) ===
# predict_proba columns follow rf_model.classes_, so the class list is taken
# from the model rather than from y_test (which may be missing a class).
classes = np.asarray(rf_model.classes_)
n_classes = len(classes)

# Get prediction probabilities
y_prob = rf_model.predict_proba(X_test)

# One-vs-rest indicator matrix with one column per class.
# Built by direct comparison because label_binarize collapses a two-class
# target to a single column, which would break per-class column indexing.
y_true = np.asarray(y_test)
y_test_bin = (y_true[:, None] == classes[None, :]).astype(int)

# Compute ROC and AUC for each class
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    n_positive = int(y_test_bin[:, i].sum())
    if n_positive == 0 or n_positive == len(y_true):
        # ROC is undefined without both positive and negative samples.
        continue
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC Curves (colour stays tied to the class index even if a class is skipped)
colors = cycle(['blue', 'green', 'red', 'orange', 'purple'])
plt.figure(figsize=(7, 6))
for i, color in zip(range(n_classes), colors):
    if i not in roc_auc:
        continue
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label=f"Class {classes[i]} (AUC = {roc_auc[i]:.2f})")

# Baseline line (random classifier)
plt.plot([0, 1], [0, 1], 'k--', lw=1)

# Graph details
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass ROC Curve – Random Forest')
plt.legend(loc='lower right')
plt.show()

# === Reflection ===
print("\n💬 Reflection:")
print("For churn prediction, Recall is the most important metric — we want to identify as many customers likely to churn as possible, even if that means a few false positives.")


# Week 8: Unsupervised Learning – Clustering

In [None]:
# Week 8 clustering: K-Means (k=3) on the standardized numeric columns of df,
# visualized in 2-D with PCA. Adds a 'Cluster' column to df.

# Drop irrelevant columns (optional); errors='ignore' skips absent names.
irrelevant_cols = ['Customer ID', 'Name', 'Unnamed: 0']
df = df.drop(columns=irrelevant_cols, errors='ignore')

# Select numeric columns.
# 'Cluster' is excluded so that re-running this cell does not feed the labels
# from the previous run back in as a feature.
# NOTE(review): if the classification cell has run, the label-encoded target
# column is numeric and therefore part of the clustering features — confirm
# whether that is intended.
df_num = df.select_dtypes(include=['number']).drop(columns=['Cluster'], errors='ignore')

# Fill missing numeric values (so row count stays the same)
df_num = df_num.fillna(df_num.median(numeric_only=True))

# Standardize numeric features (K-Means is distance-based)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_num)

# Apply K-Means (choose 3 clusters)
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)

# Add cluster labels back to the dataframe (row order is unchanged, so labels align)
df['Cluster'] = clusters

# Apply PCA for 2D visualization
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X_scaled)

# Plot
plt.figure(figsize=(8, 6))
plt.scatter(pca_result[:, 0], pca_result[:, 1], c=clusters, cmap='viridis')
plt.title('K-Means Clustering Visualization (PCA 2D)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(label='Cluster')
plt.show()

# Show sample results: number of rows per cluster
print("✅ Clustering complete!")
print(df[['Cluster']].value_counts().sort_index())


# Week 9: Neural Networks – ANN Baseline

In [None]:
# Assignment 9: Build a simple ANN (Artificial Neural Network)
# Using Keras to build an ANN on the classification dataset.
# Requires X_train, X_test, y_train, y_test and rf_acc from Week 6.

# Only the imports are guarded: an ImportError raised later, during training,
# must not be misreported as "TensorFlow not installed".
try:
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout, Input
    from tensorflow.keras.optimizers import Adam
except ImportError:
    print("⚠️ TensorFlow not installed. Skipping ANN (optional for course completion).")
    print("To install: pip install tensorflow")
else:
    # Neural networks train poorly on unscaled inputs, and Keras needs one
    # homogeneous float array (a frame mixing bool dummy columns with floats
    # does not convert cleanly). Standardize using training statistics only.
    ann_scaler = StandardScaler()
    X_train_ann = ann_scaler.fit_transform(X_train).astype('float32')
    X_test_ann = ann_scaler.transform(X_test).astype('float32')
    y_train_ann = np.asarray(y_train)
    y_test_ann = np.asarray(y_test)

    # sparse_categorical_crossentropy needs an output unit for every integer
    # label up to the maximum, including a class that appears only in the test split.
    n_outputs = int(max(y_train_ann.max(), y_test_ann.max())) + 1

    ann = Sequential([
        Input(shape=(X_train_ann.shape[1],)),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(n_outputs, activation='softmax')
    ])

    ann.compile(optimizer=Adam(learning_rate=0.001),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

    # Train (20% of the training data is held out for validation)
    ann.fit(X_train_ann, y_train_ann, epochs=10, batch_size=32,
            validation_split=0.2, verbose=0)

    # Evaluate
    ann_loss, ann_acc = ann.evaluate(X_test_ann, y_test_ann, verbose=0)
    print(f"✅ ANN Accuracy: {ann_acc:.4f}")
    print(f"Random Forest Accuracy (Week 6): {rf_acc:.4f}")

    # Report the actual direction of the difference instead of calling a
    # worse result "comparable".
    if ann_acc > rf_acc:
        verdict = 'better than'
    elif ann_acc < rf_acc:
        verdict = 'worse than'
    else:
        verdict = 'the same as'
    print(f"Comparison: ANN performs {verdict} Random Forest")


# Week 10: Advanced Deep Learning – CNN/RNN (Justified Exclusion)

**Why not CNN/RNN?**
- CNNs (Convolutional Neural Networks) are for image data → Our dataset is **tabular** (rows/columns), not images.
- RNNs (Recurrent Neural Networks) are for sequential/time-series data → Our dataset has **no time dependency**.
- For tabular data, **dense/fully-connected ANN** (Week 9) is appropriate.

**Conclusion**: CNN/RNN not applicable to telecom customer churn dataset.


# Week 11: Natural Language Processing (NLP) – Justified Exclusion

**Why not NLP?**
- NLP (Tokenization, embeddings, TF-IDF) is for **text data** (reviews, messages, documents).
- Our dataset is **numeric/categorical telecom features** (tenure, charges, contract type), not text.
- No text column to process.

**Conclusion**: NLP pipeline not applicable to this project.


# Week 12: AI in Data Science – Industry Application

## Real-World Application: Telecom Customer Churn Prediction

**Problem**: Telecom companies lose revenue when customers churn (cancel service).

**Solution**: Use ML model to predict which customers are likely to churn, enabling:
- **Proactive retention campaigns** (discounts, offers for at-risk customers)
- **Cost savings**: Retaining 1 customer is cheaper than acquiring a new one
- **Data-driven strategy**: Identify top churn drivers (contract type, charges, internet service)

**Business Impact**:
- Reduce churn rate by 10–15% → millions in retained revenue
- Personalize customer experience based on risk score
- Optimize marketing budget allocation

**Model Evaluation**: Recall is critical here (catch as many churners as possible, even with false positives).


# Week 13: Model Deployment

## Deploy Model on Local Server

The trained model is saved as `model.pkl`, `scaler.pkl`, and `feature_columns.pkl`.

**To deploy:**

1. Run training script:
   ```bash
   python train_model.py
   ```

2. Start Flask server:
   ```bash
   python app.py
   ```
   Server runs on `http://127.0.0.1:5000`

3. Make predictions via API:
   ```bash
   curl -X POST http://127.0.0.1:5000/predict \
     -H "Content-Type: application/json" \
     -d '{"features": {"tenure": 24, "MonthlyCharges": 65.5}}'
   ```

**Response**: `{"prediction": 0, "probability": 0.85, "class_probabilities": [0.85, 0.15]}`

See `app.py` for code and `README.md` for detailed instructions.


# Week 14: Model Explainability – SHAP

In [None]:
# Assignment 14: Explain predictions using SHAP
# Requires rf_model and X_test from Week 6.

# Only the import is guarded, so errors raised while computing SHAP values are
# not misreported as "SHAP not installed".
try:
    import shap
except ImportError:
    print("SHAP not installed. Install with: pip install shap")
    print("\nAlternative explanation: Check feature importance from Random Forest")
    print(f"Top 5 features: {sorted(zip(X_test.columns, rf_model.feature_importances_), key=lambda x: x[1], reverse=True)[:5]}")
else:
    # Use Random Forest from Week 6
    explainer = shap.TreeExplainer(rf_model)
    shap_values = explainer.shap_values(X_test)

    # Pick the SHAP values of a single class for the summary plot.
    # Older SHAP returns a list with one (samples x features) array per class;
    # newer releases return one 3-D array (samples x features x classes).
    # NOTE(review): index 1 is assumed to be the churn class — verify against
    # the label encoding used when rf_model was trained.
    if isinstance(shap_values, list):
        class_idx = min(1, len(shap_values) - 1)
        class_shap = shap_values[class_idx]
    elif np.ndim(shap_values) == 3:
        class_idx = min(1, np.shape(shap_values)[2] - 1)
        class_shap = shap_values[:, :, class_idx]
    else:
        class_shap = shap_values

    # Summary plot showing feature importance
    print("✅ SHAP Summary Plot:")
    shap.summary_plot(class_shap, X_test, show=False)
    plt.tight_layout()
    plt.show()  # show=False only defers rendering; display explicitly for script runs

    print("\nSHAP Interpretation:")
    print("- Each dot is one customer; features are ordered by overall impact")
    print("- Horizontal position = SHAP value: right pushes the prediction toward the explained class, left pushes away")
    print("- Color = the feature's own value: red = high, blue = low")
