In [1]:
# Essential imports for data science and machine learning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# PyTorch for deep learning
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (classification_report, confusion_matrix,
                           roc_auc_score, roc_curve, precision_recall_curve,
                           f1_score, precision_score, recall_score)
from sklearn.utils.class_weight import compute_class_weight

# Reproducibility: seed NumPy and PyTorch with one fixed value.
# CUDA availability is queried once and reused for seeding, device choice and reporting.
cuda_available = torch.cuda.is_available()
np.random.seed(42)
torch.manual_seed(42)
if cuda_available:
    torch.cuda.manual_seed(42)

# Plot defaults applied to every figure created after this cell.
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})
sns.set_palette("husl")

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if cuda_available else torch.device('cpu')

# Report the runtime environment so results can be traced to library versions.
print(f"🚀 Environment Setup:")
print(f"   Device: {device}")
print(f"   PyTorch Version: {torch.__version__}")
print(f"   Pandas Version: {pd.__version__}")
print(f"   NumPy Version: {np.__version__}")

# GPU name and total memory (in GiB) are only available with CUDA.
if cuda_available:
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

print("\n✅ All libraries imported successfully!")
print("📊 Ready to begin customer churn analysis...")

🚀 Environment Setup:
   Device: cpu
   PyTorch Version: 2.8.0+cpu
   Pandas Version: 2.3.3
   NumPy Version: 2.3.3

✅ All libraries imported successfully!
📊 Ready to begin customer churn analysis...


In [2]:
# Load the customer churn dataset
print("📥 Loading Telco Customer Churn dataset...")

# Candidate locations, tried in order; the first file that can be read wins.
data_paths = [
    '../data/Telco-Customer-Churn.csv'
]

df = None
for path in data_paths:
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        # Not at this location; try the next candidate.
        continue
    print(f"✅ Data loaded successfully from: {path}")
    break

# Fail fast with an actionable message instead of crashing below on `None.shape`.
if df is None:
    raise FileNotFoundError(
        "Telco Customer Churn dataset not found. Paths tried: " + ", ".join(data_paths)
    )

# Display basic information about the dataset
print(f"\n📊 Dataset Overview:")
print(f"   Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
# deep=True also counts the memory held by the string objects in object columns.
print(f"   Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Display first few rows
print(f"\n👀 First 5 rows:")
print(df.head())

📥 Loading Telco Customer Churn dataset...
✅ Data loaded successfully from: ../data/Telco-Customer-Churn.csv

📊 Dataset Overview:
   Shape: 7,043 rows × 21 columns
   Memory usage: 6.8 MB

👀 First 5 rows:
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone

In [3]:
# Deep dive into data quality issues
for banner_line in ("🧹 Data Cleaning and Quality Assessment", "=" * 50):
    print(banner_line)

# Record the starting dimensions, then clean a deep copy so the frame bound
# to `df` is not modified by the cleaning steps that assign to `df_cleaned`.
original_shape = df.shape
df_cleaned = df.copy(deep=True)

print(f"📊 Starting with {original_shape[0]:,} rows and {original_shape[1]} columns")

🧹 Data Cleaning and Quality Assessment
📊 Starting with 7,043 rows and 21 columns


In [4]:
# 1. Check for duplicate customer IDs
print(f"\n1️⃣ Checking for duplicate customers...")

# True for every repeat of a customerID after its first occurrence.
is_repeat_id = df_cleaned['customerID'].duplicated()
duplicate_customers = is_repeat_id.sum()
print(f"   Duplicate customerIDs: {duplicate_customers}")

if duplicate_customers > 0:
    print("   🔧 Removing duplicate customer records...")
    # Keep the first record seen for each customerID and drop the repeats.
    df_cleaned = df_cleaned.drop_duplicates(subset=['customerID'], keep='first')
    print(f"   ✅ Removed {duplicate_customers} duplicate records")


1️⃣ Checking for duplicate customers...
   Duplicate customerIDs: 0


In [5]:
# 2. Handle TotalCharges data type issue (common in this dataset)
print(f"\n2️⃣ Fixing TotalCharges data type...")
print(f"   Current TotalCharges type: {df_cleaned['TotalCharges'].dtype}")

# Parse the column whenever it is not already numeric (e.g. object/string dtype).
if not pd.api.types.is_numeric_dtype(df_cleaned['TotalCharges']):
    # Let pandas decide what is a number: entries that cannot be parsed
    # (such as blank strings) become NaN instead of raising.
    total_charges_numeric = pd.to_numeric(df_cleaned['TotalCharges'], errors='coerce')

    # A value is non-numeric when it was present in the raw column but is NaN
    # after parsing; values that were already missing are not counted.
    non_numeric_total = total_charges_numeric.isna() & df_cleaned['TotalCharges'].notna()
    non_numeric_count = non_numeric_total.sum()
    print(f"   Non-numeric TotalCharges values: {non_numeric_count}")

    # Convert unconditionally: even when every entry parses, the column must
    # hold numbers rather than strings for the arithmetic in later steps.
    print("   🔧 Converting TotalCharges to numeric...")
    df_cleaned['TotalCharges'] = total_charges_numeric
    print(f"   ✅ TotalCharges converted to numeric type")


2️⃣ Fixing TotalCharges data type...
   Current TotalCharges type: object
   Non-numeric TotalCharges values: 11
   🔧 Converting TotalCharges to numeric...
   ✅ TotalCharges converted to numeric type


In [6]:
# 3. Handle missing values
print(f"\n3️⃣ Handling missing values...")
missing_values = df_cleaned.isnull().sum()
missing_percent = (missing_values / len(df_cleaned) * 100).round(2)

# One row per column, restricted to the columns that actually contain gaps.
missing_summary = pd.DataFrame({
    'Column': missing_values.index,
    'Missing Count': missing_values.values,
    'Missing %': missing_percent.values
}).loc[lambda summary: summary['Missing Count'] > 0]

if missing_summary.empty:
    print("   ✅ No missing values found")
else:
    print("   Missing values found:")
    print(missing_summary.to_string(index=False))

    # Only TotalCharges gets an imputation rule; other columns are reported above but left as-is.
    if missing_summary['Column'].eq('TotalCharges').any():
        total_charges_gap = df_cleaned['TotalCharges'].isnull()
        total_charges_missing = total_charges_gap.sum()
        print(f"\n   🔧 Handling {total_charges_missing} missing TotalCharges values...")

        # Tenure profile of the affected rows decides which fill strategy is used.
        missing_tenure = df_cleaned.loc[total_charges_gap, 'tenure'].describe()
        print(f"   Tenure stats for missing TotalCharges customers:")
        print(f"   Mean tenure: {missing_tenure['mean']:.1f} months")

        if missing_tenure['mean'] < 3:  # Very new customers
            print("   💡 Missing TotalCharges likely represent new customers")
            print("   🔧 Filling missing TotalCharges with MonthlyCharges (first month)")
            # Row-aligned fill: each gap takes that customer's own MonthlyCharges.
            fill_values = df_cleaned['MonthlyCharges']
        else:
            print("   🔧 Filling missing TotalCharges with median value")
            fill_values = df_cleaned['TotalCharges'].median()

        df_cleaned['TotalCharges'] = df_cleaned['TotalCharges'].fillna(fill_values)
        print("   ✅ TotalCharges missing values handled")


3️⃣ Handling missing values...
   Missing values found:
      Column  Missing Count  Missing %
TotalCharges             11       0.16

   🔧 Handling 11 missing TotalCharges values...
   Tenure stats for missing TotalCharges customers:
   Mean tenure: 0.0 months
   💡 Missing TotalCharges likely represent new customers
   🔧 Filling missing TotalCharges with MonthlyCharges (first month)
   ✅ TotalCharges missing values handled


In [7]:
# 4. Check for outliers in numerical columns
print(f"\n4️⃣ Checking for outliers...")
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# 1.5 × IQR rule: a value is an outlier when it lies more than 1.5 IQRs
# below the first quartile or above the third quartile.
outlier_summary = []
for col in numerical_cols:
    column_values = df_cleaned[col]
    Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    below_fence = column_values < lower_bound
    above_fence = column_values > upper_bound
    outliers = df_cleaned[below_fence | above_fence]
    outlier_count = len(outliers)
    outlier_percent = (outlier_count / len(df_cleaned)) * 100

    # Bounds and percentage are stored pre-formatted for the printed report.
    column_report = {
        'Column': col,
        'Outlier Count': outlier_count,
        'Outlier %': f"{outlier_percent:.1f}%",
        'Lower Bound': f"{lower_bound:.2f}",
        'Upper Bound': f"{upper_bound:.2f}"
    }
    outlier_summary.append(column_report)

outlier_df = pd.DataFrame(outlier_summary)
print(outlier_df.to_string(index=False))


4️⃣ Checking for outliers...
        Column  Outlier Count Outlier % Lower Bound Upper Bound
        tenure              0      0.0%      -60.00      124.00
MonthlyCharges              0      0.0%      -46.02      171.38
  TotalCharges              0      0.0%    -4683.52     8868.67


In [8]:
# 5. Validate business logic
print(f"\n5️⃣ Validating business logic...")

# Tolerances for the consistency check.
CHARGE_TOLERANCE_MONTHS = 3    # allowed gap, expressed in months of MonthlyCharges
SUSPICIOUS_SHARE_LIMIT = 0.1   # share of flagged rows at which data quality is questioned

# Check if TotalCharges makes sense with tenure and MonthlyCharges.
# The intermediate values are standalone Series, so df_cleaned never carries
# scratch columns, even if this cell fails part-way through.
expected_total_charges = df_cleaned['tenure'] * df_cleaned['MonthlyCharges']
charges_difference = (df_cleaned['TotalCharges'] - expected_total_charges).abs()

# Allow for some variance (promotions, price changes, etc.)
threshold = df_cleaned['MonthlyCharges'] * CHARGE_TOLERANCE_MONTHS
suspicious_charges = charges_difference > threshold
suspicious_count = suspicious_charges.sum()

print(f"   Customers with suspicious charge calculations: {suspicious_count} ({suspicious_count/len(df_cleaned)*100:.1f}%)")

# Exhaustive branches: every outcome (none, few, many) reports a verdict.
if suspicious_count == 0:
    print("   ✅ No suspicious charge calculations found")
elif suspicious_count < len(df_cleaned) * SUSPICIOUS_SHARE_LIMIT:
    print("   💡 Small number of suspicious charges - likely due to promotions/discounts")
    print("   ✅ Keeping all records (realistic business scenario)")
else:
    print("   ⚠️ High number of suspicious charges - data quality issue")


5️⃣ Validating business logic...
   Customers with suspicious charge calculations: 172 (2.4%)
   💡 Small number of suspicious charges - likely due to promotions/discounts
   ✅ Keeping all records (realistic business scenario)


In [9]:
# 6. Standardize categorical values
print(f"\n6️⃣ Standardizing categorical values...")
categorical_cols = df_cleaned.select_dtypes(include=['object']).columns.tolist()
# customerID is an identifier, not a category to standardize.
if 'customerID' in categorical_cols:
    categorical_cols.remove('customerID')

# Placeholder labels that are collapsed into a plain 'No'.
NO_SERVICE_LABELS = ('No phone service', 'No internet service')

for col in categorical_cols:
    unique_values = df_cleaned[col].unique()
    print(f"   {col}: {unique_values}")

    # Check for common inconsistencies
    for label in NO_SERVICE_LABELS:
        # Match ignoring case and surrounding whitespace, then replace exactly
        # the values that matched, so the message below is only printed when
        # the column is really changed.
        matching_values = [
            val for val in unique_values
            if isinstance(val, str) and val.strip().lower() == label.lower()
        ]
        if matching_values:
            print(f"   🔧 Standardizing '{label}' values in {col}")
            df_cleaned[col] = df_cleaned[col].replace(matching_values, 'No')

print(f"\n✅ Data cleaning completed!")
print(f"📊 Final dataset: {df_cleaned.shape[0]:,} rows × {df_cleaned.shape[1]} columns")


6️⃣ Standardizing categorical values...
   gender: ['Female' 'Male']
   Partner: ['Yes' 'No']
   Dependents: ['No' 'Yes']
   PhoneService: ['No' 'Yes']
   MultipleLines: ['No phone service' 'No' 'Yes']
   🔧 Standardizing 'No phone service' values in MultipleLines
   InternetService: ['DSL' 'Fiber optic' 'No']
   OnlineSecurity: ['No' 'Yes' 'No internet service']
   🔧 Standardizing 'No internet service' values in OnlineSecurity
   OnlineBackup: ['Yes' 'No' 'No internet service']
   🔧 Standardizing 'No internet service' values in OnlineBackup
   DeviceProtection: ['No' 'Yes' 'No internet service']
   🔧 Standardizing 'No internet service' values in DeviceProtection
   TechSupport: ['No' 'Yes' 'No internet service']
   🔧 Standardizing 'No internet service' values in TechSupport
   StreamingTV: ['No' 'Yes' 'No internet service']
   🔧 Standardizing 'No internet service' values in StreamingTV
   StreamingMovies: ['No' 'Yes' 'No internet service']
   🔧 Standardizing 'No internet service' valu

In [10]:
# Summary of changes
rows_removed = original_shape[0] - df_cleaned.shape[0]
if rows_removed > 0:
    print(f"🗑️ Removed {rows_removed} rows ({rows_removed/original_shape[0]*100:.1f}%)")

print(f"🎯 Dataset ready for exploratory data analysis!")

# Quick validation
print(f"\n🔍 Post-cleaning validation:")
print(f"   Missing values: {df_cleaned.isnull().sum().sum()}")
print(f"   Duplicate customers: {df_cleaned['customerID'].duplicated().sum()}")
print(f"   Data types: {df_cleaned.dtypes.value_counts().to_dict()}")

# Report the size of the cleaned frame
print(f"\n💾 Cleaned dataset ready for analysis")
print(f"   Shape: {df_cleaned.shape}")
print(f"   Memory usage: {df_cleaned.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Save for future analysis.
# index=False keeps the positional index out of the file; otherwise it is
# written as an unnamed first column and comes back from pd.read_csv as an
# extra 'Unnamed: 0' column.
df_cleaned.to_csv('../data/clean-processed-data.csv', index=False)
print(f"\n Cleaned and Processed Dataset is ready for further analysis")

🎯 Dataset ready for exploratory data analysis!

🔍 Post-cleaning validation:
   Missing values: 0
   Duplicate customers: 0
   Data types: {dtype('O'): 17, dtype('int64'): 2, dtype('float64'): 2}

💾 Cleaned dataset ready for analysis
   Shape: (7043, 21)
   Memory usage: 6.3 MB

 Cleaned and Processed Dataset is ready for further analysis
