In [None]:
import pandas as pd
# Load the Telco customer-churn dataset. The file name carries a ".csv.xls"
# double extension but it is parsed here as plain CSV.
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv.xls")

# First look: sample rows, dimensions, schema and summary statistics.
print(df.head())
print(df.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
df.info()
print(df.describe())
print(df.dtypes)

# Target class balance, as absolute counts and as proportions.
print(df["Churn"].value_counts())
# A bare expression in the middle of a cell is never displayed, so print it.
print(df["Churn"].value_counts(normalize=True))

# Data-quality checks: duplicate rows, missing values, unique customer ids.
duplicated = df.duplicated().sum()
print("Number of duplicated rows:", duplicated)

missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

customer_count = df['customerID'].nunique()
print("Number of unique customers:", customer_count)

# Sample rows from each target class.
print(df[df["Churn"] == "Yes"].head())
print(df[df["Churn"] == "No"].head())

# Change data types: convert TotalCharges to numeric. errors='coerce' turns
# every value that cannot be parsed as a number into NaN.
print(df["TotalCharges"].head(20))
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors='coerce')

print(f"Missing TotalCharges after conversion: {df['TotalCharges'].isnull().sum()}")

# Drop the rows whose TotalCharges could not be converted.
df = df.dropna(subset=['TotalCharges'])

print("\nData types after fixing:")
print(df.dtypes)






# Analysis . 


import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme to plots.
sns.set_style("whitegrid")

_RULE_60 = "=" * 60


def _churn_share_by(column):
    """Return Churn proportions (one column per Churn value) for each value of `column`."""
    shares = df.groupby(column)['Churn'].value_counts(normalize=True)
    return shares.unstack()


print(_RULE_60)
print("TESTING YOUR PREDICTIONS")
print(_RULE_60)

# PREDICTION 1: Senior Citizens
print("\n1. SENIOR CITIZEN CHURN RATE:")
senior_churn = _churn_share_by('SeniorCitizen')
print(senior_churn)
print(f"\nNon-Senior (0) churn rate: {senior_churn.loc[0, 'Yes']*100:.1f}%")
print(f"Senior (1) churn rate: {senior_churn.loc[1, 'Yes']*100:.1f}%")

# PREDICTION 2: Contract Type
print("\n" + _RULE_60)
print("2. CONTRACT TYPE CHURN RATE:")
contract_churn = _churn_share_by('Contract')
print(contract_churn)
print("\nChurn rates by contract:")
for contract, shares in contract_churn.iterrows():
    print(f"{contract}: {shares['Yes']*100:.1f}%")

# PREDICTION 3: Internet Service
print("\n" + _RULE_60)
print("3. INTERNET SERVICE CHURN RATE:")
internet_churn = _churn_share_by('InternetService')
print(internet_churn)
print("\nChurn rates by internet type:")
for service in internet_churn.index:
    if service == 'No':
        # Customers without internet service are left out of this listing.
        continue
    print(f"{service}: {internet_churn.loc[service, 'Yes']*100:.1f}%")

# PREDICTION 4: Monthly Charges
print("\n" + _RULE_60)
print("4. MONTHLY CHARGES:")
churned_charges = df.loc[df['Churn'] == 'Yes', 'MonthlyCharges'].mean()
stayed_charges = df.loc[df['Churn'] == 'No', 'MonthlyCharges'].mean()
print(f"Average monthly charges - Churned customers: ${churned_charges:.2f}")
print(f"Average monthly charges - Stayed customers: ${stayed_charges:.2f}")
print(f"Difference: ${churned_charges - stayed_charges:.2f}")

print("\n" + _RULE_60)
print("SUMMARY: Which predictions were correct?")
print(_RULE_60)

# Profile of the customers expected to churn most: senior citizens on a
# month-to-month contract with fiber-optic internet.
print("HIGH-RISK CHURN PROFILE:")
print(_RULE_60)

_is_month_to_month = df['Contract'] == 'Month-to-month'
_is_senior = df['SeniorCitizen'] == 1
_has_fiber = df['InternetService'] == 'Fiber optic'
high_risk = df[_is_month_to_month & _is_senior & _has_fiber]

print(f"Seniors + Month-to-month + Fiber optic: {len(high_risk)} customers")
_high_risk_churners = (high_risk['Churn'] == 'Yes').sum()
print(f"Churn rate: {_high_risk_churners / len(high_risk) * 100:.1f}%")

print("\n" + _RULE_60)

# Contrasting profile: non-seniors on a two-year contract with DSL.
_is_two_year = df['Contract'] == 'Two year'
_is_non_senior = df['SeniorCitizen'] == 0
_has_dsl = df['InternetService'] == 'DSL'
low_risk = df[_is_two_year & _is_non_senior & _has_dsl]

print(f"Non-seniors + Two-year + DSL: {len(low_risk)} customers")
_low_risk_churners = (low_risk['Churn'] == 'Yes').sum()
print(f"Churn rate: {_low_risk_churners / len(low_risk) * 100:.1f}%")







# ============================================================================
# COMPREHENSIVE FEATURE ANALYSIS - ALL IN ONE CLEAN LOOP
# ============================================================================
# For every listed feature, print the churn rate of each category and record
# the spread (highest minus lowest category churn rate, in percentage points)
# so the features can be ranked afterwards.

print("\n" + "="*80)
print("COMPREHENSIVE FEATURE ANALYSIS")
print("="*80)

# Features to analyze. The value is None for a plain categorical column, or
# 'binned' for a numeric column that is cut into groups first.
features_to_analyze = {
    'Contract': None,
    'InternetService': None,
    'PaymentMethod': None,
    'TechSupport': None,
    'PaperlessBilling': None,
    'OnlineSecurity': None,
    'DeviceProtection': None,
    'StreamingTV': None,
    'StreamingMovies': None,
    'MultipleLines': None,
    'OnlineBackup': None,
    'PhoneService': None,
    'tenure': 'binned',  # Special handling
}

# feature name -> spread of churn rates across its categories
feature_impact = {}

print("\n📊 CHURN RATES BY FEATURE:\n")

# Boolean churn flag; its group mean is the group's churn rate.
is_churned = df['Churn'].eq('Yes')

for i, (feature, special_handling) in enumerate(features_to_analyze.items(), 1):
    print(f"{i}. {feature.upper()}:")
    print("-" * 60)

    if special_handling == 'binned':
        # Bin tenure into groups. pd.cut intervals are right-closed by default,
        # so a tenure of exactly 0 falls outside the first bin and becomes NaN.
        df['TenureGroup'] = pd.cut(df['tenure'],
                                   bins=[0, 12, 24, 48, 72],
                                   labels=['0-1 year', '1-2 years', '2-4 years', '4-6 years'])
        group_col = 'TenureGroup'
    else:
        group_col = feature

    # observed=True keeps only categories that actually occur and avoids the
    # pandas FutureWarning raised when grouping by a categorical column without
    # an explicit `observed` argument.
    churn_rates = is_churned.groupby(df[group_col], observed=True).mean() * 100
    feature_impact[group_col] = churn_rates.max() - churn_rates.min()

    # Display sorted results
    for category, rate in churn_rates.sort_values(ascending=False).items():
        print(f"  {category}: {rate:.1f}%")
    print()

# Rank features by impact. The metric is a max-min spread, not a statistical
# variance, and is labelled accordingly.
print("="*80)
print("🏆 FEATURE IMPACT RANKING (by churn rate spread)")
print("="*80)
sorted_features = sorted(feature_impact.items(), key=lambda x: x[1], reverse=True)
for rank, (feature, spread) in enumerate(sorted_features, 1):
    print(f"{rank:2d}. {feature:20s} → {spread:5.1f} pp spread")
print()

# ============================================================================
# UNIFIED CUSTOMER RISK PROFILES
# ============================================================================
# Compare the churn rate of a hand-picked "high-risk" customer segment with a
# hand-picked "low-risk" one.


def _churn_rate_pct(profile):
    """Return the churn rate of `profile` in percent, or None when it has no rows."""
    if len(profile) == 0:
        return None
    return (profile['Churn'] == 'Yes').sum() / len(profile) * 100


print("="*80)
print("🔴 HIGH-RISK vs 🟢 LOW-RISK CUSTOMER PROFILES")
print("="*80)

# High-risk profile. Keep these filters in sync with the characteristics
# printed below.
high_risk_profile = df[
    (df['Contract'] == 'Month-to-month') &
    (df['InternetService'] == 'Fiber optic') &
    (df['TechSupport'] == 'No') &
    (df['OnlineSecurity'] == 'No') &
    (df['PaymentMethod'] == 'Electronic check') &
    (df['PaperlessBilling'] == 'Yes') &
    (df['tenure'] <= 12)
]

# Low-risk profile. Keep these filters in sync with the characteristics
# printed below.
low_risk_profile = df[
    (df['Contract'] == 'Two year') &
    (df['InternetService'] == 'DSL') &
    (df['TechSupport'] == 'Yes') &
    (df['OnlineSecurity'] == 'Yes') &
    (df['tenure'] > 24)
]

# Both rates are always defined (None for an empty segment) so later code can
# reference them without risking a NameError.
high_risk_churn_rate = _churn_rate_pct(high_risk_profile)
low_risk_churn_rate = _churn_rate_pct(low_risk_profile)

# Display high-risk profile
print("\n🔴 HIGH-RISK PROFILE:")
print("-" * 80)
print("Characteristics:")
print("  • Contract: Month-to-month")
print("  • Internet: Fiber optic")
print("  • Tech Support: No")
print("  • Online Security: No")
print("  • Payment: Electronic check")
print("  • Paperless Billing: Yes")
print("  • Tenure: ≤ 12 months")
print(f"\n📊 Customers matching profile: {len(high_risk_profile)}")
if high_risk_churn_rate is not None:
    print(f"⚠️  Churn rate: {high_risk_churn_rate:.1f}%")

# Display low-risk profile
print("\n🟢 LOW-RISK PROFILE:")
print("-" * 80)
print("Characteristics:")
print("  • Contract: Two year")
print("  • Internet: DSL")
print("  • Tech Support: Yes")
print("  • Online Security: Yes")
print("  • Tenure: > 24 months")
print(f"\n📊 Customers matching profile: {len(low_risk_profile)}")
if low_risk_churn_rate is not None:
    print(f"✅ Churn rate: {low_risk_churn_rate:.1f}%")

# Risk difference, only when both segments are non-empty.
if high_risk_churn_rate is not None and low_risk_churn_rate is not None:
    print(f"\n💡 Risk Difference: {high_risk_churn_rate - low_risk_churn_rate:.1f} percentage points")

print("="*80)


# ML MODEL - LOGISTIC REGRESSION


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd

_STEP_RULE = "=" * 60

print(_STEP_RULE)
print("STEP 1: PREPARE DATA FOR MODELING")
print(_STEP_RULE)

# Work on a copy so the analysis frame `df` keeps its original Churn labels.
df_model = df.copy()

# Binary target: 'No' -> 0, 'Yes' -> 1.
_churn_codes = {'No': 0, 'Yes': 1}
df_model['Churn'] = df_model['Churn'].map(_churn_codes)

# Model inputs: numeric columns first, then the categorical ones.
features = [
    'SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
    'Contract', 'PaymentMethod', 'InternetService', 'TechSupport',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'PaperlessBilling', 'gender', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines',
    'StreamingTV', 'StreamingMovies',
]

# Feature matrix and target vector.
X = df_model.loc[:, features].copy()
y = df_model['Churn']

# Object-dtype columns are the ones that still hold text labels.
categorical_cols = X.select_dtypes(include=['object']).columns

print(f"\nConverting {len(categorical_cols)} categorical columns to numbers...")
print(f"Categorical columns: {list(categorical_cols)}")

# Replace each text column with integer codes. One encoder object is refitted
# per column, so afterwards it only holds the last column's mapping.
le = LabelEncoder()
for col in categorical_cols:
    as_text = X[col].astype(str)
    X[col] = le.fit(as_text).transform(as_text)

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Churn distribution: {y.value_counts().to_dict()}")

print("\n" + _STEP_RULE)
print("STEP 2: SPLIT DATA INTO TRAIN AND TEST")
print(_STEP_RULE)

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {len(X_train)} customers")
print(f"Test set size: {len(X_test)} customers")
print(f"Training set churn rate: {y_train.mean()*100:.1f}%")
print(f"Test set churn rate: {y_test.mean()*100:.1f}%")

# STEP 3: TRAIN LOGISTIC REGRESSION MODEL

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings

# Warnings are intentionally NOT suppressed. A blanket
# warnings.filterwarnings('ignore') would silence every warning for the rest of
# the session, including convergence and deprecation warnings worth seeing.

print("="*60)
print("STEP 3: TRAIN THE MODEL")
print("="*60)

# 1. Initialize the model.
# C is the inverse regularization strength (smaller C = stronger regularization).
model = LogisticRegression(C=1.0, random_state=42, solver='liblinear')

# 2. Fit the model on the training split.
model.fit(X_train, y_train)

print("Model training complete.")

print("\n" + "="*60)
print("STEP 4: TEST THE MODEL AND EVALUATE")
print("="*60)

# 3. Predict on the held-out test split.
y_pred = model.predict(X_test)

# 4. Evaluate the predictions.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy on Test Set: {accuracy*100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


print("="*60)
print("STEP 5: FEATURE IMPORTANCE")
print("="*60)

# Rank features by absolute coefficient. Names come from the columns the model
# was fitted on, so each name is guaranteed to line up with its coefficient.
# NOTE: the inputs are unscaled and the categorical columns are label-encoded,
# so coefficient magnitudes are not directly comparable across features.
feature_importance = pd.DataFrame({
    'Feature': list(X_train.columns),
    'Importance': abs(model.coef_[0]),
}).sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features for Predicting Churn:")
print(feature_importance.head(10))

print("\n" + "="*60)
print("INTERPRETATION:")
print("="*60)
print("Higher importance = stronger impact on churn prediction")
print("These are the features the model relies on most")


# IMPROVEMENT EXPERIMENT: compare the baseline logistic regression with a random forest and with models trained on class-balanced data (accuracy and churn recall)




from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

def _churn_recall(y_true, y_hat):
    """Return (confusion matrix, recall of the churn class, i.e. label 1)."""
    cm = confusion_matrix(y_true, y_hat, labels=[0, 1])
    return cm, cm[1, 1] / (cm[1, 0] + cm[1, 1])


def _report_model(title, y_true, y_hat, baseline_recall):
    """Print the evaluation of one model and return (accuracy, confusion matrix, churn recall)."""
    acc = accuracy_score(y_true, y_hat)
    print(f"\n✅ {title} Results:")
    print(f"   Accuracy: {acc*100:.2f}%")
    print("\n   Classification Report:")
    print(classification_report(y_true, y_hat))

    cm, recall = _churn_recall(y_true, y_hat)
    print(f"\n   Churn Detection (Recall): {recall*100:.1f}%")
    print(f"   Improvement over baseline: {(recall - baseline_recall)*100:.1f} percentage points")
    return acc, cm, recall


print("="*80)
print(" IMPROVEMENT EXPERIMENT: TESTING 3 DIFFERENT APPROACHES")
print("="*80)

# Baseline = the logistic regression fitted earlier. Its metrics are computed
# from its test-set predictions (y_pred) instead of being typed in by hand, so
# they cannot drift when the data or the split changes.
baseline_accuracy = accuracy_score(y_test, y_pred)
_, baseline_recall_churn = _churn_recall(y_test, y_pred)

print("\n📊 BASELINE MODEL (Logistic Regression - Original):")
print(f"   Accuracy: {baseline_accuracy*100:.2f}%")
print(f"   Churn Detection (Recall): {baseline_recall_churn*100:.0f}%")
print(f"   ⚠️  Problem: misses {(1 - baseline_recall_churn)*100:.0f}% of churners")

#==============================================================================
# APPROACH 1: RANDOM FOREST (Better Algorithm)
#==============================================================================

print("\n" + "="*80)
print("🌲 APPROACH 1: RANDOM FOREST MODEL")
print("="*80)
print("Training Random Forest with 100 decision trees...")

rf_model = RandomForestClassifier(
    n_estimators=100,      # Number of trees
    max_depth=10,          # Limit tree depth to prevent overfitting
    min_samples_split=50,  # Minimum samples to split a node
    random_state=42
)

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
accuracy_rf, cm_rf, recall_churn_rf = _report_model(
    "Random Forest", y_test, y_pred_rf, baseline_recall_churn)

#==============================================================================
# APPROACH 2: BALANCED DATA (Fix Class Imbalance)
#==============================================================================

print("\n" + "="*80)
print("⚖️  APPROACH 2: BALANCED LOGISTIC REGRESSION")
print("="*80)

print("Balancing the training data...")

# Only the TRAINING split is resampled; the test split keeps its natural class
# distribution so the evaluation stays honest.
df_train = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)

# Separate by class
df_majority = df_train[df_train['Churn'] == 0]
df_minority = df_train[df_train['Churn'] == 1]

print(f"   Original - No churn: {len(df_majority)}, Churn: {len(df_minority)}")
print(f"   Ratio: {len(df_majority)/len(df_minority):.1f}:1")

# Upsample the minority class (sampling with replacement) to the majority size.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42
)

df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Separate features and target
X_train_balanced = df_balanced.drop('Churn', axis=1)
y_train_balanced = df_balanced['Churn']

print(f"   After balancing - No churn: {(y_train_balanced==0).sum()}, Churn: {(y_train_balanced==1).sum()}")
print("   New ratio: 1:1 ✅")

print("\nTraining Logistic Regression on balanced data...")
model_balanced = LogisticRegression(C=1.0, random_state=42, solver='liblinear', max_iter=1000)
model_balanced.fit(X_train_balanced, y_train_balanced)

y_pred_balanced = model_balanced.predict(X_test)
accuracy_balanced, cm_balanced, recall_churn_balanced = _report_model(
    "Balanced Model", y_test, y_pred_balanced, baseline_recall_churn)

#==============================================================================
# APPROACH 3: RANDOM FOREST + BALANCED DATA (Both Techniques)
#==============================================================================

print("\n" + "="*80)
print("🚀 APPROACH 3: RANDOM FOREST + BALANCED DATA (COMBO)")
print("="*80)
print("Training Random Forest on balanced data...")

rf_balanced = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=50,
    random_state=42
)

rf_balanced.fit(X_train_balanced, y_train_balanced)

y_pred_rf_balanced = rf_balanced.predict(X_test)
accuracy_rf_balanced, cm_rf_balanced, recall_churn_rf_balanced = _report_model(
    "Random Forest + Balanced", y_test, y_pred_rf_balanced, baseline_recall_churn)

#==============================================================================
# FINAL COMPARISON
#==============================================================================

print("\n" + "="*80)
print("📊 FINAL COMPARISON: ALL 4 MODELS")
print("="*80)

comparison = pd.DataFrame({
    'Model': [
        'Baseline (Logistic Reg)',
        'Random Forest',
        'Balanced Logistic Reg',
        'Random Forest + Balanced'
    ],
    'Accuracy': [
        baseline_accuracy,
        accuracy_rf,
        accuracy_balanced,
        accuracy_rf_balanced
    ],
    'Churn Detection (Recall)': [
        baseline_recall_churn,
        recall_churn_rf,
        recall_churn_balanced,
        recall_churn_rf_balanced
    ]
})

# Express both metrics in percent for display.
comparison['Accuracy'] = comparison['Accuracy'] * 100
comparison['Churn Detection (Recall)'] = comparison['Churn Detection (Recall)'] * 100

print("\n")
print(comparison.to_string(index=False))

# The "best" model is the one with the highest churn recall. idxmax returns an
# index label, so look the row up with .loc.
best_idx = comparison['Churn Detection (Recall)'].idxmax()
best_model = comparison.loc[best_idx]

print("\n" + "="*80)
print("🏆 WINNER: BEST MODEL")
print("="*80)
print(f"\nBest Model: {best_model['Model']}")
print(f"Accuracy: {best_model['Accuracy']:.2f}%")
print(f"Churn Detection: {best_model['Churn Detection (Recall)']:.1f}%")
print(f"\n💡 This model catches {best_model['Churn Detection (Recall)']:.0f}% of churners")
print(f"   Improvement: {best_model['Churn Detection (Recall)'] - baseline_recall_churn*100:.1f} percentage points better!")

print("\n" + "="*80)
print("✅ EXPERIMENT COMPLETE")
print("="*80)





# Next: inspect which features the balanced logistic regression relies on (no predictions on new data are made here)


# ============================================================================
# ML MODEL FEATURE IMPORTANCE (Different from EDA Analysis)
# ============================================================================

import pandas as pd
import numpy as np

print("\n" + "="*80)
print("📊 ML MODEL FEATURE IMPORTANCE (From WINNING MODEL: Balanced Logistic Reg)")
print("="*80)

print("\n💡 NOTE: This is different from the EDA feature variance we saw earlier!")
print("   - EDA variance = How much churn RATES differ across categories")
print("   - ML importance = How much the MODEL relies on each feature for predictions")
print()

# This section needs the balanced logistic regression and the EDA ranking built
# earlier. Raise a clear error if they are missing; calling exit() here would
# terminate the whole interpreter / notebook kernel.
try:
    feature_names = X_train_balanced.columns
    model_to_use = model_balanced
    eda_ranking = sorted_features
except NameError as err:
    raise RuntimeError(
        "Required variables not found. Run the feature-analysis and "
        "balanced-model sections before this step."
    ) from err

# Rank by absolute coefficient: a strongly negative coefficient matters as much
# as a strongly positive one, just in the opposite direction.
# NOTE: the inputs are unscaled and label-encoded, so magnitudes are only a
# rough guide to relative importance.
feature_importance_ml = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': model_to_use.coef_[0],  # Original coefficient (with sign)
    'Importance': np.abs(model_to_use.coef_[0])  # Absolute value for ranking
}).sort_values('Importance', ascending=False)

print("🏆 Top 10 Features the ML Model Uses for Prediction:")
print("-" * 80)
print(feature_importance_ml.head(10).to_string(index=False))

print("\n" + "="*80)
print("📖 HOW TO INTERPRET:")
print("="*80)
print("• Importance Score = How strongly this feature influences the prediction")
print("• Positive Coefficient = Higher value → Higher churn probability")
print("• Negative Coefficient = Higher value → Lower churn probability")
print()
print("Example: If 'Contract' has coefficient -0.82 (negative):")
print("  → Longer contracts (encoded as higher numbers) = LESS likely to churn")
print()

# Compare with the EDA findings, using the ranking computed earlier instead of
# hand-typed numbers.
print("="*80)
print("🔄 COMPARING EDA vs ML FEATURE IMPORTANCE:")
print("="*80)

eda_top_3 = eda_ranking[:3]
print("\nEDA Top 3 (by spread of churn rate across categories):")
for rank, (name, spread) in enumerate(eda_top_3, 1):
    print(f"  {rank}. {name} - {spread:.1f} percentage-point spread")

print("\nML Model Top 3 (by coefficient magnitude):")
top_3_ml = feature_importance_ml.head(3)
# Number the rows by position. After sort_values the row labels are the
# ORIGINAL column positions, so label + 1 would print the wrong rank.
for rank, (_, row) in enumerate(top_3_ml.iterrows(), 1):
    print(f"  {rank}. {row['Feature']} - {row['Importance']:.6f} coefficient")

# State only the agreement the data supports. The EDA ranking calls the binned
# tenure column 'TenureGroup'; map it back to the model's 'tenure' feature.
ml_top_names = set(top_3_ml['Feature'])
eda_top_names = ['tenure' if name == 'TenureGroup' else name for name, _ in eda_top_3]
shared_top = [name for name in eda_top_names if name in ml_top_names]

print("\n💡 KEY INSIGHT:")
if shared_top:
    print(f"Both methods place these features in their top 3: {', '.join(shared_top)}")
else:
    print("The two methods do not share any top-3 feature; treat the rankings with caution.")
print("="*80)



# Business impact: estimated revenue retained by converting month-to-month
# customers to annual contracts.

print("\n" + "="*80)
print("💼 BUSINESS IMPACT ANALYSIS")

print("="*80)

# Estimate savings from contract incentives.
# These inputs are hard-coded scenario assumptions, not values computed from df.
mtm_customers = 2500  # Month-to-month customers
current_churn = 0.42
target_churn = 0.11  # One-year contract churn rate
conversion_rate = 0.30  # Convert 30% to annual
customer_ltv = 2000

# Converted customers are assumed to churn at the target rate instead of the
# current one; each avoided churn is valued at customer_ltv.
customers_converted = mtm_customers * conversion_rate
churn_reduction = current_churn - target_churn
customers_saved = customers_converted * churn_reduction
revenue_saved = customers_saved * customer_ltv

# Percentages are formatted explicitly so float noise never reaches the output.
print(f"Month-to-month customers: {mtm_customers}")
print(f"Target conversion: {conversion_rate*100:.0f}%")
print(f"Churn reduction: {churn_reduction*100:.0f}%")
print(f"Customers saved: {customers_saved:.0f}")
print(f"Revenue saved: ${revenue_saved:,.0f}")

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

  churn_rates = df.groupby('TenureGroup')['Churn'].apply(lambda x: (x=='Yes').sum()/len(x)*100)


Training set size: 5625 customers
Test set size: 1407 customers
Training set churn rate: 26.6%
Test set churn rate: 26.6%
STEP 3: TRAIN THE MODEL
Model training complete.

STEP 4: TEST THE MODEL AND EVALUATE
Model Accuracy on Test Set: 78.75%

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1033
           1       0.63      0.49      0.55       374

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.71      1407
weighted avg       0.78      0.79      0.78      1407

STEP 5: FEATURE IMPORTANCE

Top 10 Most Important Features for Predicting Churn:
             Feature  Importance
15      PhoneService    0.916956
4           Contract    0.790826
0      SeniorCitizen    0.297320
8     OnlineSecurity    0.292272
11  PaperlessBilling    0.284898
7        TechSupport    0.263505
14        Dependents    0.242195
6    InternetService    0.174845
9       OnlineBackup    0.12