<h2 style="text-align:center; color: orange">⌞Data pre-processing⌝</h2>

<h4 style="text-decoration:underline">1. Generate descriptive statistics.</h4>

In [None]:
# Import required libraries
import pandas as pd
import numpy as np

In [3]:
# Load the dataset
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Display the columns
df.columns.to_list()


['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

In [None]:
num_cols = df.select_dtypes(include=['number']).columns.to_list()
str_cols = df.select_dtypes(exclude=['number']).columns.to_list()

# numerical columns
print(num_cols)

# categorical columns
print(str_cols)

In [11]:
# --- 1. Numerical Statistics ---
print("\n" + "="*50)
print(f"{' NUMERICAL STATISTICS ':=^50}")
print("="*50)

if num_cols:
    # We transpose (.T) the describe result so variables are rows.
    # This fits better in a vertical terminal window.
    num_stats = df[num_cols].describe().T
    
    # Optional: Add Median explicitly if you prefer it over '50%'
    num_stats['median'] = df[num_cols].median()
    
    # Reorder columns to put median next to mean
    cols_order = ['count', 'mean', 'median', 'std', 'min', '25%', '50%', '75%', 'max']
    # Filter to ensure we only use columns that exist
    cols_order = [c for c in cols_order if c in num_stats.columns]
    
    print(num_stats[cols_order].round(2))
else:
    print("No numerical columns found.")



                 count   mean  median    std    min   25%    50%    75%  \
SeniorCitizen   7043.0   0.16    0.00   0.37   0.00   0.0   0.00   0.00   
tenure          7043.0  32.37   29.00  24.56   0.00   9.0  29.00  55.00   
MonthlyCharges  7043.0  64.76   70.35  30.09  18.25  35.5  70.35  89.85   

                   max  
SeniorCitizen     1.00  
tenure           72.00  
MonthlyCharges  118.75  


In [13]:
# --- 2. Categorical (Mode) Statistics ---
print("\n" + "="*50)
print(f"{' CATEGORICAL ANALYSIS ':=^50}")
print("="*50)

if str_cols:
    # Create a custom DataFrame to hold the summary
    cat_stats = pd.DataFrame(index=str_cols, columns=['Mode', 'Unique_Count', 'Top_Freq', '%_Dominance'])
    
    for col in str_cols:
        # Get the mode (handling cases with multiple modes by taking the first one)
        modes = df[col].mode()
        mode_val = modes.iloc[0] if not modes.empty else "N/A"
        
        # Get basic counts
        unique_count = df[col].nunique()
        total_count = df[col].count()
        
        # Get frequency of the most common value
        try:
            top_freq = df[col].value_counts().iloc[0]
            dominance = (top_freq / total_count) * 100 if total_count > 0 else 0
        except IndexError:
            top_freq = 0
            dominance = 0
            
        cat_stats.loc[col] = [mode_val, unique_count, top_freq, f"{dominance:.1f}%"]
    
    print(cat_stats)
else:
    print("No categorical columns found.")

print("="*50 + "\n")


                              Mode Unique_Count Top_Freq %_Dominance
customerID              0002-ORFBO         7043        1        0.0%
gender                        Male            2     3555       50.5%
Partner                         No            2     3641       51.7%
Dependents                      No            2     4933       70.0%
PhoneService                   Yes            2     6361       90.3%
MultipleLines                   No            3     3390       48.1%
InternetService        Fiber optic            3     3096       44.0%
OnlineSecurity                  No            3     3498       49.7%
OnlineBackup                    No            3     3088       43.8%
DeviceProtection                No            3     3095       43.9%
TechSupport                     No            3     3473       49.3%
StreamingTV                     No            3     2810       39.9%
StreamingMovies                 No            3     2785       39.5%
Contract            Month-to-mont