In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import os

In [16]:
# ---------- Configuration: adjust these locations for your machine ----------
# Input CSV that the load step reads.
CSV_PATH = "Dataset_ATS_v2.csv"   # change if running locally
# Output directory; created up front, and a no-op when it already exists.
OUT_DIR = "./stage2_outputs"
os.makedirs(name=OUT_DIR, exist_ok=True)

In [17]:
# ---------- 1. Load ----------
# Read the raw CSV into `df`, then report its size and show the first rows.
df = pd.read_csv(CSV_PATH)
print("Loaded rows:", len(df), "cols:", len(df.columns))
print(df.head(n=5))

Loaded rows: 7043 cols: 10
   gender  SeniorCitizen Dependents  tenure PhoneService MultipleLines  \
0  Female              0         No       1           No            No   
1    Male              0         No      41          Yes            No   
2  Female              0        Yes      52          Yes            No   
3  Female              0         No       1          Yes            No   
4    Male              0         No      67          Yes            No   

  InternetService        Contract  MonthlyCharges Churn  
0             DSL  Month-to-month              25   Yes  
1             DSL        One year              25    No  
2             DSL  Month-to-month              19    No  
3             DSL        One year              76   Yes  
4     Fiber optic  Month-to-month              51    No  


In [18]:
# ---------- 2. Clean / Impute ----------
# Mutates `df` in place and defines `num_cols` / `cat_cols` for later cells.

# Convert numeric-like columns; entries that cannot be parsed become NaN
# (errors="coerce") and are then filled by the median imputation below.
for col in ["tenure", "MonthlyCharges", "SeniorCitizen"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# Impute numeric with median
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical (this includes the raw "Churn" column when it is non-numeric)
cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
for col in cat_cols:
    df[col] = df[col].fillna("Missing")

# Encode target
if "Churn" in df.columns:
    churn_codes = df["Churn"].map({"Yes": 1, "No": 0})
    # Any label other than exactly "Yes"/"No" (including the "Missing"
    # placeholder written above) maps to NaN and falls back to 0 below.
    # Report how many rows that affects instead of relabelling them silently.
    n_unmapped = int(churn_codes.isnull().sum())
    if n_unmapped:
        print("Warning:", n_unmapped,
              "Churn value(s) were not 'Yes'/'No' and were encoded as 0")
    df["Churn_binary"] = churn_codes.fillna(0).astype(int)

In [27]:
# Show the first five rows of the cleaned frame, including Churn_binary.
print(df.head(n=5))

   gender  SeniorCitizen Dependents  tenure PhoneService MultipleLines  \
0  Female              0         No       1           No            No   
1    Male              0         No      41          Yes            No   
2  Female              0        Yes      52          Yes            No   
3  Female              0         No       1          Yes            No   
4    Male              0         No      67          Yes            No   

  InternetService        Contract  MonthlyCharges Churn  Churn_binary  
0             DSL  Month-to-month              25   Yes             1  
1             DSL        One year              25    No             0  
2             DSL  Month-to-month              19    No             0  
3             DSL        One year              76   Yes             1  
4     Fiber optic  Month-to-month              51    No             0  


In [19]:
# ---------- 3. One-hot encode categorical features ----------
# Every categorical column except the raw target label gets indicator columns;
# all levels are kept (drop_first=False).
to_encode = [col for col in cat_cols if col != "Churn"]
df_enc = pd.get_dummies(
    # errors="ignore" makes the drop a no-op when there is no "Churn" column.
    df.drop(columns=["Churn"], errors="ignore"),
    columns=to_encode,
    drop_first=False,
)

In [24]:
# Show the first five rows of the one-hot encoded frame.
print(df_enc.head(n=5))

   SeniorCitizen  tenure  MonthlyCharges  Churn_binary  gender_Female  \
0              0       1              25             1              1   
1              0      41              25             0              0   
2              0      52              19             0              1   
3              0       1              76             1              1   
4              0      67              51             0              0   

   gender_Male  Dependents_No  Dependents_Yes  PhoneService_No  \
0            0              1               0                1   
1            1              1               0                0   
2            0              0               1                0   
3            0              1               0                0   
4            1              1               0                0   

   PhoneService_Yes  MultipleLines_No  MultipleLines_Yes  InternetService_DSL  \
0                 0                 1                  0                    1   
1 

In [21]:
# ---------- 4. Prepare features and target ----------
# Features are every encoded column except the binary target.
feature_cols = [name for name in df_enc.columns if name != "Churn_binary"]
X = df_enc.loc[:, feature_cols]
# DataFrame.get returns None when the target column is absent.
y = df_enc.get("Churn_binary")

In [25]:
# Show the first five rows of the feature matrix.
print(X.head(n=5))

   SeniorCitizen  tenure  MonthlyCharges  gender_Female  gender_Male  \
0              0       1              25              1            0   
1              0      41              25              0            1   
2              0      52              19              1            0   
3              0       1              76              1            0   
4              0      67              51              0            1   

   Dependents_No  Dependents_Yes  PhoneService_No  PhoneService_Yes  \
0              1               0                1                 0   
1              1               0                0                 1   
2              0               1                0                 1   
3              1               0                0                 1   
4              1               0                0                 1   

   MultipleLines_No  MultipleLines_Yes  InternetService_DSL  \
0                 1                  0                    1   
1             

In [26]:
# Preview the encoded target. `y` is None when df_enc has no Churn_binary
# column, so guard the call instead of failing with AttributeError.
if y is not None:
    print(y.head())
else:
    print("No Churn_binary target available; y is None")

0    1
1    0
2    0
3    1
4    0
Name: Churn_binary, dtype: int32


In [23]:
# Per-column count of null entries in `df`.
# NOTE(review): if this runs after the Clean / Impute cell, nulls have already
# been filled, so all-zero counts are expected and say nothing about the raw CSV.
missing_counts_per_column = df.isnull().sum(axis=0)
missing_counts_per_column

gender             0
SeniorCitizen      0
Dependents         0
tenure             0
PhoneService       0
MultipleLines      0
InternetService    0
Contract           0
MonthlyCharges     0
Churn              0
Churn_binary       0
dtype: int64