In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("../data/customer_churn.csv")


In [5]:

print(f"\nShape: {df.shape}")
print(f"\nColumns: {df.columns}")
print(f"\nNumerical columns: {df.describe()}")
print(f"\nCategorical columns: {df.describe(include='object')}") # type: ignore
print(f"\nData lookup:\n{df.sample(5)}")


Shape: (7043, 21)

Columns: Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

Numerical columns:        SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000

Categorical columns:         customerID gender Partner Dependents PhoneService MultipleLines  \
count         7043   70

In [6]:
df["Churn"].values

array(['No', 'No', 'Yes', ..., 'No', 'Yes', 'No'],
      shape=(7043,), dtype=object)

In [7]:
churn, notChurn = 1869, 5174
churnPercentage, NoChurnPercentage = churn*100/(churn+notChurn), notChurn*100/(churn+notChurn)
print(f"Churn Percentage: {churnPercentage:.3f}\nNot Churn Percentage: {NoChurnPercentage:.3f}")
print("As we can clearly see the class imbalance.")

Churn Percentage: 26.537
Not Churn Percentage: 73.463
As we can clearly see the class imbalance.


In [8]:
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [9]:
df.duplicated().sum()

np.int64(0)

In [10]:
df.describe(include='object')

Unnamed: 0,customerID,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges,Churn
count,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043.0,7043
unique,7043,2,2,2,2,3,3,3,3,3,3,3,3,3,2,4,6531.0,2
top,3186-AJIEK,Male,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,20.2,No
freq,1,3555,3641,4933,6361,3390,3096,3498,3088,3095,3473,2810,2785,3875,4171,2365,11.0,5174


In [11]:
numerical, categorical = [], []

In [12]:
for col in df.columns:
    categorical.append(col) if(df[col].dtype == 'object') else numerical.append(col)

In [13]:
len(numerical), len(categorical)

(3, 18)

In [14]:
numerical

['SeniorCitizen', 'tenure', 'MonthlyCharges']

In [15]:
categorical

['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'TotalCharges',
 'Churn']

In [16]:
df.sample(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
292,2294-SALNE,Male,0,Yes,Yes,23,Yes,No,Fiber optic,No,...,Yes,Yes,No,No,One year,No,Mailed check,86.8,1940.8,No
5342,3521-SYVOR,Female,0,No,No,37,Yes,No,DSL,No,...,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,64.75,2345.2,Yes
2808,6645-MXQJT,Male,0,Yes,Yes,2,Yes,No,Fiber optic,Yes,...,No,No,Yes,Yes,Month-to-month,No,Electronic check,97.1,184.15,No
3917,3871-IKPYH,Male,1,No,No,1,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,No,Electronic check,69.1,69.1,Yes
1542,9798-OPFEM,Female,0,No,No,46,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Electronic check,21.1,937.1,No


In [17]:
for col in df.columns:
    print(df[col].value_counts())
    print("\n\n")

customerID
3186-AJIEK    1
7590-VHVEG    1
5575-GNVDE    1
8775-CEBBJ    1
2823-LKABH    1
             ..
6713-OKOMC    1
1452-KIOVK    1
9305-CDSKC    1
9237-HQITU    1
7795-CFOCW    1
Name: count, Length: 7043, dtype: int64



gender
Male      3555
Female    3488
Name: count, dtype: int64



SeniorCitizen
0    5901
1    1142
Name: count, dtype: int64



Partner
No     3641
Yes    3402
Name: count, dtype: int64



Dependents
No     4933
Yes    2110
Name: count, dtype: int64



tenure
1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: count, Length: 73, dtype: int64



PhoneService
Yes    6361
No      682
Name: count, dtype: int64



MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64



InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: count, dtype: int64



OnlineSecurity
No                     3498
Yes                    201

In [18]:
df["TotalCharges"]

0         29.85
1        1889.5
2        108.15
3       1840.75
4        151.65
         ...   
7038     1990.5
7039     7362.9
7040     346.45
7041      306.6
7042     6844.5
Name: TotalCharges, Length: 7043, dtype: object

In [19]:
df["MonthlyCharges"].dtype

dtype('float64')

In [20]:
df["TotalCharges"].dtype

dtype('O')

In [21]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

In [22]:
df["TotalCharges"].dtype

dtype('float64')

In [23]:
pd.crosstab(df[col], df["Churn"], normalize="index")

Churn,No,Yes
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1.0,0.0
Yes,0.0,1.0


In [None]:
df["PaperlessBilling"]

In [None]:
requiresMapping = ["gender","Partner","Dependents","PhoneService","PaperlessBilling","Churn"]

In [30]:
# df["Partner"] = df["Partner"].map({'No':0, 'Yes':1})
# df["Partner"].value_counts()

df[requiresMapping] = df[requiresMapping].replace({"No":0, "Yes":1})
# df[requiresMapping].value_

  df[requiresMapping] = df[requiresMapping].replace({"No":0, "Yes":1})


In [31]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,1,0,1,0,No phone service,DSL,No,...,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,0,0,34,1,No,DSL,Yes,...,Yes,No,No,No,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,0,0,2,1,No,DSL,Yes,...,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,0,0,45,0,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,0,0,2,1,No,Fiber optic,No,...,No,No,No,No,Month-to-month,1,Electronic check,70.7,151.65,1


In [32]:
internet_cols = [
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies"
]

df[internet_cols] = df[internet_cols].replace(
    {"No internet service": "No"}
)

df[internet_cols] = df[internet_cols].replace(
    {"Yes": 1, "No": 0}
)

  df[internet_cols] = df[internet_cols].replace(


In [2]:
import pandas as pd
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Load data
df = pd.read_csv("../data/customer_churn.csv")

# Drop ID
df = df.drop(columns=["customerID"])

# Fix TotalCharges
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())

# Map Yes/No
yes_no_cols = ["gender","Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn"]
df[yes_no_cols] = df[yes_no_cols].replace({"Yes": 1, "No": 0, "Male":1,"Female":0})

# Replace "No internet service"
internet_cols = [
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies"
]

df[internet_cols] = df[internet_cols].replace({"No internet service": "No"})

# Features & target
X = df.drop("Churn", axis=1)
y = df["Churn"]

# Column groups
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
cat_cols = [
    "MultipleLines", "InternetService",
    "OnlineSecurity", "OnlineBackup",
    "DeviceProtection", "TechSupport",
    "StreamingTV", "StreamingMovies",
    "Contract", "PaymentMethod"
]

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(drop="first"), cat_cols)
    ],
    remainder="passthrough"  # keeps binary columns
)

# Full pipeline
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=1000, class_weight="balanced"))
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_test)

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save full pipeline
os.makedirs("models", exist_ok=True)
joblib.dump(pipeline, "models/churn_pipeline.pkl")

  df[yes_no_cols] = df[yes_no_cols].replace({"Yes": 1, "No": 0, "Male":1,"Female":0})


Accuracy: 0.7508871540099361
F1 Score: 0.6377708978328174
Confusion Matrix:
 [[749 287]
 [ 64 309]]


['models/churn_pipeline.pkl']