In [None]:
import kagglehub
import pandas as pd
import os
import numpy as np
from pathlib import Path

KAGGLE_DATASET = "blastchar/telco-customer-churn"

# Optional Kaggle download, currently disabled: the notebook reads the local
# CSV under DataBase/ instead of fetching KAGGLE_DATASET.
# try:
#     dataset_dir = kagglehub.dataset_download(KAGGLE_DATASET)
#     print(f"Dataset downloaded to: {dataset_dir}")
# except Exception as e:
#     raise RuntimeError(f"Failed to download Kaggle dataset: {e}")

# All paths are resolved against the directory the kernel is running in.
BASE_DIR = Path.cwd()
data_file = BASE_DIR.joinpath("DataBase", "Churn_raw_data.csv")

# Stop early, reporting the fully resolved path, when the raw CSV is absent.
if not data_file.is_file():
    raise FileNotFoundError(f"Data file not found: {data_file}")

# Read the raw churn table and report its (rows, columns) shape.
df = pd.read_csv(data_file, encoding="utf-8")
print("Data loaded successfully:", df.shape)

# Echo the working directory so the resolved BASE_DIR can be checked by eye.
print("Current working directory:", os.getcwd())

  from .autonotebook import tqdm as notebook_tqdm


Data loaded successfully: (7043, 21)
Current working directory: d:\KMITL\year_4\Intro_to_DL_and_ML\Project\Telco-Churn-Prediction-Model-Intro-to-ML-and-DL-Project-


In [4]:

# Preview the first five rows (five is the default for DataFrame.head).
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
# Show the column labels of the loaded frame.
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [6]:
# Explore one categorical column to see what unique(), nunique() and
# value_counts() return (both their types and their values).
tech_support = df['TechSupport']
distinct_values = tech_support.unique()
distinct_count = tech_support.nunique()
frequency = tech_support.value_counts()

print(type(distinct_values))
print(type(distinct_count))
print(distinct_values)
print(distinct_count)

print(frequency)
print(type(frequency))

<class 'numpy.ndarray'>
<class 'int'>
['No' 'Yes' 'No internet service']
3
TechSupport
No                     3473
Yes                    2044
No internet service    1526
Name: count, dtype: int64
<class 'pandas.core.series.Series'>


In [7]:
# Print the frequency table of every column, in column order.
for _, column_values in df.items():
    print(column_values.value_counts())

customerID
7590-VHVEG    1
5575-GNVDE    1
3668-QPYBK    1
7795-CFOCW    1
9237-HQITU    1
             ..
6840-RESVB    1
2234-XADUH    1
4801-JZAZL    1
8361-LTMKD    1
3186-AJIEK    1
Name: count, Length: 7043, dtype: int64
gender
Male      3555
Female    3488
Name: count, dtype: int64
SeniorCitizen
0    5901
1    1142
Name: count, dtype: int64
Partner
No     3641
Yes    3402
Name: count, dtype: int64
Dependents
No     4933
Yes    2110
Name: count, dtype: int64
tenure
1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: count, Length: 73, dtype: int64
PhoneService
Yes    6361
No      682
Name: count, dtype: int64
MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64
InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: count, dtype: int64
OnlineSecurity
No                     3498
Yes                    2019
No internet service    15

In [8]:
# Make numpy print arrays in full instead of eliding the middle.
np.set_printoptions(threshold=np.inf)

# For each column, count cells that are null or blank/whitespace-only once
# rendered as text.
separator = "-" * 40
for col in df.columns:
    column_values = df[col]
    is_null = column_values.isnull()
    is_blank = column_values.astype(str).str.strip().eq("")
    mask = is_null | is_blank

    print(f"Column: {col}")
    print("Total missing/blank:", mask.sum())
    print(separator)


Column: customerID
Total missing/blank: 0
----------------------------------------
Column: gender
Total missing/blank: 0
----------------------------------------
Column: SeniorCitizen
Total missing/blank: 0
----------------------------------------
Column: Partner
Total missing/blank: 0
----------------------------------------
Column: Dependents
Total missing/blank: 0
----------------------------------------
Column: tenure
Total missing/blank: 0
----------------------------------------
Column: PhoneService
Total missing/blank: 0
----------------------------------------
Column: MultipleLines
Total missing/blank: 0
----------------------------------------
Column: InternetService
Total missing/blank: 0
----------------------------------------
Column: OnlineSecurity
Total missing/blank: 0
----------------------------------------
Column: OnlineBackup
Total missing/blank: 0
----------------------------------------
Column: DeviceProtection
Total missing/blank: 0
-------------------------------

In [9]:
# Two row-wise boolean masks over TotalCharges:
#   missing_value_column -> value is empty once surrounding whitespace is stripped
#   null_column          -> value is a real null
total_charges = df["TotalCharges"]
missing_value_column = total_charges.astype(str).str.strip().eq("")
null_column = total_charges.isnull()

# Only the true-null mask is printed here.
print(null_column)

0       False
1       False
2       False
3       False
4       False
        ...  
7038    False
7039    False
7040    False
7041    False
7042    False
Name: TotalCharges, Length: 7043, dtype: bool


In [None]:
# Collect every row whose TotalCharges is unusable: either a real null or a
# blank / whitespace-only string (strings matching ^\s*$ are mapped to NA
# before the null test, so one mask covers both cases).
missing_mask = df["TotalCharges"].replace(r'^\s*$', pd.NA, regex=True).isnull()

# Kept for backward compatibility with the previous version of this cell.
null_row_df = df[df["TotalCharges"].isnull()]  # true nulls only
miss_row = df[missing_mask]                    # true nulls + blank strings

# missing_mask already includes the true nulls, so a single boolean selection
# yields the full set. This replaces concat(...).drop_duplicates(), which
# compared whole rows by value (not by index) and moved null rows to the front.
missing_data_row = df[missing_mask]
display(missing_data_row)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [11]:
# Narrow the missing-TotalCharges rows down to the columns that help explain
# why the value is absent.
inspect_columns = ['tenure', 'Contract', 'MonthlyCharges', 'TotalCharges', 'Churn']
inspect_df = missing_data_row.loc[:, inspect_columns]
display(inspect_df)

# Index labels of the affected rows; shown as the cell's result.
missing_row_list = inspect_df.index.tolist()
missing_row_list

Unnamed: 0,tenure,Contract,MonthlyCharges,TotalCharges,Churn
488,0,Two year,52.55,,No
753,0,Two year,20.25,,No
936,0,Two year,80.85,,No
1082,0,Two year,25.75,,No
1340,0,Two year,56.05,,No
3331,0,Two year,19.85,,No
3826,0,Two year,25.35,,No
4380,0,Two year,20.0,,No
5218,0,One year,19.7,,No
6670,0,Two year,73.35,,No


[488, 753, 936, 1082, 1340, 3331, 3826, 4380, 5218, 6670, 6754]

In [12]:
# Convert TotalCharges from text to float32, filling blank entries with 0.
#
# Any empty or whitespace-only string is treated as blank (pattern ^\s*$),
# the same definition used when the missing rows were detected. The earlier
# literal replace(" ", 0) only matched a string that is exactly one space, so
# any other blank would have made the float cast raise ValueError.
# The regex only substitutes on string values, so re-running this cell on the
# already-numeric column leaves it unchanged.
df['TotalCharges'] = (
    df['TotalCharges']
    .replace(r'^\s*$', 0, regex=True)
    .astype('float32')
)

# missing_row_list holds index *labels*, so select with .loc; .iloc would
# interpret them as positions and is only equivalent on a default RangeIndex.
subset = df.loc[missing_row_list, ['tenure', 'Contract', 'MonthlyCharges', 'TotalCharges', 'Churn']]
subset

Unnamed: 0,tenure,Contract,MonthlyCharges,TotalCharges,Churn
488,0,Two year,52.55,0.0,No
753,0,Two year,20.25,0.0,No
936,0,Two year,80.85,0.0,No
1082,0,Two year,25.75,0.0,No
1340,0,Two year,56.05,0.0,No
3331,0,Two year,19.85,0.0,No
3826,0,Two year,25.35,0.0,No
4380,0,Two year,20.0,0.0,No
5218,0,One year,19.7,0.0,No
6670,0,Two year,73.35,0.0,No


In [13]:
# Write the cleaned frame next to the raw data, without the index column.
save_file = BASE_DIR.joinpath("DataBase", "Cleaned_data.csv")
df.to_csv(save_file, index=False)