### Import Section

In [1]:
import os
import sys

In [2]:
# Record the interpreter version this notebook was executed with.
print(sys.version)

3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]


In [3]:
import numpy as np
import pandas as pd

In [4]:
# Record the NumPy / pandas versions used for this run.
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")

NumPy version: 1.20.3
Pandas version: 1.3.4


In [5]:
from sklearn.model_selection import train_test_split

### Loading Data

In [6]:
# Load the preprocessed churn dataset; the path is relative to the notebook's working directory.
churn = pd.read_csv('Preprocessed_Data/preprocessed_dataset.csv')

### Data View

In [7]:
# Preview the first rows of the loaded frame.
churn.head()

Unnamed: 0,gender,SeniorCitizen,Dependents,tenure,PhoneService,MultipleLines,InternetService,Contract,MonthlyCharges,Churn
0,Female,0,No,1,No,No,DSL,Month-to-month,25,Yes
1,Male,0,No,41,Yes,No,DSL,One year,25,No
2,Female,0,Yes,52,Yes,No,DSL,Month-to-month,19,No
3,Female,0,No,1,Yes,No,DSL,One year,76,Yes
4,Male,0,No,67,Yes,No,Fiber optic,Month-to-month,51,No


In [8]:
# List the column names of the loaded frame.
print(churn.columns)

Index(['gender', 'SeniorCitizen', 'Dependents', 'tenure', 'PhoneService',
       'MultipleLines', 'InternetService', 'Contract', 'MonthlyCharges',
       'Churn'],
      dtype='object')


In [9]:
# (rows, columns) of the loaded frame.
print(churn.shape)

(7043, 10)


In [10]:
# Total number of cells in the frame (rows x columns).
print(churn.size)

70430


In [11]:
# One-line load summary: row and column counts of the frame.
print(f"Loaded rows: {churn.shape[0]} cols: {churn.shape[1]}")

Loaded rows: 7043 cols: 10


In [12]:
# Per-column dtype and non-null counts, plus the frame's memory usage.
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   gender           7043 non-null   object
 1   SeniorCitizen    7043 non-null   int64 
 2   Dependents       7043 non-null   object
 3   tenure           7043 non-null   int64 
 4   PhoneService     7043 non-null   object
 5   MultipleLines    7043 non-null   object
 6   InternetService  7043 non-null   object
 7   Contract         7043 non-null   object
 8   MonthlyCharges   7043 non-null   int64 
 9   Churn            7043 non-null   object
dtypes: int64(3), object(7)
memory usage: 550.4+ KB


In [13]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
churn.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.758768
std,0.368612,24.559481,30.09165
min,0.0,0.0,18.0
25%,0.0,9.0,36.0
50%,0.0,29.0,70.0
75%,0.0,55.0,90.0
max,1.0,72.0,119.0


In [14]:
# Frequency of each category in `gender`.
churn["gender"].value_counts()

Male      3555
Female    3488
Name: gender, dtype: int64

In [15]:
# Frequency of each category in `Dependents`.
churn["Dependents"].value_counts()

No     4933
Yes    2110
Name: Dependents, dtype: int64

In [16]:
# Frequency of each category in `PhoneService`.
churn["PhoneService"].value_counts()

Yes    6361
No      682
Name: PhoneService, dtype: int64

In [17]:
# Frequency of each category in `MultipleLines`.
churn["MultipleLines"].value_counts()

No     4072
Yes    2971
Name: MultipleLines, dtype: int64

In [18]:
# Frequency of each category in `InternetService`.
churn["InternetService"].value_counts()

DSL            3947
Fiber optic    3096
Name: InternetService, dtype: int64

In [19]:
# Frequency of each category in `Contract`.
churn["Contract"].value_counts()

Month-to-month    3875
Two year          1695
One year          1473
Name: Contract, dtype: int64

In [20]:
# Class balance of the target column `Churn`.
churn["Churn"].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [21]:
# Number of null entries in each column of the loaded frame.
missing_counts_per_column = churn.isna().sum()

In [22]:
# Display the per-column null counts computed in the previous cell.
missing_counts_per_column

gender             0
SeniorCitizen      0
Dependents         0
tenure             0
PhoneService       0
MultipleLines      0
InternetService    0
Contract           0
MonthlyCharges     0
Churn              0
dtype: int64

### Data Cleaning

In [23]:
# Force the expected numeric columns to a numeric dtype; values that cannot be
# parsed become NaN (errors="coerce"). Columns absent from the frame are skipped.
for col_name in ("tenure", "MonthlyCharges", "SeniorCitizen"):
    if col_name not in churn.columns:
        continue
    churn[col_name] = pd.to_numeric(churn[col_name], errors="coerce")

In [24]:
# Impute missing values in every numeric column with that column's median.
# NOTE(review): the medians are taken over the whole frame, i.e. before the
# train/test split performed later in this notebook — confirm this is acceptable.
num_cols = list(churn.select_dtypes(include=["number"]).columns)
for col_name in num_cols:
    column_median = churn[col_name].median()
    churn[col_name] = churn[col_name].fillna(column_median)

In [25]:
# Replace missing values in every object/category column with the literal
# placeholder "Missing". `cat_cols` is reused later to pick the columns to encode.
cat_cols = list(churn.select_dtypes(include=["object", "category"]).columns)
for col_name in cat_cols:
    churn[col_name] = churn[col_name].fillna("Missing")

In [26]:
# Derive the integer target column `Churn_binary` from `Churn` (Yes -> 1, No -> 0).
if "Churn" in churn.columns:
    churn_label_codes = churn["Churn"].map({"Yes": 1, "No": 0})

    # Any label other than "Yes"/"No" (including the "Missing" placeholder written
    # by the categorical fill step) maps to NaN. Silently turning those rows into
    # the negative class would corrupt the target, so fail loudly instead.
    unmapped_labels = churn.loc[churn_label_codes.isna(), "Churn"].unique().tolist()
    if unmapped_labels:
        raise ValueError(
            f"Unexpected values in 'Churn' (expected 'Yes'/'No'): {unmapped_labels}"
        )

    churn["Churn_binary"] = churn_label_codes.astype(int)

### One-hot encode

In [27]:
# Categorical feature columns to one-hot encode: every categorical column
# except the raw target label "Churn".
to_encode = [col_name for col_name in cat_cols if col_name != "Churn"]

In [28]:
# Display the list of columns selected for one-hot encoding.
to_encode

['gender',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'Contract']

In [29]:
# One-hot encode the categorical features. The raw "Churn" label is dropped first
# (if present); all dummy levels are kept (drop_first=False).
churn_enc = pd.get_dummies(
    churn.drop(columns=["Churn"], errors="ignore"),
    columns=to_encode,
    drop_first=False,
)

In [30]:
# List the columns of the encoded frame (numeric columns plus one dummy column per category level).
print(churn_enc.columns)

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'Churn_binary',
       'gender_Female', 'gender_Male', 'Dependents_No', 'Dependents_Yes',
       'PhoneService_No', 'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_Yes', 'InternetService_DSL',
       'InternetService_Fiber optic', 'Contract_Month-to-month',
       'Contract_One year', 'Contract_Two year'],
      dtype='object')


In [31]:
# Display the encoded frame (pandas truncates the rendering for long frames).
churn_enc

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,Churn_binary,gender_Female,gender_Male,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,Contract_Month-to-month,Contract_One year,Contract_Two year
0,0,1,25,1,1,0,1,0,1,0,1,0,1,0,1,0,0
1,0,41,25,0,0,1,1,0,0,1,1,0,1,0,0,1,0
2,0,52,19,0,1,0,0,1,0,1,1,0,1,0,1,0,0
3,0,1,76,1,1,0,1,0,0,1,1,0,1,0,0,1,0
4,0,67,51,0,0,1,1,0,0,1,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,95,1,0,1,1,0,0,1,0,1,1,0,0,1,0
7039,0,23,91,0,1,0,0,1,0,1,0,1,0,1,0,1,0
7040,0,12,21,0,0,1,0,1,0,1,1,0,1,0,1,0,0
7041,1,12,99,1,0,1,1,0,0,1,0,1,0,1,1,0,0


### Prepare Features and Target

In [32]:
# Feature columns: every column of the encoded frame except the target `Churn_binary`
# (errors="ignore" keeps this a no-op when the target column is absent).
feature_cols = churn_enc.columns.drop("Churn_binary", errors="ignore").tolist()

In [33]:
# Display the list of feature column names.
feature_cols

['SeniorCitizen',
 'tenure',
 'MonthlyCharges',
 'gender_Female',
 'gender_Male',
 'Dependents_No',
 'Dependents_Yes',
 'PhoneService_No',
 'PhoneService_Yes',
 'MultipleLines_No',
 'MultipleLines_Yes',
 'InternetService_DSL',
 'InternetService_Fiber optic',
 'Contract_Month-to-month',
 'Contract_One year',
 'Contract_Two year']

In [34]:
# Feature matrix: the selected feature columns of the encoded frame.
X = churn_enc.loc[:, feature_cols]
# Target vector: `Churn_binary` when present, otherwise None (DataFrame.get default).
y = churn_enc.get("Churn_binary")

In [35]:
# Display the feature matrix (pandas truncates the rendering for long frames).
X

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,gender_Female,gender_Male,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,Contract_Month-to-month,Contract_One year,Contract_Two year
0,0,1,25,1,0,1,0,1,0,1,0,1,0,1,0,0
1,0,41,25,0,1,1,0,0,1,1,0,1,0,0,1,0
2,0,52,19,1,0,0,1,0,1,1,0,1,0,1,0,0
3,0,1,76,1,0,1,0,0,1,1,0,1,0,0,1,0
4,0,67,51,0,1,1,0,0,1,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,95,0,1,1,0,0,1,0,1,1,0,0,1,0
7039,0,23,91,1,0,0,1,0,1,0,1,0,1,0,1,0
7040,0,12,21,0,1,0,1,0,1,1,0,1,0,1,0,0
7041,1,12,99,0,1,1,0,0,1,0,1,0,1,1,0,0


In [36]:
# Display the target series.
y

0       1
1       0
2       0
3       1
4       0
       ..
7038    1
7039    0
7040    0
7041    1
7042    0
Name: Churn_binary, Length: 7043, dtype: int32

### Train Test Split

In [37]:
# Hold out 20% of the rows for testing with a fixed seed (random_state=42).
# When a target is available the split is stratified on it; without a target
# only the feature matrix is split and no y_train / y_test are created.
if y is None:
    X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

### Save Data

In [38]:
# Save the preprocessed dataset (missing values handled, categorical variables one-hot encoded)

In [39]:
# Write the encoded frame next to the input file; index=False omits the row index from the CSV.
churn_enc.to_csv(os.path.join("./Preprocessed_Data", "preprocessed_data_with_encoding_categorical.csv"), index=False)

In [40]:
# Training data

In [41]:
# Write the training features to CSV (row index omitted).
# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs("./Training_Data", exist_ok=True)
X_train.to_csv(os.path.join("./Training_Data", "X_train.csv"), index=False)

In [42]:
# Write the training target to CSV (row index omitted).
# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs("./Training_Data", exist_ok=True)
y_train.to_csv(os.path.join("./Training_Data", "y_train.csv"), index=False)

In [43]:
# Testing Data

In [44]:
# Write the test features to CSV (row index omitted).
# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs("./Testing_Data", exist_ok=True)
X_test.to_csv(os.path.join("./Testing_Data", "X_test.csv"), index=False)

In [45]:
# Write the test target to CSV (row index omitted).
# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs("./Testing_Data", exist_ok=True)
y_test.to_csv(os.path.join("./Testing_Data", "y_test.csv"), index=False)