Data Prep

In [27]:
#importing packages

import pandas as pd
import matplotlib

In [28]:
#load the Telco churn CSV into a DataFrame and preview the first rows to confirm it parsed

csv_path = "Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [29]:
#Check data type of each column to identify target and fix any discrepancies
# numeric columns: SeniorCitizen, tenure, MonthlyCharges
# TotalCharges holds charge amounts but is listed as object (text) below, so it needs converting
# y: Churn

df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [30]:
#Removing rows with a blank "TotalCharges" and converting the rest of the column from string to float
#
# pd.to_numeric(errors='coerce') turns any blank / non-numeric entry into NaN, so
# every such row is dropped (not only the exact one-space string ' ').
# .assign() builds a new frame instead of writing into a filtered slice, which
# avoids SettingWithCopyWarning and keeps this cell safe to re-run.

total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
has_total = total_charges.notna()
df = df.loc[has_total].assign(TotalCharges=total_charges[has_total])

In [31]:
#Listing the distinct values of every categorical column (target 'Churn' included)
# list_col is reused later as the set of columns to one-hot encode.

list_col = ['Churn',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']
# The loop variable must not be called `str`: in a notebook it stays in the
# kernel namespace and would shadow the builtin str() for every later cell.
for col in list_col:
    print(df[col].unique())

['No' 'Yes']
['Female' 'Male']
['Yes' 'No']
['No' 'Yes']
['No' 'Yes']
['No phone service' 'No' 'Yes']
['DSL' 'Fiber optic' 'No']
['No' 'Yes' 'No internet service']
['Yes' 'No' 'No internet service']
['No' 'Yes' 'No internet service']
['No' 'Yes' 'No internet service']
['No' 'Yes' 'No internet service']
['No' 'Yes' 'No internet service']
['Month-to-month' 'One year' 'Two year']
['Yes' 'No']
['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']


In [32]:
#one-hot encoding the categorical columns into 1/0 indicator columns
# dtype=int pins the indicators to integers; recent pandas releases default to
# True/False, which would turn df.values into an object array further down.

df = pd.get_dummies(df, columns=list_col, dtype=int)
df.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn_No,Churn_Yes,gender_Female,gender_Male,Partner_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,1,29.85,29.85,1,0,1,0,0,...,0,1,0,0,0,1,0,0,1,0
1,5575-GNVDE,0,34,56.95,1889.5,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,1
2,3668-QPYBK,0,2,53.85,108.15,0,1,0,1,1,...,0,1,0,0,0,1,0,0,0,1
3,7795-CFOCW,0,45,42.3,1840.75,1,0,0,1,1,...,0,0,1,0,1,0,1,0,0,0
4,9237-HQITU,0,2,70.7,151.65,0,1,1,0,1,...,0,1,0,0,0,1,0,0,1,0


In [33]:
## "customerID" is supposed to be unique per customer, so look for repeated IDs

# Boolean mask: True for every row whose customerID occurs more than once
# (keep=False flags all occurrences, not just the later ones)
duplicates = df.duplicated(subset='customerID', keep=False)

# Rows selected by the mask; an empty frame means there are no repeated IDs
duplicate_rows = df.loc[duplicates]

print(duplicate_rows)

Empty DataFrame
Columns: [customerID, SeniorCitizen, tenure, MonthlyCharges, TotalCharges, Churn_No, Churn_Yes, gender_Female, gender_Male, Partner_No, Partner_Yes, Dependents_No, Dependents_Yes, PhoneService_No, PhoneService_Yes, MultipleLines_No, MultipleLines_No phone service, MultipleLines_Yes, InternetService_DSL, InternetService_Fiber optic, InternetService_No, OnlineSecurity_No, OnlineSecurity_No internet service, OnlineSecurity_Yes, OnlineBackup_No, OnlineBackup_No internet service, OnlineBackup_Yes, DeviceProtection_No, DeviceProtection_No internet service, DeviceProtection_Yes, TechSupport_No, TechSupport_No internet service, TechSupport_Yes, StreamingTV_No, StreamingTV_No internet service, StreamingTV_Yes, StreamingMovies_No, StreamingMovies_No internet service, StreamingMovies_Yes, Contract_Month-to-month, Contract_One year, Contract_Two year, PaperlessBilling_No, PaperlessBilling_Yes, PaymentMethod_Bank transfer (automatic), PaymentMethod_Credit card (automatic), PaymentMe

In [34]:
## "customerID" is only an identifier and carries no signal for the analysis, so remove it

df = df.drop(columns=['customerID'])

In [35]:
## Keep a single target column: discard "Churn_No" and rename "Churn_Yes" to plain "Churn"

df = (
    df
    .drop(columns=['Churn_No'])
    .rename(columns={'Churn_Yes': 'Churn'})
)

In [36]:
# Preview the first rows of the current frame
df.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
1,0,34,56.95,1889.5,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,0,1,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,0,45,42.3,1840.75,0,0,1,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,0,2,70.7,151.65,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0


In [39]:
#moving target column to be first for simplicity

columns = df.columns.tolist()

# Pull 'Churn' out of its current position and put it at the front of the list
columns.insert(0, columns.pop(columns.index('Churn')))

# Reordering the list alone does nothing to the frame; re-select the columns in
# the new order so that 'Churn' really is column 0 of df.
df = df[columns]

In [None]:

#Extracting Independent (target) Variable  

x= df.iloc[:, 0].values  

  

#Extracting Dependent variable  

y= df.iloc[:, 1:].values  

In [None]:
#standardizing the numeric variables (to make sure large variables like 'total charges' don't influence the future model too much) - output will be between -1 and 1

from sklearn.preprocessing import StandardScaler  

st_x= StandardScaler()  

x_train= st_x.fit_transform(x_train) 

x_test= st_x.transform(x_test)

In [11]:
#splitting the dataset into test and train subsets with a 70:30 ratio

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.3, random_state=0)

#1

NameError: name 'x' is not defined