In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Logistic Regression

In [2]:
#Check the datatypes of all the columns in the data
churnData = pd.read_csv("DATA_Customer-Churn.csv")
churnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [3]:
#Check the datatypes of all the columns in the data
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [4]:
#Convert TotalCharges column into numeric type using pd.to_numeric function.
churnData["TotalCharges"] = pd.to_numeric(churnData["TotalCharges"], errors="coerce")

In [5]:
#Check for null values in the dataframe
for column in churnData:
    print(churnData[column].isnull().value_counts(dropna=False))

False    7043
Name: gender, dtype: int64
False    7043
Name: SeniorCitizen, dtype: int64
False    7043
Name: Partner, dtype: int64
False    7043
Name: Dependents, dtype: int64
False    7043
Name: tenure, dtype: int64
False    7043
Name: PhoneService, dtype: int64
False    7043
Name: OnlineSecurity, dtype: int64
False    7043
Name: OnlineBackup, dtype: int64
False    7043
Name: DeviceProtection, dtype: int64
False    7043
Name: TechSupport, dtype: int64
False    7043
Name: StreamingTV, dtype: int64
False    7043
Name: StreamingMovies, dtype: int64
False    7043
Name: Contract, dtype: int64
False    7043
Name: MonthlyCharges, dtype: int64
False    7032
True       11
Name: TotalCharges, dtype: int64
False    7043
Name: Churn, dtype: int64


In [67]:
#Replace the null values
churnData["TotalCharges"].describe()

count    7043.000000
mean     2283.300441
std      2265.000258
min        18.800000
25%       402.225000
50%      1400.550000
75%      3786.600000
max      8684.800000
Name: TotalCharges, dtype: float64

In [68]:
churnData["TotalCharges"].fillna(churnData["TotalCharges"].mean(), inplace=True)

In [69]:
churnData["TotalCharges"].isnull().value_counts(dropna=False)

False    7043
Name: TotalCharges, dtype: int64

In [70]:
#For tenure, SeniorCitizen, MonthlyCharges and TotalCharges: 
#    -Scale the features either by using normalizer or a standard scaler.
churnData["SeniorCitizen"].value_counts()

0    5901
1    1142
Name: SeniorCitizen, dtype: int64

In [71]:
scaler = StandardScaler()
series_to_scale = churnData[["tenure", "MonthlyCharges", "TotalCharges"]]
scaled_data = scaler.fit_transform(series_to_scale)
scaled_data_df = pd.DataFrame(data=scaled_data, columns=series_to_scale.columns)
scaled_data_df

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.277445,-1.160323,-0.994971
1,0.066327,-0.259629,-0.173876
2,-1.236724,-0.362660,-0.960399
3,0.514251,-0.746535,-0.195400
4,-1.236724,0.197365,-0.941193
...,...,...,...
7038,-0.340876,0.665992,-0.129281
7039,1.613701,1.277533,2.242808
7040,-0.870241,-1.168632,-0.855182
7041,-1.155283,0.320338,-0.872777


In [72]:
churnData_ready = pd.concat([churnData.drop(["tenure","MonthlyCharges", "TotalCharges", "Churn"],axis=1),scaled_data_df], axis=1)
churnData_ready

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,tenure,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,No,No,Yes,No,No,No,No,Month-to-month,-1.277445,-1.160323,-0.994971
1,Male,0,No,No,Yes,Yes,No,Yes,No,No,No,One year,0.066327,-0.259629,-0.173876
2,Male,0,No,No,Yes,Yes,Yes,No,No,No,No,Month-to-month,-1.236724,-0.362660,-0.960399
3,Male,0,No,No,No,Yes,No,Yes,Yes,No,No,One year,0.514251,-0.746535,-0.195400
4,Female,0,No,No,Yes,No,No,No,No,No,No,Month-to-month,-1.236724,0.197365,-0.941193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,-0.340876,0.665992,-0.129281
7039,Female,0,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,One year,1.613701,1.277533,2.242808
7040,Female,0,Yes,Yes,No,Yes,No,No,No,No,No,Month-to-month,-0.870241,-1.168632,-0.855182
7041,Male,1,Yes,No,Yes,No,No,No,No,No,No,Month-to-month,-1.155283,0.320338,-0.872777


In [73]:
churnData_ready["Contract"].value_counts()

Month-to-month    3875
Two year          1695
One year          1473
Name: Contract, dtype: int64

In [74]:
dict = {
        "Yes": 1,
        "No": 0,
        "Female": 1,
        "Male": 0,
        "Month-to-month": 0,
        "One year": 1,
        "Two year": 2
        }
churnData_ready.replace({
    "gender": dict, 
    "Partner": dict, 
    "Dependents": dict, 
    "PhoneService": dict, 
    "OnlineSecurity": dict, 
    "OnlineBackup": dict,
    "DeviceProtection": dict,
    "TechSupport": dict,
    "StreamingTV": dict,
    "StreamingMovies": dict,
    "Contract": dict
}, inplace=True)

In [75]:
churnData_ready = churnData_ready[["tenure", "MonthlyCharges", "TotalCharges"]]

In [76]:
churnData_ready

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.277445,-1.160323,-0.994971
1,0.066327,-0.259629,-0.173876
2,-1.236724,-0.362660,-0.960399
3,0.514251,-0.746535,-0.195400
4,-1.236724,0.197365,-0.941193
...,...,...,...
7038,-0.340876,0.665992,-0.129281
7039,1.613701,1.277533,2.242808
7040,-0.870241,-1.168632,-0.855182
7041,-1.155283,0.320338,-0.872777


In [77]:
X = churnData_ready
y = churnData["Churn"]

In [78]:
#Split the data into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [79]:
#Fit a logistic regression model on the training data.
model = LogisticRegression()
model.fit(X_train,y_train)

LogisticRegression()

In [82]:
#Check the accuracy on the test data.
model.score(X_test, y_test)

0.7926881720430108

# Imbalance in the dataset

In [83]:
churnData["Churn"].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [85]:
ratio = len(churnData[churnData["Churn"]=="No"])/len(churnData["Churn"])
ratio

0.7346301292063041

### Upsampling with SMOTE

In [88]:
from imblearn.over_sampling import SMOTE

In [90]:
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

In [91]:
#Split the data into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
model = LogisticRegression()
model.fit(X_train,y_train)
model.score(X_test, y_test)

0.7926881720430108