+ Import the required libraries and modules that you would need.
+ Read that data into Python and call the dataframe churnData.
+ Check the datatypes of all the columns in the data. You would see that the column TotalCharges is object  type. Convert this column into numeric type using pd.to_numeric function.
+ Check for null values in the dataframe. Replace the null values.
+ Use the following features: tenure, SeniorCitizen, MonthlyCharges and TotalCharges:

    Scale the features either by using normalizer or a standard scaler.
    Split the data into a training set and a test set.
    Fit a logistic regression model on the training data."
    Check the accuracy on the test data


In [43]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer
from sklearn.utils import resample
import statsmodels.api as sm
# from sklearn.k doenst work

In [2]:
churnData= pd.read_csv("./DATA_Customer-Churn.csv")

In [3]:
churnData.dtypes #checking dtypes - > convert totalcharges

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [4]:
#convert total charge

churnData["TotalCharges"] =churnData["TotalCharges"].apply(pd.to_numeric, errors='coerce')

In [5]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [6]:
#checking null values

churnData.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [7]:
churnData['TotalCharges'] = churnData['TotalCharges'].replace([' '],'0')

In [8]:
churnData.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [9]:
churnData.shape

(7043, 16)

In [10]:
churnData=churnData.dropna(axis="index")

In [11]:
churnData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [12]:
#Use the following features: tenure, SeniorCitizen, MonthlyCharges and TotalCharges ->numerical

#Scale the features either by using normalizer or a standard scaler.

churn_feat=churnData[["tenure" , "SeniorCitizen", "MonthlyCharges", "TotalCharges"]]

In [13]:
churn_feat

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,1,0,29.85,29.85
1,34,0,56.95,1889.50
2,2,0,53.85,108.15
3,45,0,42.30,1840.75
4,2,0,70.70,151.65
...,...,...,...,...
7038,24,0,84.80,1990.50
7039,72,0,103.20,7362.90
7040,11,0,29.60,346.45
7041,4,1,74.40,306.60


In [14]:

X=churnData[["tenure" , "SeniorCitizen", "MonthlyCharges", "TotalCharges"]]
y=churnData["Churn"]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) #split data

In [16]:
scaler = StandardScaler() #transform data
scaler.fit(X_train)
X_test = scaler.transform(X_test)
X_train = scaler.transform(X_train)

In [17]:
#    Fit a logistic regression model on the training data."


clf= LogisticRegression(random_state=0).fit(X, y)#classification
clf.predict(X_test)
clf.score(X, y)


0.7915244596131968

In [18]:
#Check the accuracy on the test data


from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

In [19]:
dt = DecisionTreeClassifier(max_depth=3)

dt.fit(X_train, y_train)

y_pred_train_dt = dt.predict(X_train)
y_pred_test_dt = dt.predict(X_test)
print("The Accuracy on the test data:" , dt.score(X_test, y_test))

The Accuracy on the test data: 0.771563981042654


Note: So far we have not balanced the data.

Managing imbalance in the dataset

    Check for the imbalance.
    Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.
    Each time fit the model and see how the accuracy of the model is.


In [20]:
# check for the imbalance : #huge inbalance Category 0 ->5174

churnData['Churn'].value_counts()

No     5163
Yes    1869
Name: Churn, dtype: int64

### Upsampling Smote

In [21]:
y_smote = churnData['Churn']
X_smote = churnData[['TotalCharges','tenure', 'SeniorCitizen','MonthlyCharges']]

In [22]:


from imblearn.over_sampling import SMOTE
smote = SMOTE()



In [23]:

Xsm, ysm = smote.fit_resample(X, y)
ysm.value_counts()

No     5163
Yes    5163
Name: Churn, dtype: int64

### Upsamling smote fitting model

In [24]:
Xsm_train, Xsm_test, ysm_train, ysm_test = train_test_split(Xsm, ysm, test_size=0.30, random_state=42) #split data

In [25]:
clf= LogisticRegression(random_state=0).fit(Xsm, ysm)#classification
clf.predict(Xsm_test)
clf.score(Xsm, ysm)

0.7289366647298082

In [26]:
dt = DecisionTreeClassifier(max_depth=3)

dt.fit(Xsm_train, ysm_train)

clf= LogisticRegression(random_state=0).fit(Xsm, ysm)#classification
clf.predict(Xsm_test)
clf.score(Xsm, ysm)
print("The Accuracy of the Smote Model:" , dt.score(Xsm_test, ysm_test))

The Accuracy of the Smote Model: 0.7220787604906391


## Downsampling -Tomeklinks

In [27]:
tl = TomekLinks('majority')
X_tl, y_tl = tl.fit_resample(X, y)
y_tl.value_counts()

No     4609
Yes    1869
Name: Churn, dtype: int64

## Downsampling

In [38]:
category_Yes = churnData[churnData['Churn'] == "Yes"]
category_No = churnData[churnData['Churn'] == "No"]

In [40]:
print(category_Yes.shape) #smaller
print(category_No.shape) #bigger

(1869, 16)
(5163, 16)


In [44]:
category_Yes_down = resample(category_Yes, replace=True,n_samples = len(category_No))

In [46]:
print(category_Yes_down.shape)
print(category_No.shape)

(5163, 16)
(5163, 16)


In [47]:
churnData_down = pd.concat([category_Yes_down, category_No], axis=0)
churnData_down = churnData_down.sample(frac=1)
churnData_down["Churn"].value_counts()

No     5163
Yes    5163
Name: Churn, dtype: int64

## Model

In [48]:
ydown = churnData['Churn']
Xdown = churnData[['TotalCharges','tenure', 'SeniorCitizen','MonthlyCharges']]

In [None]:
Xdown_train, Xdown_test, ydown_train, ydown_test = train_test_split(Xsm, ysm, test_size=0.30, random_state=42) #split data