In [3]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import TomekLinks
import warnings
import os
warnings.filterwarnings('ignore')

In [4]:
os.getcwd()

'/Users/Kumar/Desktop/lab-cross-validation'

In [5]:
df=pd.read_csv(r"files_for_lab/Customer-churn.csv")
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [6]:
df['Churn'].value_counts() # As we can see, the data is imbalanced. 

No     5174
Yes    1869
Name: Churn, dtype: int64

In [7]:
col_names=df.columns.values.tolist()
for col in range(len(col_names)):
    col_names[col]=col_names[col].lower()
    col_names[col]=col_names[col].replace(" ", "_")

In [8]:
for i in range(len(col_names)):
    df.rename(columns={df.columns.values[i]:col_names[i]},inplace=True)

In [9]:
# We are going to apply SMOTE to balance the data

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   seniorcitizen     7043 non-null   int64  
 2   partner           7043 non-null   object 
 3   dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   phoneservice      7043 non-null   object 
 6   onlinesecurity    7043 non-null   object 
 7   onlinebackup      7043 non-null   object 
 8   deviceprotection  7043 non-null   object 
 9   techsupport       7043 non-null   object 
 10  streamingtv       7043 non-null   object 
 11  streamingmovies   7043 non-null   object 
 12  contract          7043 non-null   object 
 13  monthlycharges    7043 non-null   float64
 14  totalcharges      7043 non-null   object 
 15  churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [10]:
df.isnull().sum() #we cannot see any kind of nulls 

gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [11]:
#we need to stadadize the numerical columns and encode the categorical columns
#we can see that monthly charges,total charges, tenure and seniorization are  categorical variables

In [12]:
df = df[df.totalcharges != " "]

In [13]:
df["totalcharges"].value_counts()

20.2      11
19.75      9
20.05      8
19.9       8
19.65      8
          ..
6849.4     1
692.35     1
130.15     1
3211.9     1
6844.5     1
Name: totalcharges, Length: 6530, dtype: int64

In [14]:
df['totalcharges'] = df['totalcharges'].astype(float)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   object 
 1   seniorcitizen     7032 non-null   int64  
 2   partner           7032 non-null   object 
 3   dependents        7032 non-null   object 
 4   tenure            7032 non-null   int64  
 5   phoneservice      7032 non-null   object 
 6   onlinesecurity    7032 non-null   object 
 7   onlinebackup      7032 non-null   object 
 8   deviceprotection  7032 non-null   object 
 9   techsupport       7032 non-null   object 
 10  streamingtv       7032 non-null   object 
 11  streamingmovies   7032 non-null   object 
 12  contract          7032 non-null   object 
 13  monthlycharges    7032 non-null   float64
 14  totalcharges      7032 non-null   float64
 15  churn             7032 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

In [42]:
# categorical features and numerical ones are going to be treated differently.
# we split every column except the target in numerical and categorical
df_num = df.select_dtypes(include = np.number)
df_cat = df.select_dtypes(include = object)


In [43]:
df_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   seniorcitizen   7032 non-null   int64  
 1   tenure          7032 non-null   int64  
 2   monthlycharges  7032 non-null   float64
 3   totalcharges    7032 non-null   float64
dtypes: float64(2), int64(2)
memory usage: 274.7 KB


In [44]:
scaler=StandardScaler()

In [45]:
scaled=scaler.fit_transform(df_num)

In [51]:
df_num = pd.DataFrame(scaled,columns=df_num.columns)

In [52]:
df_num_st.head(3)

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges
0,-0.440327,-1.280248,-1.161694,-0.994194
1,-0.440327,0.064303,-0.260878,-0.17374
2,-0.440327,-1.239504,-0.363923,-0.959649


In [47]:
#Let's drop some useless columns
df_cat = df.select_dtypes(include = 'object')
df_cat


Unnamed: 0,gender,partner,dependents,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,churn
0,Female,Yes,No,No,No,Yes,No,No,No,No,Month-to-month,No
1,Male,No,No,Yes,Yes,No,Yes,No,No,No,One year,No
2,Male,No,No,Yes,Yes,Yes,No,No,No,No,Month-to-month,Yes
3,Male,No,No,No,Yes,No,Yes,Yes,No,No,One year,No
4,Female,No,No,Yes,No,No,No,No,No,No,Month-to-month,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,No
7039,Female,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,One year,No
7040,Female,Yes,Yes,No,Yes,No,No,No,No,No,Month-to-month,No
7041,Male,Yes,No,Yes,No,No,No,No,No,No,Month-to-month,Yes


In [48]:
#Let's drop some useless columns
df_cat = df_cat.drop(['churn'], axis=1)
df_cat

Unnamed: 0,gender,partner,dependents,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract
0,Female,Yes,No,No,No,Yes,No,No,No,No,Month-to-month
1,Male,No,No,Yes,Yes,No,Yes,No,No,No,One year
2,Male,No,No,Yes,Yes,Yes,No,No,No,No,Month-to-month
3,Male,No,No,No,Yes,No,Yes,Yes,No,No,One year
4,Female,No,No,Yes,No,No,No,No,No,No,Month-to-month
...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,One year
7039,Female,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,One year
7040,Female,Yes,Yes,No,Yes,No,No,No,No,No,Month-to-month
7041,Male,Yes,No,Yes,No,No,No,No,No,No,Month-to-month


In [49]:
#One Hot/Label Encoding
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first').fit(df_cat)
encoded = encoder.transform(df_cat).toarray()
cols = encoder.get_feature_names(input_features=df_cat.columns)
onehot_encoded = pd.DataFrame(encoded, columns=cols)
onehot_encoded#.head()

Unnamed: 0,gender_Male,partner_Yes,dependents_Yes,phoneservice_Yes,onlinesecurity_No internet service,onlinesecurity_Yes,onlinebackup_No internet service,onlinebackup_Yes,deviceprotection_No internet service,deviceprotection_Yes,techsupport_No internet service,techsupport_Yes,streamingtv_No internet service,streamingtv_Yes,streamingmovies_No internet service,streamingmovies_Yes,contract_One year,contract_Two year
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
7028,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
7029,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7030,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
X_concat=np.concatenate([df_num,encoded], axis=1)

In [54]:
#x-y split 
X_train, X_test, y_train, y_test = train_test_split(X_concat, y, test_size=0.3, random_state=42)

In [55]:
#lets use smote tecnique to balance the dataset
smote=SMOTE()


In [56]:
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
y_train_sm.value_counts()

Yes    3614
No     3614
Name: churn, dtype: int64

In [57]:
#Logistic regression
classification = LogisticRegression(random_state=42, max_iter=10000)
classification.fit(X_train_sm, y_train_sm)

y_sm_predictions = classification.predict(X_test)
print(classification_report(y_test, y_sm_predictions))

              precision    recall  f1-score   support

          No       0.91      0.72      0.80      1549
         Yes       0.51      0.79      0.62       561

    accuracy                           0.74      2110
   macro avg       0.71      0.76      0.71      2110
weighted avg       0.80      0.74      0.76      2110



In [58]:
#lets use the decision tree and compare the results
clf_model = DecisionTreeClassifier(criterion="gini", random_state=42,max_depth=4, min_samples_leaf=5)   
clf_model.fit(X_train_sm,y_train_sm)

In [59]:

y_predict_dt = clf_model.predict(X_test)


In [60]:
accuracy_score(y_test,y_predict_dt)

0.7180094786729858

In [61]:
print(classification_report(y_test, y_predict_dt))

              precision    recall  f1-score   support

          No       0.89      0.70      0.79      1549
         Yes       0.48      0.76      0.59       561

    accuracy                           0.72      2110
   macro avg       0.69      0.73      0.69      2110
weighted avg       0.78      0.72      0.73      2110



In [None]:
#in both the models the results are similar but we have better resuts in logistic regression

In [62]:
#tomelinks downsampling method
tomek = TomekLinks()

In [63]:
X_train_tl,y_train_tl=tomek.fit_resample(X_train,y_train)

In [64]:
y_train_tl.value_counts()

No     3257
Yes    1308
Name: churn, dtype: int64

In [65]:
classification_lr=LogisticRegression(random_state=42, max_iter=10000)
classification_lr.fit(X_train_tl,y_train_tl)

In [66]:
y_pred_tl=classification_lr.predict(X_test)

In [67]:
accuracy_score(y_test,y_pred_tl)

0.7881516587677725

In [68]:
print(classification_report(y_test, y_pred_tl))

              precision    recall  f1-score   support

          No       0.86      0.84      0.85      1549
         Yes       0.60      0.63      0.61       561

    accuracy                           0.79      2110
   macro avg       0.73      0.74      0.73      2110
weighted avg       0.79      0.79      0.79      2110



In [69]:
clf_model_tl = DecisionTreeClassifier(criterion="gini", random_state=42,max_depth=3, min_samples_leaf=5) 
clf_model_tl.fit(X_train_tl,y_train_tl)

In [70]:
y_predict_dt_tl = clf_model.predict(X_test)

In [71]:
accuracy_score(y_test,y_predict_dt_tl)

0.7180094786729858

In [72]:
print(classification_report(y_test, y_predict_dt_tl))

              precision    recall  f1-score   support

          No       0.89      0.70      0.79      1549
         Yes       0.48      0.76      0.59       561

    accuracy                           0.72      2110
   macro avg       0.69      0.73      0.69      2110
weighted avg       0.78      0.72      0.73      2110



In [None]:
#from using the two modes we can see that 
#when predictic yes values logistic regression has better f1 score and recall values ,
#it has pretty l

