In [146]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LassoCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

In [90]:
import warnings
warnings.filterwarnings("ignore")

# Data Set 1: Telco Churn Data

In [11]:
ds1 = pd.read_csv('Telco-Customer-Churn.csv')
ds1.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In this data set, I can see the overall data.As it can be seen in the head rows, there are obviously some data that are redundant as in the MultipleLines column. and I need to find out if there are more redundant values in the columns. 
At first, I need the information if there are null data in the data set.

In [12]:
ds1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


Total charges is object but it should be float. Thus, i need to tranform it. New customers do not have total payment column value. Therefore TotalCharges can be equal to MonthlyCharges
With using numpy where function missing values are filled.

In [13]:
ds1[ds1.TotalCharges == ' ']
ds1["TotalCharges"] = np.where(ds1.TotalCharges == ' ', ds1.MonthlyCharges, ds1.TotalCharges)
ds1.TotalCharges = ds1.TotalCharges.astype("float64")

All of the data is non-null and most of it is object. There is just a few integer data in the data set. I need integer encoding for the categorical data and dummy encoding if the object that cannot be ordered.

In [14]:
ds1_unique = ds1.nunique().to_frame().reset_index()
ds1_unique.columns = ['Variable','DistinctCount']
ds1_unique.loc[ds1_unique['DistinctCount'] > 2].transpose()

Unnamed: 0,0,5,7,8,9,10,11,12,13,14,15,17,18,19
Variable,customerID,tenure,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaymentMethod,MonthlyCharges,TotalCharges
DistinctCount,7043,73,3,3,3,3,3,3,3,3,3,4,1585,6534


With the information above, I filtered the objects which includes more than three different values and examined them.

In [15]:
str_cols = ds1.select_dtypes(['object']).columns
str_cols

Index(['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn'],
      dtype='object')

In [134]:
for column_name in str_cols.drop('customerID'):
    print(column_name)
    s = ds1[column_name].unique()
    for i in s:
        print(str(i))
    print(" ")

gender
Female
Male
 
Partner
Yes
No
 
Dependents
No
Yes
 
PhoneService
No
Yes
 
MultipleLines
No
Yes
 
InternetService
DSL
Fiber optic
No
 
OnlineSecurity
No
Yes
 
OnlineBackup
Yes
No
 
DeviceProtection
No
Yes
 
TechSupport
No
Yes
 
StreamingTV
No
Yes
 
StreamingMovies
No
Yes
 
Contract
Month-to-month
One year
Two year
 
PaperlessBilling
Yes
No
 
PaymentMethod
Electronic check
Mailed check
Bank transfer (automatic)
Credit card (automatic)
 
Churn
No
Yes
 


I need dummy encoding for all of the columns as they cannot be ordered.

In [25]:
categoricColumns = ['gender', 'Partner', 'Dependents', 'PhoneService','MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup','DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
categoricColumns

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [28]:
#Creating a new dataframe to concat new numerical columns on. 
dummy_ds1 = pd.DataFrame()
#By using a loop concating all columns in a df
for var in categoricColumns:
    dummy_ds1 = pd.concat([dummy_ds1, pd.get_dummies(ds1[var], prefix=var)], axis=1)
dummy_ds1

Unnamed: 0,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,0,0,1,1,0,1,0,1,0,...,0,1,0,0,0,1,0,0,1,0
1,0,1,1,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,1
2,0,1,1,0,1,0,0,1,1,0,...,0,1,0,0,0,1,0,0,0,1
3,0,1,1,0,1,0,1,0,1,0,...,0,0,1,0,1,0,1,0,0,0
4,1,0,1,0,1,0,0,1,1,0,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,0,1,0,1,0,1,0,1,...,1,0,1,0,0,1,0,0,0,1
7039,1,0,0,1,0,1,0,1,0,1,...,1,0,1,0,0,1,0,1,0,0
7040,1,0,0,1,0,1,1,0,1,0,...,0,1,0,0,0,1,0,0,1,0
7041,0,1,0,1,1,0,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1


In [35]:
new_ds1 = pd.concat([dummy_ds1, ds1[['SeniorCitizen','tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']]], axis=1)
new_ds1

Unnamed: 0,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_Yes,...,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,1,0,0,1,1,0,1,0,1,0,...,1,0,0,1,0,0,1,29.85,29.85,No
1,0,1,1,0,1,0,0,1,1,0,...,0,0,0,0,1,0,34,56.95,1889.50,No
2,0,1,1,0,1,0,0,1,1,0,...,1,0,0,0,1,0,2,53.85,108.15,Yes
3,0,1,1,0,1,0,1,0,1,0,...,0,1,0,0,0,0,45,42.30,1840.75,No
4,1,0,1,0,1,0,0,1,1,0,...,1,0,0,1,0,0,2,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,0,1,0,1,0,1,0,1,...,1,0,0,0,1,0,24,84.80,1990.50,No
7039,1,0,0,1,0,1,0,1,0,1,...,1,0,1,0,0,0,72,103.20,7362.90,No
7040,1,0,0,1,0,1,1,0,1,0,...,1,0,0,1,0,0,11,29.60,346.45,No
7041,0,1,0,1,1,0,0,1,0,1,...,1,0,0,0,1,1,4,74.40,306.60,Yes


In [46]:
new_ds1.groupby('Churn').count()

Unnamed: 0_level_0,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_Yes,...,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
No,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174,...,5174,5174,5174,5174,5174,5174,5174,5174,5174,5174
Yes,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869,...,1869,1869,1869,1869,1869,1869,1869,1869,1869,1869


In [92]:
new_ds1['Churn'] = pd.Series(np.where(new_ds1.Churn == 'Yes', 1, 0), new_ds1.index)
new_ds1

Unnamed: 0,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_Yes,...,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,1,0,0,1,1,0,1,0,1,0,...,1,0,0,1,0,0,1,29.85,29.85,0
1,0,1,1,0,1,0,0,1,1,0,...,0,0,0,0,1,0,34,56.95,1889.50,0
2,0,1,1,0,1,0,0,1,1,0,...,1,0,0,0,1,0,2,53.85,108.15,1
3,0,1,1,0,1,0,1,0,1,0,...,0,1,0,0,0,0,45,42.30,1840.75,0
4,1,0,1,0,1,0,0,1,1,0,...,1,0,0,1,0,0,2,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,0,1,0,1,0,1,0,1,...,1,0,0,0,1,0,24,84.80,1990.50,0
7039,1,0,0,1,0,1,0,1,0,1,...,1,0,1,0,0,0,72,103.20,7362.90,0
7040,1,0,0,1,0,1,1,0,1,0,...,1,0,0,1,0,0,11,29.60,346.45,0
7041,0,1,0,1,1,0,0,1,0,1,...,1,0,0,0,1,1,4,74.40,306.60,1


## Train and Test Data

In [94]:
X_train, X_test, y_train, y_test = train_test_split(new_ds1.drop('Churn', axis=1), new_ds1['Churn'], test_size=0.2, random_state=42, stratify=new_ds1.Churn)

## Selected Parameters

In [78]:
param_grid_sgb = dict(max_depth = [2,3,5,7,10],
                  learning_rate = [0.1, 0.2, 0.5, 0.7],
                  n_estimators = [10,30,50,100,200])
param_grid_tree = dict(ccp_alpha = [0.001, 0.002, 0.003, 0.004], 
                       min_samples_leaf = [2,3,5,7,10])
param_grid_rf = dict(n_estimators = [500],
                     min_samples_leaf = [5],
                     max_features = [2,3,4,5])

In [79]:
kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)

# Classifiers and Regressors

#### Lasso

In [105]:
#Lasso
lasso_reg = LassoCV(cv = kfold)
lasso_reg = lasso_reg.fit(np.array(X_train), np.array(y_train))
lasso_reg.alpha_

0.19498318961188135

In [117]:
p_lasso_val = lasso_reg.predict(np.array(X_test))
p_lasso_val = np.where(p_lasso_val > 0.5, 1, 0).tolist()
print(r2_score(p_lasso_val, y_test) / len(y_test))

-0.00038777021401481705


#### Decision Tree

In [135]:
tree_clf = DecisionTreeClassifier()
grid_search_tree_clf = GridSearchCV(tree_clf, param_grid_tree, cv = kfold )
results_tree_clf = grid_search_tree_clf.fit(np.array(X_train), np.array(y_train))

#tree_reg = DecisionTreeRegressor()
#grid_search_tree_reg = GridSearchCV(tree_reg, param_grid_tree, cv = kfold)
#results_tree_reg = grid_search_tree_reg.fit(np.array(X_train), np.array(y_train))


#### Random Forest

In [136]:
rf_clf = RandomForestClassifier()
grid_search_rf_clf = GridSearchCV(rf_clf, param_grid_rf, cv = kfold )
results_rf_clf = grid_search_rf_clf.fit(np.array(X_train), np.array((y_train)))

#rf_reg = RandomForestRegressor()
#grid_search_rf_reg = GridSearchCV(rf_reg, param_grid_rf, cv = kfold)
#results_rf_reg = grid_search_rf_reg.fit(np.array(X_train), np.array(y_train))


#### Stochastic Gradient Boosting

In [137]:
sgb_classifier = xgboost.XGBClassifier(min_child_weight=10, verbosity = 0)
grid_search_clf = GridSearchCV(sgb_classifier, param_grid_sgb, cv = kfold)
results_sgb_clf = grid_search_clf.fit(np.array(X_train), np.array(y_train))

#sgb_regressor = xgboost.XGBRegressor(min_child_weight=10)
#grid_search = GridSearchCV(sgb_regressor, param_grid_sgb, cv = kfold )
#results_sgb_reg = grid_search.fit(np.array(X_train), np.array(y_train))

### Best Parameters

In [173]:
#Classification
print("Best parameters of \n")
print("Alpha Value of Lasso: \n{}\n".format(lasso_reg.alpha_))
print("Best parameters of Decision Tree: \n{}\n".format(results_tree_clf.best_params_))
print("Best parameters of Random Tree: \n{}\n".format(results_rf_clf.best_params_))
print("Best parameters of Stochastic Gradient Boosting: \n{}\n".format(results_sgb_clf.best_params_))

#Regression
#print("Alpha Value of Lasso: \n{}\n".format(lasso_reg.alpha_))
#print("Best parameters of Decision Tree: \n{}\n".format(results_tree_reg.best_params_))
#print("Best parameters of Random Tree: \n{}\n".format(results_rf_reg.best_params_))
#print("Best parameters of Stochastic Gradient Boosting: \n{}\n".format(results_sgb_reg.best_params_))

Best parameters of 

Alpha Value of Lasso: 
0.19498318961188135

Best parameters of Decision Tree: 
{'ccp_alpha': 0.003, 'min_samples_leaf': 2}

Best parameters of Random Tree: 
{'max_features': 5, 'min_samples_leaf': 5, 'n_estimators': 500}

Best parameters of Stochastic Gradient Boosting: 
{'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 30}



### Accuracy Score and  Best Score

In [158]:
p_lasso_val = lasso_reg.predict(np.array(X_test))
p_lasso_val = np.where(p_lasso_val > 0.5, 1, 0).tolist()

d = ["LassoCV", "Decision Tree","Random Forest","Stochastic Gradient Boosting"]
score_table = pd.DataFrame(d, columns=['Models'])

accuracy_list = [accuracy_score(p_lasso_val,(y_test)),
                 accuracy_score(results_tree_clf.best_estimator_.predict(np.array(X_test)),(y_test)),
                 accuracy_score(results_rf_clf.best_estimator_.predict(np.array(X_test)), (y_test)),
                 accuracy_score(results_sgb_clf.best_estimator_.predict(np.array(X_test)), (y_test))]

score_table['Accuracy Score'] = accuracy_list

bestscore_list = [np.nan, results_tree_clf.best_score_, results_rf_clf.best_score_, results_sgb_clf.best_score_]
score_table['Best Score'] = bestscore_list
score_table

Unnamed: 0,Models,Accuracy Score,Best Score
0,LassoCV,0.775018,
1,Decision Tree,0.782115,0.790016
2,Random Forest,0.799858,0.804755
3,Stochastic Gradient Boosting,0.801987,0.80387


### Classification Report

In [140]:
from sklearn.metrics import classification_report
print('Decision Tree')
print(classification_report(y_test,results_tree_clf.best_estimator_.predict(np.array(X_test))))
print('Random Forest')
print(classification_report(y_test,results_rf_clf.best_estimator_.predict(np.array(X_test))))
print('Stochastic Gradient Boost')
print(classification_report(y_test,results_sgb_clf.best_estimator_.predict(np.array(X_test))))

Decision Tree
              precision    recall  f1-score   support

           0       0.80      0.94      0.86      1035
           1       0.67      0.35      0.46       374

    accuracy                           0.78      1409
   macro avg       0.74      0.65      0.66      1409
weighted avg       0.77      0.78      0.76      1409

Random Forest
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1035
           1       0.66      0.50      0.57       374

    accuracy                           0.80      1409
   macro avg       0.75      0.70      0.72      1409
weighted avg       0.79      0.80      0.79      1409

Stochastic Gradient Boost
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.66      0.52      0.58       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.73      1409
weighted avg       0

In [None]:
print(r2_score(p_lasso_val, y_test) / len(y_test))

# Comment Section

In the classification data set, to predict if a customer will churn, I chose accuracy as the performance metric of the model. All of the models predict good enough to have an idea about Churn since all of them are in between 77 % 80. However, Stochastic Gradient Boosting model has best accuracy with 0.80.

Second best model is the Random Forest compared to Decision Tree. However, all of them can be tuned by the parameter to give better results.