## Building BLR, SVM and Decision Tree model using Telcom data.

We will identify the best model to predict customer churn

In [1]:
import pandas as pd
import numpy as np

In [2]:
# [4] Read the provided CSV file/data set.
df = pd.read_csv("Customer_data.csv") 

# [5] Print the table.
df.head()  

Unnamed: 0,Age,Occupation,Status,Edu,House,Loan,Comm,Month,DOW,Duration,Campaign,PosDays,Last_out,Var_rate,Price_idx,Conf_idx,Month_rate,Quarterly_emp,Target
0,44,blue-collar,married,basic.4y,yes,no,cellular,aug,thu,210,1,999,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,cellular,nov,fri,138,1,999,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,yes,no,cellular,jun,thu,339,3,6,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,cellular,apr,fri,185,2,999,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,yes,no,cellular,aug,fri,137,1,3,success,-2.9,92.201,-31.4,0.869,5076.2,1


In [3]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Age            41188 non-null  int64  
 1   Occupation     41188 non-null  object 
 2   Status         41188 non-null  object 
 3   Edu            41188 non-null  object 
 4   House          41188 non-null  object 
 5   Loan           41188 non-null  object 
 6   Comm           41188 non-null  object 
 7   Month          41188 non-null  object 
 8   DOW            41188 non-null  object 
 9   Duration       41188 non-null  int64  
 10  Campaign       41188 non-null  int64  
 11  PosDays        41188 non-null  int64  
 12  Last_out       41188 non-null  object 
 13  Var_rate       41188 non-null  float64
 14  Price_idx      41188 non-null  float64
 15  Conf_idx       41188 non-null  float64
 16  Month_rate     41188 non-null  float64
 17  Quarterly_emp  41188 non-null  float64
 18  Target

In [4]:
df['Edu'].unique()

array(['basic.4y', 'unknown', 'university.degree', 'high.school',
       'basic.9y', 'professional.course', 'basic.6y', 'illiterate'],
      dtype=object)

In [5]:
# convert Edu column to single word 

df.loc[df['Edu'].str.contains('basic'), 'Edu'] = 'pre-school'
df.loc[df['Edu'].str.contains('professional'), 'Edu'] = 'masters'
df.loc[df['Edu'].str.contains('high'), 'Edu'] = 'high-school'
df.loc[df['Edu'].str.contains('illi'), 'Edu'] = 'other'
df.loc[df['Edu'].str.contains('unknown'), 'Edu'] = 'other'
df.loc[df['Edu'].str.contains('university'), 'Edu'] = 'uni'

df['Edu'].unique()

array(['pre-school', 'other', 'uni', 'high-school', 'masters'],
      dtype=object)

### Dummy variables for categorical columns

In [6]:
df.columns

Index(['Age', 'Occupation', 'Status', 'Edu', 'House', 'Loan', 'Comm', 'Month',
       'DOW', 'Duration', 'Campaign', 'PosDays', 'Last_out', 'Var_rate',
       'Price_idx', 'Conf_idx', 'Month_rate', 'Quarterly_emp', 'Target'],
      dtype='object')

In [7]:
features = ['Occupation', 'Status', 'Edu', 'House', 'Loan', 'Comm', 'Month',
       'DOW', 'Last_out']

new_df = pd.get_dummies(df, columns = features)

new_df.head()

Unnamed: 0,Age,Duration,Campaign,PosDays,Var_rate,Price_idx,Conf_idx,Month_rate,Quarterly_emp,Target,...,Month_oct,Month_sep,DOW_fri,DOW_mon,DOW_thu,DOW_tue,DOW_wed,Last_out_failure,Last_out_nonexistent,Last_out_success
0,44,210,1,999,1.4,93.444,-36.1,4.963,5228.1,0,...,0,0,0,0,1,0,0,0,1,0
1,53,138,1,999,-0.1,93.2,-42.0,4.021,5195.8,0,...,0,0,1,0,0,0,0,0,1,0
2,28,339,3,6,-1.7,94.055,-39.8,0.729,4991.6,1,...,0,0,0,0,1,0,0,0,0,1
3,39,185,2,999,-1.8,93.075,-47.1,1.405,5099.1,0,...,0,0,1,0,0,0,0,0,1,0
4,55,137,1,3,-2.9,92.201,-31.4,0.869,5076.2,1,...,0,0,1,0,0,0,0,0,0,1


## Balance the data

In [8]:
# Determine if values in a column are balanced.
df['Target'].value_counts()  

0    36548
1     4640
Name: Target, dtype: int64

In [9]:
import statsmodels.api as sm   
import imblearn
# [1b] Helps split data into sets to create BLR.
from imblearn.over_sampling import SMOTE  
from sklearn.model_selection import train_test_split 

# [1c] Indicates situations that aren’t necessarily exceptions.
import warnings  
warnings.filterwarnings("ignore")


df_final = new_df.fillna(0)

X = df_final.loc[:, df_final.columns !='Target']
y = df_final.loc[:, df_final.columns == 'Target']

x_test, x_train, y_test, y_train = train_test_split(X,y, test_size=0.3, random_state=0)

os = SMOTE(random_state=0)

columns = x_train.columns

# resample data
os_data_X, os_data_y = os.fit_resample(x_train, y_train)

# set variables

os_data_X = pd.DataFrame(data= os_data_X, columns = columns)
os_data_y = pd.DataFrame(data= os_data_y, columns = ['Target'])

print('length of resampled data is', len(os_data_X))
os_data_y

length of resampled data is 21962


Unnamed: 0,Target
0,0
1,0
2,0
3,0
4,0
...,...
21957,1
21958,1
21959,1
21960,1


In [10]:
os_data_y.value_counts()

Target
0         10981
1         10981
dtype: int64

## Check Multicolinearity: Variance Inflation Factor (VIF)

In [11]:
df_final.dtypes

Age                           int64
Duration                      int64
Campaign                      int64
PosDays                       int64
Var_rate                    float64
Price_idx                   float64
Conf_idx                    float64
Month_rate                  float64
Quarterly_emp               float64
Target                        int64
Occupation_admin.             uint8
Occupation_blue-collar        uint8
Occupation_entrepreneur       uint8
Occupation_housemaid          uint8
Occupation_management         uint8
Occupation_retired            uint8
Occupation_self-employed      uint8
Occupation_services           uint8
Occupation_student            uint8
Occupation_technician         uint8
Occupation_unemployed         uint8
Occupation_unknown            uint8
Status_divorced               uint8
Status_married                uint8
Status_single                 uint8
Status_unknown                uint8
Edu_high-school               uint8
Edu_masters                 

In [12]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

df_num = df.select_dtypes(include=numerics)

vif = pd.DataFrame()

vif['Features'] = df_num.columns
vif['Vif Factors'] = [variance_inflation_factor(df_num.values, i) 
                      for i in range (len(df_num.columns))]

vif

Unnamed: 0,Features,Vif Factors
0,Age,16.045844
1,Duration,2.444768
2,Campaign,1.921042
3,PosDays,34.949748
4,Var_rate,28.952264
5,Price_idx,22554.990597
6,Conf_idx,120.794996
7,Month_rate,225.576222
8,Quarterly_emp,26802.864918
9,Target,1.662161


## Notes:

Apart from Duration and Campaign all of the other numerical variables show high degree of multicolinearity

# Box_Tidwell: 

to check if there is a non-linearity between the continous varibales and the log odds

In [13]:
# [1] Import necessary libraries, modules, classes and packages.
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families

# [2] Select all the continuous variables.
continuous_vars = list(df_num.columns[:-1])

# [3] Make a copy of the DataFrame.
df_test = df.copy()

# [4] Add logit transform interaction terms (natural log) for 
# continuous variables e.g.. Age * Log(Age).
for var in continuous_vars:
    df_test[f'{var}:Log_{var}'] = df_test[var].apply(lambda x: x * np.log(x))
    
    # [5] Keep columns related to continuous variables.
    cols_to_keep = continuous_vars + [_ for _ in df_test.columns if 'Log_' in _]

# [6] View output
list(cols_to_keep) 

['Age',
 'Duration',
 'Campaign',
 'PosDays',
 'Var_rate',
 'Price_idx',
 'Conf_idx',
 'Month_rate',
 'Quarterly_emp',
 'Age:Log_Age',
 'Duration:Log_Duration',
 'Campaign:Log_Campaign',
 'PosDays:Log_PosDays',
 'Var_rate:Log_Var_rate',
 'Price_idx:Log_Price_idx',
 'Conf_idx:Log_Conf_idx',
 'Month_rate:Log_Month_rate',
 'Quarterly_emp:Log_Quarterly_emp']

In [14]:
# Redefining variables to include interaction terms
# [1] replace missing values with 0
X_lt = df_test[cols_to_keep].fillna(0)

# [2] Add constant term
X_lt_constant = sm.add_constant(X_lt, prepend=False)
  
# [3] Building model and fit the data (using statsmodel's Logit)
logit_results = GLM(y, X_lt_constant, family=families.Binomial()).fit()

# [4] Display summary results
print(logit_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 Target   No. Observations:                41188
Model:                            GLM   Df Residuals:                    41170
Model Family:                Binomial   Df Model:                           17
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -8371.3
Date:                Thu, 02 Jun 2022   Deviance:                       16743.
Time:                        17:51:36   Pearson chi2:                 2.71e+04
No. Iterations:                     8                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Age       

## Notes:

we can see from the results that all of the selected variables have a non-linear relationship with the log odds except fro campaign and month_rate.

# Recursive Feature ELimination (RFE)

We use RFE to identify the most suited variables

In [15]:
# Recursive feature elimination (RFE):
# [1] Create a new DataFrame.
data_final_vars=df_final.columns.values.tolist()

# [2a] Set the dependent variable.
y=['Target']  
# [2b] Set the independent variable.
X=[i for i in data_final_vars if i not in y]  

# [3] Import two packages from sklearn:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# [4] Indicate ‘logreg’ equals ‘LogisticRegression()’. 
logreg = LogisticRegression()

# [5] Specify ‘rfe’ value and no. of features.
# Here are two options for rfe. See which one works for you.
selector = RFE(logreg,n_features_to_select=1)

# [6] Indicate the fit with ‘fit()’.
selector = selector.fit(os_data_X, os_data_y.values.ravel())  

order = selector.ranking_

feature_ranking = []

for i in order:
    feature_ranking.append(f"{i}. {data_final_vars[i]}")
    
feature_ranking

['55. Last_out_nonexistent',
 '51. DOW_thu',
 '48. Month_sep',
 '56. Last_out_success',
 '49. DOW_fri',
 '47. Month_oct',
 '53. DOW_wed',
 '39. Month_apr',
 '50. DOW_mon',
 '32. House_unknown',
 '27. Edu_masters',
 '30. Edu_uni',
 '35. Loan_unknown',
 '29. Edu_pre-school',
 '37. Comm_cellular',
 '33. House_yes',
 '28. Edu_other',
 '36. Loan_yes',
 '34. Loan_no',
 '31. House_no',
 '26. Edu_high-school',
 '16. Occupation_self-employed',
 '17. Occupation_services',
 '18. Occupation_student',
 '19. Occupation_technician',
 '7. Month_rate',
 '9. Target',
 '8. Quarterly_emp',
 '6. Conf_idx',
 '10. Occupation_admin.',
 '12. Occupation_entrepreneur',
 '22. Status_divorced',
 '13. Occupation_housemaid',
 '24. Status_single',
 '11. Occupation_blue-collar',
 '23. Status_married',
 '15. Occupation_retired',
 '14. Occupation_management',
 '40. Month_aug',
 '52. DOW_tue',
 '54. Last_out_failure',
 '46. Month_nov',
 '43. Month_jun',
 '25. Status_unknown',
 '38. Comm_telephone',
 '41. Month_dec',
 '45

## BLR model efficacy

In [23]:
# [1] Name the new DataFrame and [2] specify all the columns for BLR:
nec_cols = ['Status_divorced', 'Status_married', 'Status_single',
            'Status_unknown', 'Edu_high-school', 'Edu_masters', 
            'Edu_other', 'Edu_pre-school', 'Edu_uni', 'House_no',
            'House_unknown', 'House_yes', 'Loan_no', 'Loan_unknown',
            'Loan_yes', 'DOW_fri', 'DOW_mon']

# [3a] Set the independent variable.
X=os_data_X[nec_cols]  
# [3b] Set the dependent variable.
y=os_data_y['Target']  

# [4] Set the logit() to accept y and x as parameters and return the logit object:
logit_model=sm.Logit(y, X)

# [5] Indicate result = logit_model.fit() function.
result=logit_model.fit()  

# [6] Print the results.
print(result.summary2()) 


Optimization terminated successfully.
         Current function value: 0.504517
         Iterations 6
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.272     
Dependent Variable: Target           AIC:              22192.4220
Date:               2022-06-02 17:53 BIC:              22320.3751
No. Observations:   21962            Log-Likelihood:   -11080.   
Df Model:           15               LL-Null:          -15223.   
Df Residuals:       21946            LLR p-value:      0.0000    
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     6.0000                                       
-----------------------------------------------------------------
                  Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
-----------------------------------------------------------------
Status_divorced  -0.7810   0.0858  -9.1055 0.0000 -0.9491 -0.6129
Status_married    0.0464   0.0460   1.0082 0.3133 -0.0438  0.1365


## Fit and test BLR model

In [24]:
# [1] Import necessary packages:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# [2] Split X and y data sets into ‘train’ and ‘test’ in a 30:70 ratio:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

# [2a] Set LogisticRegression() to ‘logreg’.
logreg = LogisticRegression() 

# [2b] Fit the X_train and y_train data sets to logreg. 
logreg.fit(X_train, y_train) 

y_pred = logreg.predict(X_test)

LogisticRegression()

## BLR model evaluation

In [26]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(confusion_matrix(y_test,y_pred))
print('BLR model accuracy:', metrics.accuracy_score(y_test,y_pred))
print('BLR model precison:', metrics.precision_score(y_test, y_pred))
print('BLR model recall:', metrics.recall_score(y_test, y_pred))

[[3287    0]
 [ 550 2752]]
BLR model accuracy: 0.9165275459098498
BLR model precison: 1.0
BLR model recall: 0.8334342822531798


### BLR Model notes:

The model has a very high precision, accuracy and recall rate

# 2. Build SVM MODEL

In [31]:
from sklearn import svm 

# [3a] Set the independent variable.
x=os_data_X[nec_cols]  
# [3b] Set the dependent variable.
y=os_data_y['Target'] 

x_test, x_train, y_test, y_train = train_test_split(x,y, test_size = 0.3, random_state=0)

clf = svm.SVC(kernel='linear', gamma= 'scale')

clf.fit(x_train, y_train)

prediction = clf.predict(x_test)

## SVM model evaluation

In [32]:
print(confusion_matrix(y_test,prediction))
print('SVM model accuracy:', metrics.accuracy_score(y_test,prediction))
print('SVM model precison:', metrics.precision_score(y_test, prediction))
print('SVM model recall:', metrics.recall_score(y_test, prediction))

[[7694    0]
 [1334 6345]]
SVM model accuracy: 0.9132244844857867
SVM model precison: 1.0
SVM model recall: 0.8262794634718063


# 3. Build Decision Tree MODEL

In [48]:
from sklearn.tree import DecisionTreeClassifier

# [3a] Set the independent variable.
X=os_data_X[nec_cols]  
# [3b] Set the dependent variable.
y=os_data_y['Target'] 

x_test, x_train, y_test, y_train = train_test_split(X, y, test_size=0.3, random_state=0)

dtc = DecisionTreeClassifier(criterion='gini',max_depth=4, random_state=1)

dtc = dtc.fit(x_train, y_train)

dtc_pred = dtc.predict(x_test)


## Decision Tree model evaluation

In [49]:
print(confusion_matrix(y_test,dtc_pred))
print('DTC model accuracy:', metrics.accuracy_score(y_test,dtc_pred))
print('DTC model precison:', metrics.precision_score(y_test, dtc_pred))
print('DTC model recall:', metrics.recall_score(y_test, dtc_pred))

[[6333 1361]
 [1596 6083]]
DTC model accuracy: 0.8076497755805633
DTC model precison: 0.8171681891456206
DTC model recall: 0.7921604375569735
