### **Importing Libraries** <a id="head1"></a>

In [1]:
import numpy as np
import pandas as pd
                                                                                   
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

### **Loading Data** <a id="head2"></a>

In [2]:
telco_customer = pd.read_csv("ChurnTrainDataset.csv")

### **Preprocessing** <a id="head5"></a>

In [3]:
# Encoding categorical data using cat codes
for col in telco_customer.columns[telco_customer.dtypes == 'object']:
    if col!='churn':
        telco_customer[col]=telco_customer[col].astype('category').cat.codes

# Fill Null Values of target column
telco_customer['churn'] =  telco_customer['churn'].fillna(telco_customer['churn'].mode()[0])

# Manual encoding the target variable
a={'yes':1,'no':0}
telco_customer['churn']=telco_customer['churn'].map(a)

### **Seperate Features & Target Variable** <a id="head6"></a>

In [4]:
X = telco_customer.drop('churn',axis=1)
y=telco_customer['churn']

### Categorical & Numeric Features

In [5]:
cat_cols = X.select_dtypes(include=['int8']).columns.values
num_cols = X.select_dtypes(include=['float64']).columns.values

### **Train Test Split**  <a id="head7"></a>

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

## Feature Engineering pipeline

In [7]:
# Imputation Transformer to fill null values
fill_null_col = ColumnTransformer([
        ('FillCat',SimpleImputer(strategy='most_frequent'),cat_cols),
        ('FillNumeric',SimpleImputer(strategy='median'),num_cols),
    ],remainder='passthrough')

# Scaling
scale_col = ColumnTransformer([
    ('scale', StandardScaler(),slice(5,18))
],remainder='passthrough')

# Principal Component Analysis
pca_col = ColumnTransformer([
    ('PCA', PCA(n_components=10),slice(0,18))
])

# # Model
logistic=LogisticRegression()
decision=DecisionTreeClassifier()
random = RandomForestClassifier()

### **Model Pipeline** <a id="head9"></a>

In [8]:
pipeline_model1 = Pipeline([('fill_null_col', fill_null_col),
                 ('scale_col', scale_col),
                 ('pca_col', pca_col),
                 ('classifier1', logistic)])

pipeline_model2 = Pipeline([('fill_null_col', fill_null_col),
                 ('scale_col', scale_col),
                 ('pca_col', pca_col),
                 ('classifier2', decision)])

pipeline_model3 = Pipeline([('fill_null_col', fill_null_col),
                 ('scale_col', scale_col),
                 ('pca_col', pca_col),
                 ('classifier3', random)])

In [9]:
pipelines=[pipeline_model1,pipeline_model2,pipeline_model3]

In [10]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [11]:
pipe_dict={0:'Logistic Regression',1:'Decision Tree',2:'Random Forest'}

# Fitting pipelines
for pipe in pipelines:
    pipe.fit(X_train,y_train)

In [12]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy : {}".format(pipe_dict[i],model.score(X_test,y_test)))

Logistic Regression Test Accuracy : 0.8541176470588235
Decision Tree Test Accuracy : 0.7929411764705883
Random Forest Test Accuracy : 0.8823529411764706


In [13]:
for i,model in enumerate(pipelines):
    if model.score(X_test,y_test)>best_accuracy:
        best_accuracy=model.score(X_test,y_test)
        best_pipeline=model
        best_classifier=i
print('Classifier with the best accuracy:{}'.format(pipe_dict[best_classifier]))

Classifier with the best accuracy:Random Forest


### **Best Hyperparameters for Random Forest Classifier Using GridSearchCV using a pipeline** 
<a id="head10"></a>

In [14]:
params = [{"classifier3__n_estimators": range(100, 501, 100),
           "classifier3__max_depth":range(10,30,10),
           "classifier3__min_samples_leaf":[1, 2, 4],
           "classifier3__min_samples_split": [2, 5, 10],
          }]
                 
randomized_cv = GridSearchCV(estimator = pipeline_model3,
                           param_grid = params,
                           cv = 10 ,verbose = 1, n_jobs=-1)

randomized_cv.fit(X_train,y_train)

In [15]:
randomized_cv.best_params_


In [16]:
pipeline_model3 = Pipeline([('fill_null_col', fill_null_col),
                 ('scale_col', scale_col),
                 ('pca_col', pca_col),
                 ('classifier3', RandomForestClassifier(max_depth=20,min_samples_leaf=1,min_samples_split=2,n_estimators=300))])

pipeline_model3.fit(X_train,y_train)
print(pipeline_model3.score(X_test,y_test))

0.8847058823529412


### **Model Prediction**<a id="head11"></a>

In [17]:
prediction = pipeline_model3.predict(X_test)
prediction

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### **Model Evaluation** <a id="head12"></a>

In [18]:
conf_matrix = confusion_matrix(y_test, prediction)
print("confusion matrix")
print(conf_matrix)
print(classification_report(y_test,prediction))

confusion matrix
[[720   4]
 [ 94  32]]
              precision    recall  f1-score   support

           0       0.88      0.99      0.94       724
           1       0.89      0.25      0.40       126

    accuracy                           0.88       850
   macro avg       0.89      0.62      0.67       850
weighted avg       0.89      0.88      0.86       850

