**Importing Libraries**

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
telco_customer = pd.read_csv("ChurnTrainDataset.csv")

In [3]:
telco_customer.head()

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107.0,area_code_415,no,yes,26.0,161.6,123.0,27.47,195.5,103.0,16.62,254.4,103.0,11.45,13.7,3.0,3.7,1.0,no
1,NJ,137.0,area_code_415,no,no,0.0,243.4,114.0,41.38,121.2,110.0,10.3,162.6,104.0,7.32,12.2,5.0,3.29,0.0,no
2,OH,84.0,area_code_408,yes,no,0.0,299.4,71.0,50.9,61.9,88.0,5.26,196.9,89.0,8.86,6.6,7.0,1.78,2.0,no
3,OK,75.0,area_code_415,yes,no,0.0,166.7,113.0,28.34,148.3,122.0,12.61,186.9,121.0,8.41,10.1,3.0,2.73,3.0,no
4,MA,121.0,area_code_510,no,yes,24.0,218.2,88.0,37.09,348.5,108.0,29.62,212.6,118.0,9.57,7.5,7.0,2.03,3.0,no


In [4]:
telco_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4250 entries, 0 to 4249
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   state                          4232 non-null   object 
 1   account_length                 4216 non-null   float64
 2   area_code                      4234 non-null   object 
 3   international_plan             4250 non-null   object 
 4   voice_mail_plan                4237 non-null   object 
 5   number_vmail_messages          4216 non-null   float64
 6   total_day_minutes              4240 non-null   float64
 7   total_day_calls                4248 non-null   float64
 8   total_day_charge               4242 non-null   float64
 9   total_eve_minutes              4215 non-null   float64
 10  total_eve_calls                4233 non-null   float64
 11  total_eve_charge               4242 non-null   float64
 12  total_night_minutes            4248 non-null   f

**Encoding Categorical Data**

In [5]:
telco_customer['international_plan']=telco_customer['international_plan'].astype('category').cat.codes
telco_customer['voice_mail_plan']=telco_customer['voice_mail_plan'].astype('category').cat.codes
telco_customer['area_code']=telco_customer['area_code'].astype('category').cat.codes
telco_customer['state']=telco_customer['state'].astype('category').cat.codes
telco_customer.head()


Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,35,107.0,1,0,1,26.0,161.6,123.0,27.47,195.5,103.0,16.62,254.4,103.0,11.45,13.7,3.0,3.7,1.0,no
1,31,137.0,1,0,0,0.0,243.4,114.0,41.38,121.2,110.0,10.3,162.6,104.0,7.32,12.2,5.0,3.29,0.0,no
2,35,84.0,0,1,0,0.0,299.4,71.0,50.9,61.9,88.0,5.26,196.9,89.0,8.86,6.6,7.0,1.78,2.0,no
3,36,75.0,1,1,0,0.0,166.7,113.0,28.34,148.3,122.0,12.61,186.9,121.0,8.41,10.1,3.0,2.73,3.0,no
4,19,121.0,2,0,1,24.0,218.2,88.0,37.09,348.5,108.0,29.62,212.6,118.0,9.57,7.5,7.0,2.03,3.0,no


**Checking Imbalanced Data**

In [6]:
target_count = telco_customer['churn'].value_counts()
print('No Churn:', target_count[0])
print('Churn:', target_count[1])

No Churn: 3634
Churn: 594


**Skewness of numerical features**

In [7]:
for col in telco_customer.columns[~telco_customer.columns.isin(['churn'])]:
  print(col,telco_customer[col].skew())

state -0.06080213040903678
account_length 0.1268316461253467
area_code -0.040251724449793726
international_plan 2.800108521822102
voice_mail_plan 1.0009869540132517
number_vmail_messages 1.3677764565575425
total_day_minutes -0.00806604810577159
total_day_calls -0.08592918726283431
total_day_charge -0.006588931686722552
total_eve_minutes -0.028783392726859886
total_eve_calls -0.02652599850243013
total_eve_charge -0.0303235056412153
total_night_minutes 0.009185789590414285
total_night_calls 0.004321197371724993
total_night_charge 0.007870314590049787
total_intl_minutes -0.24228848731042327
total_intl_calls 1.3597269223539128
total_intl_charge -0.2447346909143826
number_customer_service_calls 1.0819789789664567


**Checking Null Values**

In [8]:
telco_customer.isnull().sum()

state                             0
account_length                   34
area_code                         0
international_plan                0
voice_mail_plan                   0
number_vmail_messages            34
total_day_minutes                10
total_day_calls                   2
total_day_charge                  8
total_eve_minutes                35
total_eve_calls                  17
total_eve_charge                  8
total_night_minutes               2
total_night_calls                 5
total_night_charge                7
total_intl_minutes                5
total_intl_calls                 13
total_intl_charge                30
number_customer_service_calls     3
churn                            22
dtype: int64

**Filling Null Values**

In [9]:
for col in telco_customer.columns[~telco_customer.columns.isin(['state','area_code','churn','international_plan','voice_mail_plan'])]:
    telco_customer[col] =  telco_customer[col].fillna(telco_customer[col].median())

for col in telco_customer[['state','area_code','churn','international_plan','voice_mail_plan']]:
    telco_customer[col] =  telco_customer[col].fillna(telco_customer[col].mode()[0])    

**Checking if any more null value remains**

In [10]:
telco_customer.isna().any()

state                            False
account_length                   False
area_code                        False
international_plan               False
voice_mail_plan                  False
number_vmail_messages            False
total_day_minutes                False
total_day_calls                  False
total_day_charge                 False
total_eve_minutes                False
total_eve_calls                  False
total_eve_charge                 False
total_night_minutes              False
total_night_calls                False
total_night_charge               False
total_intl_minutes               False
total_intl_calls                 False
total_intl_charge                False
number_customer_service_calls    False
churn                            False
dtype: bool

**Label Encoding of target variable**

In [11]:
le=LabelEncoder()
telco_customer['churn'] = le.fit_transform(telco_customer['churn'])

**Seperate Features & Target Variable**

In [12]:
X = telco_customer.drop('churn',axis=1)
y=telco_customer['churn']

**Feature Scaling**

In [13]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X

array([[0.70588235, 0.43801653, 0.66666667, ..., 0.15      , 0.68518519,
        0.11111111],
       [0.62745098, 0.56198347, 0.66666667, ..., 0.25      , 0.60925926,
        0.        ],
       [0.70588235, 0.34297521, 0.33333333, ..., 0.35      , 0.32962963,
        0.22222222],
       ...,
       [0.54901961, 0.30578512, 0.33333333, ..., 0.35      , 0.34444444,
        0.11111111],
       [0.23529412, 0.20247934, 0.33333333, ..., 0.25      , 0.49444444,
        0.22222222],
       [0.92156863, 0.35123967, 0.66666667, ..., 0.8       , 0.46481481,
        0.        ]])

In [14]:
X.shape

(4250, 19)

**Performing PCA**

In [15]:
pca = PCA(random_state=17)
df_X_pca = pca.fit_transform(X)


In [16]:
tot = sum(pca.explained_variance_) # total explained variance of all principal components
var_exp = [(i / tot) * 100 for i in sorted(pca.explained_variance_, reverse=True)] # individual explained variance
cum_var_exp = np.cumsum(var_exp) # cumulative explained variance
cum_var_exp

array([ 18.74738243,  32.87831676,  46.79447592,  56.30006   ,
        64.0978233 ,  70.5376636 ,  76.77940128,  82.14808722,
        86.56417849,  90.07686102,  92.57985154,  94.96813748,
        97.23129996,  99.39947539,  99.962837  ,  99.97711402,
        99.99127761,  99.99758163, 100.        ])

As per the graph, first 10 features are giving around 80% cumilative explained variance, we will consider first 10 columns

In [17]:
n_components = 10
df_X_reduced = np.dot(df_X_pca, pca.components_[:n_components,:].T)
df_X_reduced = pd.DataFrame(df_X_reduced, columns=["PC#%d" % (x + 1) for x in range(n_components)])
df_X_reduced

Unnamed: 0,PC#1,PC#2,PC#3,PC#4,PC#5,PC#6,PC#7,PC#8,PC#9,PC#10
0,0.056316,0.226239,-0.427871,0.271765,0.139918,0.148895,0.025693,0.047559,0.018191,0.014491
1,0.084135,-0.116709,0.168436,0.099893,-0.029316,0.173983,0.025417,0.037872,-0.021739,-0.007095
2,0.090330,0.278189,0.388679,-0.308074,-0.108234,-0.140447,-0.022417,0.004583,0.838069,0.004426
3,0.253671,-0.015896,0.181614,-0.318895,0.034554,-0.019990,0.000322,0.118436,0.844173,-0.005835
4,-0.614760,-0.034216,-0.528045,-0.011921,-0.140894,-0.207607,-0.063667,0.055109,-0.069759,0.004465
...,...,...,...,...,...,...,...,...,...,...
4245,-0.122503,-0.128773,0.168462,0.014665,0.071842,-0.018331,0.007371,-0.030257,-0.080345,-0.003787
4246,0.252561,0.121000,0.371373,0.413629,0.162615,-0.131669,-0.006915,-0.091192,0.110898,0.009491
4247,0.009883,0.172036,0.354384,0.058821,-0.059885,0.042290,0.013155,0.079730,-0.112297,-0.010284
4248,-0.307700,0.630380,-0.446666,-0.073493,0.166325,0.084039,0.040495,-0.024268,-0.233467,0.027931


**Train Test Split**

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(df_X_reduced, y, test_size=0.2, random_state=17)


## **Random Forest Classifier Model**

**Best Hyperparameters for Random Forest Classifier Using RandomizedSearchCv**

In [19]:
model = RandomForestClassifier()

params = {"criterion": ['gini', 'entropy'],
          "max_features" :['auto', 'log2'],
          "n_estimators":range(100, 501, 10),
          "max_depth":range(10,30),
          "min_samples_split" : [2,5,10],
          "min_samples_leaf" :[1, 2, 4],
          "bootstrap" :[True, False]
         }

rf_randomized = RandomizedSearchCV(estimator = model ,
                           param_distributions = params,
                           n_iter = 10,cv = 10,verbose = 2,scoring='balanced_accuracy', n_jobs=-1)

rf_randomized.fit(X_train_pca, y_train_pca)
print(rf_randomized.best_estimator_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.4min finished


RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=21, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=380,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


**Model Prediction**

In [27]:
rf = rf_randomized.best_estimator_
rf.fit(X_train_pca,y_train_pca)
prediction = rf.predict(X_test_pca)

**Model Evaluation**

In [28]:
conf_matrix = confusion_matrix(y_test_pca, prediction)
acc_score = accuracy_score(y_test_pca, prediction)
print("confusion matrix")
print(conf_matrix)
print("\n")
print("Model Accuracy:",acc_score*100,'\n')

confusion matrix
[[700  24]
 [ 79  47]]


Model Accuracy: 87.88235294117646 



**Model with Stratified KFold Cross Validation**

In [29]:
rf = rf_randomized.best_estimator_

accuracy=[]
skf= StratifiedKFold(n_splits=10)
skf.get_n_splits(df_X_reduced,y)
for train_index, test_index in skf.split(df_X_reduced,y):
    X1_train,X1_test=df_X_reduced.iloc[train_index],df_X_reduced.iloc[test_index]
    y1_train,y1_test=y.iloc[train_index],y.iloc[test_index]
                            
    rf.fit(X1_train, y1_train)
    prediction=rf.predict(X1_test)
    score=accuracy_score(prediction,y1_test)
    accuracy.append(score)

print(accuracy)
print("\nMinimum Accuracy:",np.amin(accuracy)*100)
print("\nMaximum Accuracy:",np.amax(accuracy)*100)
print("\nMean Accuracy:",np.array(accuracy).mean()*100)

[0.8635294117647059, 0.8847058823529412, 0.908235294117647, 0.88, 0.8964705882352941, 0.8823529411764706, 0.9129411764705883, 0.9035294117647059, 0.8823529411764706, 0.8894117647058823]

Minimum Accuracy: 86.3529411764706

Maximum Accuracy: 91.29411764705883

Mean Accuracy: 89.03529411764704
