# Telecom Churn Case Study

### Business objective : To predict the churn in the last (i.e. the ninth) month using the data (features) from the first three months

###### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import warnings

from scipy import special 
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
import math
from IPython.display import Markdown, display ,HTML


from sklearn.model_selection import train_test_split


sns.set(style="whitegrid")
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', None) # make sure data and columns are displayed correctly withput purge
pd.options.display.float_format = '{:20,.2f}'.format # display float value with correct precision 

warnings.filterwarnings('ignore')

###### Reading Data

In [2]:
tcom_data = pd.read_csv("telecom_churn_data.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'telecom_churn_data.csv'

In [None]:
# Displaying the first 10 field with all columns in the dataset
tcom_data.head(10)

In [None]:
tcom_data.shape

In [None]:
tcom_data.info(verbose=True)

In [None]:
tcom_data.describe()

###### Checking the overall missing values in the dataset

In [None]:
((tcom_data.isnull().sum()/tcom_data.shape[0])*100).round(2).sort_values(ascending=False)

In [None]:
# Pick up every object-typed column; in this dataset they are treated as date strings
date_col = tcom_data.select_dtypes(include=['object'])
date_col_names = date_col.columns
print("\nThese are the columns available with datetime format represented as object\n",date_col_names)

# Parse each of those columns into a proper datetime dtype
for column_name in date_col_names:
    tcom_data[column_name] = pd.to_datetime(tcom_data[column_name])

# Dimension of the dataset after the conversion
tcom_data.shape

###### Handling missing values with respect to data recharge attributes

In [None]:

tcom_data[['date_of_last_rech_data_6','total_rech_data_6','max_rech_data_6']].head(10)

##### date_of_last_rech_data, total_rech_data and max_rech_data have missing values. A missing value in these columns indicates that the customer has not done any mobile internet recharge in that month, so 0 is imputed for total_rech_data and max_rech_data.

In [None]:
# For each month 6-9: when the last data-recharge date, the recharge count and the
# maximum recharge are all missing, the customer made no data recharge in that month,
# so the count and the maximum are set to 0.
#
# This is done with a boolean mask and .loc instead of a row-by-row loop because
#   * `pd.isnull(a and b)` never tests both values (`a and b` evaluates to just one of them);
#   * chained assignment `df[col][i] = 0` is slow and is not guaranteed to write
#     through to the frame.
for month in (6, 7, 8, 9):
    total_col = f'total_rech_data_{month}'
    max_col = f'max_rech_data_{month}'
    last_date_col = f'date_of_last_rech_data_{month}'

    no_data_recharge = (
        tcom_data[last_date_col].isnull()
        & tcom_data[total_col].isnull()
        & tcom_data[max_col].isnull()
    )
    tcom_data.loc[no_data_recharge, [total_col, max_col]] = 0

In [None]:
tcom_data[['count_rech_2g_6','count_rech_3g_6','total_rech_data_6']].head(10)

##### From the table above, total_rech_data for each month (6 to 9) is the sum of count_rech_2g and count_rech_3g for the same month, which introduces multicollinearity. To reduce it, we can drop the count_rech_2g_* and count_rech_3g_* columns for months 6 to 9.

In [None]:
# Build the names of the 2g/3g recharge counters for months 6-9 and remove them in one call
rech_count_cols = [f'count_rech_{tech}_{month}'
                   for month in (6, 7, 8, 9)
                   for tech in ('2g', '3g')]
tcom_data.drop(columns=rech_count_cols, inplace=True)

print("The 'count_rech_2g_6','count_rech_3g_6','count_rech_2g_7','count_rech_3g_7','count_rech_2g_8','count_rech_3g_8','count_rech_2g_9','count_rech_3g_9' columns are dropped as they can be explained from the 'total_rech_data'column")

##### Columns with only one unique value can be dropped, as they have no variance and therefore add no information to the model

In [None]:
# Collect every column that holds exactly one distinct (non-null) value
unique_1_col = [column_name
                for column_name in tcom_data.columns
                if tcom_data[column_name].nunique() == 1]


In [None]:
tcom_data.drop(unique_1_col, axis=1, inplace = True)

In [None]:
tcom_data.shape

##### Handling the missing values for the attributes arpu_3g_*,arpu_2g_* for month 6,7,8 and 9

In [None]:
# Checking the related columns values
tcom_data[['arpu_3g_6','arpu_2g_6','av_rech_amt_data_6']].head(10)

In [None]:
print("Correlation table for month 6\n\n", tcom_data[['arpu_3g_6','arpu_2g_6','av_rech_amt_data_6']].corr())
print("\nCorrelation table for month 7\n\n", tcom_data[['arpu_3g_7','arpu_2g_7','av_rech_amt_data_7']].corr())
print("\nCorrelation table for month 8\n\n", tcom_data[['arpu_3g_8','arpu_2g_8','av_rech_amt_data_8']].corr())
print("\nCorrelation table for month 9\n\n", tcom_data[['arpu_3g_9','arpu_2g_9','av_rech_amt_data_9']].corr())

In [None]:
# Remove the 3g/2g average-revenue columns for months 6-9 from the dataset
arpu_data_cols = [f'arpu_{tech}_{month}'
                  for month in (6, 7, 8, 9)
                  for tech in ('3g', '2g')]
tcom_data.drop(columns=arpu_data_cols, inplace=True)
print("\nThe columns'arpu_3g_6','arpu_2g_6','arpu_3g_7','arpu_2g_7','arpu_3g_8','arpu_2g_8','arpu_3g_9','arpu_2g_9' are dropped from the dataset due to high corellation between their respective arpu_* variable in the dataset\n")

In [None]:
# The curent dimensions of the dataset
tcom_data.shape

##### fb_user_* and night_pck_user_* for each month from 6 to 9 have more than 50% missing values, so these columns are dropped

In [None]:
tcom_data.drop(['fb_user_6','fb_user_7','fb_user_8','fb_user_9',
                  'night_pck_user_6','night_pck_user_7','night_pck_user_8','night_pck_user_9'],
                  axis=1, inplace=True)
print("\nThe columns 'fb_user_6','fb_user_7','fb_user_8','fb_user_9','night_pck_user_6','night_pck_user_7','night_pck_user_8','night_pck_user_9' are dropped from the dataset as it has no meaning to the data snd has high missing values above 50%\n")

In [None]:
tcom_data.shape

In [None]:
# Checking the related columns values
tcom_data[['av_rech_amt_data_7','max_rech_data_7','total_rech_data_7']].head(10)

##### missing values for the column av_rech_amt_data_* for each month from 6 to 9 can be replaced as 0 if the total_rech_data_* for each month from 6 to 9 respectively is 0. i.e. if the total recharge done is 0 then the average recharge amount shall also be 0.

In [None]:
# For each month 6-9: a missing average data-recharge amount is set to 0 when the
# number of data recharges in that month is 0 (no recharge -> no amount).
# A boolean mask with .loc replaces the row-by-row chained assignment
# (`df[col][i] = 0`), which is slow and not guaranteed to write through to the frame.
for month in (6, 7, 8, 9):
    avg_col = f'av_rech_amt_data_{month}'
    total_col = f'total_rech_data_{month}'

    missing_with_no_recharge = tcom_data[avg_col].isnull() & (tcom_data[total_col] == 0)
    tcom_data.loc[missing_with_no_recharge, avg_col] = 0


In [None]:
# Checkng the overall missing values in the dataset
((tcom_data.isnull().sum()/tcom_data.shape[0])*100).round(2).sort_values(ascending=False)

In [None]:
tcom_data.info()

##### date_of_last_rech_data_* for months 6, 7, 8 and 9 are of no further use once the conditional imputation of total_rech_data_* and max_rech_data_* is complete. The missing value percentage is high for these columns, so they can be dropped from the dataset

In [None]:
# Dropping the columns related to datetime dtype from the dataset
tcom_data.drop(["date_of_last_rech_data_6","date_of_last_rech_data_7",
                   "date_of_last_rech_data_8","date_of_last_rech_data_9"], axis=1, inplace=True)
print("\nThe columns 'date_of_last_rech_data_6','date_of_last_rech_data_7','date_of_last_rech_data_8','date_of_last_rech_data_9' are dropped as it has no significance to the data\n")

##### We can also drop the date_of_last_rech_* columns corresponding to months 6, 7, 8 and 9

In [None]:
# Dropping the columns related to datetime dtype from the dataset
tcom_data.drop(["date_of_last_rech_6","date_of_last_rech_7",
                   "date_of_last_rech_8","date_of_last_rech_9"], axis=1, inplace=True)
print("\nThe columns 'date_of_last_rech_6','date_of_last_rech_7','date_of_last_rech_8','date_of_last_rech_9' are dropped as it has no significance to the data\n")

In [None]:
# The curent dimensions of the dataset
tcom_data.shape

### 1. Filter high-value customers

In [None]:
# Filtering the data
# Customers are filtered on the recharge amount (calls + data) of months 6 and 7.

# Only the average data-recharge amount is available, so the total data-recharge
# amount per month is rebuilt as: average amount x number of data recharges.
tcom_data['total_rech_amt_data_6'] = tcom_data['av_rech_amt_data_6'] * tcom_data['total_rech_data_6']
tcom_data['total_rech_amt_data_7'] = tcom_data['av_rech_amt_data_7'] * tcom_data['total_rech_data_7']

# Overall recharge amount (data + regular recharge) for months 6 and 7
tcom_data['overall_rech_amt_6'] = tcom_data['total_rech_amt_data_6'] + tcom_data['total_rech_amt_6']
tcom_data['overall_rech_amt_7'] = tcom_data['total_rech_amt_data_7'] + tcom_data['total_rech_amt_7']

# Average overall recharge amount across June and July (6th and 7th month)
tcom_data['avg_rech_amt_6_7'] = (tcom_data['overall_rech_amt_6'] + tcom_data['overall_rech_amt_7'])/2

# 70th percentile of that average: the cut-off for a high value customer
cut_off = tcom_data['avg_rech_amt_6_7'].quantile(0.70)
print("\nThe 70th quantile value to determine the High Value Customer is: ",cut_off,"\n")

# Keep the top 30% as High Value Customers.
# .copy() makes the subset an independent frame, so the column assignments done
# on it afterwards do not operate on a slice of the unfiltered data.
tcom_data = tcom_data[tcom_data['avg_rech_amt_6_7'] >= cut_off].copy()

In [None]:
# Let us check the missing values percentages again for the HVC group
    # Checkng the overall missing values in the dataset
((tcom_data.isnull().sum()/tcom_data.shape[0])*100).round(2).sort_values(ascending=False)

In [None]:
# Numerical columns available
num_col = tcom_data.select_dtypes(include = ['int64','float64']).columns.tolist()

In [None]:
# Importing the libraries for Scaling and Imputation
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the numerical columns first so that every feature contributes
# on the same scale to the neighbour distances used by the imputer
scalar = MinMaxScaler()
tcom_data[num_col]=scalar.fit_transform(tcom_data[num_col])

# Fill the remaining NaN values from the 3 nearest neighbours
knn=KNNImputer(n_neighbors=3)

# Keep the original row index and column names on the imputed frame: the index of
# tcom_data is no longer 0..n-1 after the high-value filter, and a fresh default
# index would make any index-aligned operation pair up the wrong rows.
tcom_data_knn = pd.DataFrame(knn.fit_transform(tcom_data[num_col]),
                             index=tcom_data.index,
                             columns=num_col)


In [None]:
tcom_data_knn.isnull().sum().sum()

In [None]:
# Converting the scaled data back to the original data
tcom_data[num_col]=scalar.inverse_transform(tcom_data_knn)

# Checking the top 10 data
tcom_data.head(10)

In [None]:
# Checking the overall missing values in the dataset
((tcom_data.isnull().sum()/tcom_data.shape[0])*100).round(2).sort_values(ascending=False)

In [None]:
tcom_data.isnull().sum().sum()

### 2. Tag churners and remove attributes of the churn phase

##### derive churn variable using total_ic_mou_9,total_og_mou_9,vol_2g_mb_9 and vol_3g_mb_9 attributes

In [None]:
# Selecting the columns to define churn variable (i.e. TARGET Variable)
churn_col=['total_ic_mou_9','total_og_mou_9','vol_2g_mb_9','vol_3g_mb_9']
tcom_data[churn_col].info()

In [None]:
# A customer is flagged as churned (1) when the columns listed in churn_col
# add up to zero for that row, otherwise as not churned (0)
no_usage_in_churn_phase = tcom_data[churn_col].sum(axis=1) == 0
tcom_data['churn'] = np.where(no_usage_in_churn_phase, 1, 0)

In [None]:
tcom_data.head(10)

###### Performing EDA for Data Insights

In [None]:
#churn/non churn percentage
print((tcom_data['churn'].value_counts()/len(tcom_data))*100)
((tcom_data['churn'].value_counts()/len(tcom_data))*100).plot(kind="pie")
plt.show()

###### Recharge amount related variables

In [None]:
recharge_amount_columns =  tcom_data.columns[tcom_data.columns.str.contains('rech_amt')]
recharge_amount_columns.tolist()

In [None]:
def plot_box_chart(attribute):
    """Draw churn-wise box plots of one attribute for months 6, 7 and 8.

    Reads the module-level ``tcom_data`` frame and plots the columns
    ``<attribute>_6``, ``<attribute>_7`` and ``<attribute>_8`` against
    ``churn`` in the first three cells of a 2x3 grid, with outliers hidden.

    Parameters
    ----------
    attribute : str
        Column name prefix, without the ``_<month>`` suffix.
    """
    df = tcom_data
    plt.figure(figsize=(20,16))
    for position, month_suffix in enumerate(("_6", "_7", "_8"), start=1):
        plt.subplot(2, 3, position)
        sns.boxplot(data=df, y=attribute + month_suffix, x="churn", hue="churn",
                    showfliers=False, palette=("plasma"))
    plt.show()

In [None]:
plot_box_chart('max_rech_amt')

###### There is a drop in the max recharge amount for churned customers in the 8th Month (Action Phase)

In [None]:
sns.boxplot(data=tcom_data, y='avg_rech_amt_6_7',x="churn",hue="churn",
                showfliers=False,palette=("plasma"))

In [None]:
sns.boxplot(data=tcom_data, y='av_rech_amt_data_8',x="churn",hue="churn",
                showfliers=False,palette=("plasma"))

In [None]:
sns.boxplot(data=tcom_data, y='overall_rech_amt_6',x="churn",hue="churn",
                showfliers=False,palette=("plasma"))

In [None]:
sns.boxplot(data=tcom_data, y='overall_rech_amt_7',x="churn",hue="churn",
                showfliers=False,palette=("plasma"))

In [None]:
plot_box_chart('vol_2g_mb')

In [None]:
plot_box_chart('vol_3g_mb')

###### 2G and 3G usage for churned customers drops in 8th month

###### It can be observed that 2G/3G usage is higher for non-churned customers indicating that churned customers might be from areas where 2G/3G service is not properly available

In [None]:
def plot_mean_bar_chart(df,columns_list):
    """Plot and return the per-column means of churned vs non-churned rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``churn`` column (0 / 1) and the columns to compare.
    columns_list : list-like of str
        Names of the columns whose means are compared.

    Returns
    -------
    pandas.DataFrame
        Two rows, ``'Non Churn'`` and ``'Churn'``, one column per entry of
        ``columns_list``, holding the column means of each group.
    """
    df_0 = df[df.churn==0].filter(columns_list)
    df_1 = df[df.churn==1].filter(columns_list)

    # The row labels are passed as lists: a set is unordered and is rejected
    # as an index by recent pandas versions.
    mean_df_0 = pd.DataFrame([df_0.mean()], index=['Non Churn'])
    mean_df_1 = pd.DataFrame([df_1.mean()], index=['Churn'])

    mean_bar = pd.concat([mean_df_0, mean_df_1])

    # Transposed so that each compared column becomes one group of bars
    mean_bar.T.plot.bar(figsize=(10,5),rot=0)
    plt.show()

    return mean_bar

In [None]:
vbc_column = tcom_data.columns[tcom_data.columns.str.contains('vbc_3g',regex=True)]
vbc_column.tolist()

In [None]:
plot_mean_bar_chart(tcom_data, vbc_column)

###### It can be observed that volume-based cost for 3G is much lower for Churned customers as compared to Non-Churned Customers and there is a drop in vbc in 8th month also.


In [None]:
SC_2g_or_3g_col = tcom_data.columns[tcom_data.columns.str.contains('sachet_2g|sachet_3g',regex=True)]

In [None]:
plot_mean_bar_chart(tcom_data, SC_2g_or_3g_col)

###### There is a drop in sachet services in the 8th month for churned customers

In [None]:
# Checking columns for average revenue per user
arpu_cols = tcom_data.columns[tcom_data.columns.str.contains('arpu_')]


In [None]:
arpu_cols

In [None]:
plot_box_chart('arpu')

In [None]:
plot_mean_bar_chart(tcom_data, arpu_cols)

###### there is drop for Arpu in 8th month for churned customers

In [None]:
offnet_usage_service_col = tcom_data.columns[tcom_data.columns.str.contains('offnet.*mou',regex=True)]

In [None]:
offnet_usage_service_col

In [None]:
plot_box_chart('offnet_mou')

In [None]:
plot_mean_bar_chart(tcom_data, offnet_usage_service_col)

###### There is a drop for offnet mou services in the 8th month

In [None]:
onnet_usage_service =  tcom_data.columns[tcom_data.columns.str.contains('^onnet.*mou',regex=True)]

In [None]:
onnet_usage_service

In [None]:
plot_box_chart('onnet_mou')

In [None]:
plot_mean_bar_chart(tcom_data, onnet_usage_service)

###### there is a drop in onnet_usage_Service in the 8th month

#### Proceeding with further data preparation, remove all the attributes corresponding to the churn phase (all attributes having ‘ _9’, etc. in their names)

In [None]:
# Churn-phase attributes carry the month suffix '_9'; match on the suffix rather
# than on any occurrence of '_9' so that no unrelated column is caught by accident
churn_phase_cols = [col for col in tcom_data.columns if col.endswith('_9')]

In [None]:
# Dropping the selected churn phase columns
tcom_data.drop(churn_phase_cols, axis=1, inplace=True)

# The curent dimension of the dataset after dropping the churn related columns
tcom_data.shape

In [None]:
tcom_data.drop(['total_rech_amt_data_6','av_rech_amt_data_6',
                   'total_rech_data_6','total_rech_amt_6',
                  'total_rech_amt_data_7','av_rech_amt_data_7',
                   'total_rech_data_7','total_rech_amt_7'], axis=1, inplace=True)

In [None]:
# creating a list of column names for each month
mon_6_cols = [col for col in tcom_data.columns if '_6' in col]
mon_7_cols = [col for col in tcom_data.columns if '_7' in col]
mon_8_cols = [col for col in tcom_data.columns if '_8' in col]

In [None]:
# Pairwise correlation between the variables; only the strictly lower triangle is
# kept (upper triangle and diagonal become 0) so that each pair appears once
tcom_data_corr = tcom_data.corr()
tcom_data_corr.loc[:, :] = np.tril(tcom_data_corr, k=-1)

# Long format: one row per (variable, variable) pair
tcom_data_corr = tcom_data_corr.stack()

# Pairs whose absolute correlation is above 0.80, highest value first
tcom_data_corr[tcom_data_corr.abs() > 0.80].sort_values(ascending=False)

In [None]:
# Columns picked for removal because of high correlation with other predictors
col_to_drop=['total_rech_amt_8','isd_og_mou_8','isd_og_mou_7','sachet_2g_8','total_ic_mou_6',
            'total_ic_mou_8','total_ic_mou_7','std_og_t2t_mou_6','std_og_t2t_mou_8','std_og_t2t_mou_7',
            'std_og_t2m_mou_7','std_og_t2m_mou_8',]

# These columns can be dropped as they are highly collinear with other predictor variables.
# NOTE(review): this note originally stated a collinearity criterion of 85%, while the
# correlation listing in the previous cell filters at |r| > 0.80 — confirm which
# threshold was actually used to pick these columns.

# Drop the selected columns from the dataset in place
tcom_data.drop(col_to_drop, axis=1, inplace=True)

In [None]:
tcom_data.shape

In [None]:
# creating a new variable 'tenure'
tcom_data['tenure'] = (tcom_data['aon']/30).round(0)

# Since we derived a new column from 'aon', we can drop it
tcom_data.drop('aon',axis=1, inplace=True)

In [None]:
# Tenure buckets in months. The last bucket is open-ended (np.inf): with an upper
# edge of 61 every customer with more than 61 months of tenure fell outside all
# bins and got NaN instead of '5 Yrs and above'.
tn_range = [0, 6, 12, 24, 60, np.inf]
tn_label = [ '0-6 Months', '6-12 Months', '1-2 Yrs', '2-5 Yrs', '5 Yrs and above']

# include_lowest=True keeps a tenure of exactly 0 months in the first bucket
tcom_data['tenure_range'] = pd.cut(tcom_data['tenure'], tn_range, labels=tn_label, include_lowest=True)
tcom_data['tenure_range'].head()

In [None]:
# Plotting a bar plot for tenure range
plt.figure(figsize=[12,7])
sns.barplot(x='tenure_range',y='churn', data=tcom_data)
plt.show()

##### It can be observed that Most Churn happens during the first 6 months. As a customer stays longer with the network, Churn decreases

In [None]:
# Distribution of tenure: density histogram with a KDE overlay.
# histplot replaces the deprecated distplot.
sns.histplot(tcom_data['tenure'], bins=30, kde=True, stat="density")
plt.show()

##### The average revenue per user in the good phase of the customer is given by arpu_6 and arpu_7. Since these are two separate averages, let's take the mean of the two and drop the original columns

In [None]:
tcom_data["avg_arpu_6_7"]= (tcom_data['arpu_6']+tcom_data['arpu_7'])/2
tcom_data['avg_arpu_6_7'].head()

In [None]:
tcom_data.drop(['arpu_6','arpu_7'], axis=1, inplace=True)

In [None]:
tcom_data.shape

In [None]:
# Distribution of the averaged ARPU: density histogram with a KDE overlay.
# histplot replaces the deprecated distplot.
sns.histplot(tcom_data['avg_arpu_6_7'], kde=True, stat="density")
plt.show()

In [None]:
plt.figure(figsize=(10,50))

# Correlation of every numeric feature with churn, strongest positive first.
# Only numeric columns are used: the frame also holds the categorical
# 'tenure_range' column, on which DataFrame.corr() raises in recent pandas versions.
churn_corr = tcom_data.select_dtypes(include='number').corr()[['churn']]
heatmap_churn = sns.heatmap(churn_corr.sort_values(ascending=False, by='churn'), annot=True,
                                cmap='summer')
heatmap_churn.set_title("Features Correlating with Churn variable", fontsize=15)

##### Avg Outgoing Calls & calls on roaming for the 6th & 7th months are positively correlated with churn. Avg Revenue and No. of Recharges for the 8th month are negatively correlated with churn.

In [None]:
sns.boxplot(x = tcom_data.churn, y = tcom_data.tenure)

plt.show()

In [None]:
tcom_data[['total_rech_num_8', 'arpu_8']].plot.scatter(x = 'total_rech_num_8',
                                                              y='arpu_8')
plt.show()

In [None]:
# Density of the month-8 maximum recharge amount, split by churn flag.
# `fill=True` replaces the deprecated `shade=True`.
ax = sns.kdeplot(tcom_data.max_rech_amt_8[(tcom_data["churn"] == 0)],
                color="Red", fill=True)
ax = sns.kdeplot(tcom_data.max_rech_amt_8[(tcom_data["churn"] == 1)],
                ax=ax, color="Blue", fill=True)
ax.legend(["No-Churn","Churn"],loc='upper right')
ax.set_ylabel('Density')
# The x axis shows max_rech_amt_8, not a volume based cost
ax.set_xlabel('Max recharge amount (month 8)')
ax.set_title('Distribution of Max Recharge Amount by churn')
plt.show()

In [None]:
# Density of the month-8 average data-recharge amount, split by churn flag.
# `fill=True` replaces the deprecated `shade=True`.
ax = sns.kdeplot(tcom_data.av_rech_amt_data_8[(tcom_data["churn"] == 0)],
                color="Red", fill=True)
ax = sns.kdeplot(tcom_data.av_rech_amt_data_8[(tcom_data["churn"] == 1)],
                ax=ax, color="Blue", fill=True)
ax.legend(["No-Churn","Churn"],loc='upper right')
ax.set_ylabel('Density')
# The x axis shows av_rech_amt_data_8, not a volume based cost
ax.set_xlabel('Average recharge amount for data (month 8)')
ax.set_title('Distribution of Average Recharge Amount for Data by churn')
plt.show()

In [None]:
# Creating categories for month 8 column totalrecharge and their count
tcom_data['total_rech_data_group_8']=pd.cut(tcom_data['total_rech_data_8'],[-1,0,10,25,100],labels=["No_Recharge","<=10_Recharges","10-25_Recharges",">25_Recharges"])
tcom_data['total_rech_num_group_8']=pd.cut(tcom_data['total_rech_num_8'],[-1,0,10,25,1000],labels=["No_Recharge","<=10_Recharges","10-25_Recharges",">25_Recharges"])

In [None]:
# Plotting the results

plt.figure(figsize=[12,4])
sns.countplot(data=tcom_data,x="total_rech_data_group_8",hue="churn")
print("\t\t\t\t\tDistribution of total_rech_data_8 variable\n",tcom_data['total_rech_data_group_8'].value_counts())
plt.show()
plt.figure(figsize=[12,4])
sns.countplot(data=tcom_data,x="total_rech_num_group_8",hue="churn")
print("\t\t\t\t\tDistribution of total_rech_num_8 variable\n",tcom_data['total_rech_num_group_8'].value_counts())
plt.show()

In [None]:
# One-hot encode the grouped/categorical variables, dropping the first level of each.
# dtype=int forces 0/1 integer columns: recent pandas versions return boolean
# dummies by default, and a design matrix mixing bool and float columns is cast to
# dtype object, which the statsmodels GLM fitted later does not accept.
dummy = pd.get_dummies(tcom_data[['total_rech_data_group_8','total_rech_num_group_8','tenure_range']],
                       drop_first=True, dtype=int)
dummy.head()

In [None]:
# Adding the results to the master dataframe
tcom_data = pd.concat([tcom_data, dummy], axis=1)
tcom_data.head()

In [None]:
# Creating a copy of the filtered dataframe

df=tcom_data[:].copy()

# Dropping unwanted columns
df.drop(['tenure_range','mobile_number','total_rech_data_group_8','total_rech_num_group_8','sep_vbc_3g','tenure'], axis=1, inplace=True)

In [None]:
# Cheking the dataset
df.head()

In [None]:
# lets create X dataset for model building.
X = df.drop(['churn'],axis=1)

In [None]:
X.head()

In [None]:
y=df['churn']
y.head()

In [None]:
# split the dateset into train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=1)
print("Dimension of X_train:", X_train.shape)
print("Dimension of X_test:", X_test.shape)

In [None]:
X_train.info(verbose=True)

In [None]:
num_col = X_train.select_dtypes(include = ['int64','float64']).columns.tolist()

In [None]:
# apply scaling on the dataset
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train[num_col] = scaler.fit_transform(X_train[num_col])

In [None]:
X_train.head()

#### Since the rate of churn is typically low (about 5-10%; this is called class imbalance), the SMOTE technique is used to handle the class imbalance

##### Handling Data Imbalance

In [None]:
from imblearn.over_sampling import SMOTE

# The sampler is named `smote` rather than `sm`: `sm` is bound to statsmodels.api
# a few cells later, and sharing the name breaks whichever cell is re-run second.
smote = SMOTE(random_state=42)

# Oversample the minority class on the training split only
X_train_sm,y_train_sm = smote.fit_resample(X_train,y_train)

In [None]:
print("Dimension of X_train_sm Shape:", X_train_sm.shape)
print("Dimension of y_train_sm Shape:", y_train_sm.shape)

### Logistic Regression

In [None]:
# Importing necessary libraries for Model creation
import statsmodels.api as sm

In [None]:
# Logistic regression model
logm1 = sm.GLM(y_train_sm,(sm.add_constant(X_train_sm)), family = sm.families.Binomial())
logm1.fit().summary()

##### Using RFE

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

from sklearn.feature_selection import RFE

# Running RFE with 20 variables as output.
# `n_features_to_select` sets how many features are kept; `step` (used here before)
# only sets how many features are removed per iteration and leaves the number of
# selected features at its default of half of the input columns.
rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(X_train_sm, y_train_sm)

In [None]:
rfe.support_

In [None]:
rfe_columns=X_train_sm.columns[rfe.support_]
print("The selected columns by RFE for modelling are: \n\n",rfe_columns)

In [None]:
list(zip(X_train_sm.columns, rfe.support_, rfe.ranking_))

In [None]:
X_train_SM = sm.add_constant(X_train_sm[rfe_columns])
logm2 = sm.GLM(y_train_sm,X_train_SM, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# From the p-value of the individual columns,
    # we can drop the column 'loc_ic_t2t_mou_8' as it has high p-value of 0.80
# rfe_columns is a pandas Index: Index.drop takes no axis, and the stray positional
# `1` was being passed as its `errors` argument.
rfe_columns_1=rfe_columns.drop('loc_ic_t2t_mou_8')
print("\nThe new set of edited featured are:\n",rfe_columns_1)

In [None]:
# Training the model with the edited feature list
X_train_SM = sm.add_constant(X_train_sm[rfe_columns_1])
logm2 = sm.GLM(y_train_sm,X_train_SM, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# From the p-value of the individual columns,
    # we can drop the column 'loc_ic_t2m_mou_8' as it has high p-value of 0.80
# rfe_columns_1 is a pandas Index: Index.drop takes no axis, and the stray positional
# `1` was being passed as its `errors` argument.
rfe_columns_2=rfe_columns_1.drop('loc_ic_t2m_mou_8')
print("\nThe new set of edited featured are:\n",rfe_columns_2)

In [None]:
# Training the model with the edited feature list
X_train_SM = sm.add_constant(X_train_sm[rfe_columns_2])
logm2 = sm.GLM(y_train_sm,X_train_SM, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set
y_train_sm_pred = res.predict(X_train_SM)
y_train_sm_pred = y_train_sm_pred.values.reshape(-1)
y_train_sm_pred[:10]

##### Creating a dataframe with the actual churn flag and the predicted probabilities

In [None]:
y_train_sm_pred_final = pd.DataFrame({'Converted':y_train_sm.values, 'Converted_prob':y_train_sm_pred})
y_train_sm_pred_final.head()

In [None]:
##### Creating new column 'churn_pred' with 1 if Churn_Prob > 0.5 else 0

In [None]:
y_train_sm_pred_final['churn_pred'] = y_train_sm_pred_final.Converted_prob.map(lambda x: 1 if x > 0.5 else 0)

# Viewing the prediction results
y_train_sm_pred_final.head()

##### Confusion matrix 

In [None]:
from sklearn import metrics


confusion = metrics.confusion_matrix(y_train_sm_pred_final.Converted, y_train_sm_pred_final.churn_pred )
print(confusion)

In [None]:
# Checking the overall accuracy.
print("The overall accuracy of the model is:",metrics.accuracy_score(y_train_sm_pred_final.Converted, y_train_sm_pred_final.churn_pred))

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs.
# The VIFs are computed on exactly the columns they are labelled with (rfe_columns_2);
# computing them on the larger rfe_columns matrix paired each name with the VIF of
# a different column and included features already removed from the model.
vif_features = X_train_sm[rfe_columns_2]
vif_matrix = vif_features.values.astype(float)

vif = pd.DataFrame()
vif['Features'] = vif_features.columns
vif['VIF'] = [variance_inflation_factor(vif_matrix, i) for i in range(vif_matrix.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
TP = confusion[1,1] # true positive 
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives

In [None]:
# Sensitivity (recall): share of actual churners that are predicted as churn
print("Sensitivity = ",TP / float(TP+FN))

# Specificity: share of actual non-churners that are predicted as non-churn
print("Specificity = ",TN / float(TN+FP))

# False positive rate - predicting churn when the customer has not churned
print("False Positive Rate = ",FP/ float(TN+FP))

# Positive predictive value
print ("Precision = ",TP / float(TP+FP))

# Negative predictive value: share of non-churn predictions that are correct.
# TN / (TN + FN) is not a "true negative rate" (that is the specificity above),
# so it is labelled with its actual name.
print ("Negative Predictive Value = ",TN / float(TN+ FN))

##### Plotting the ROC Curve

In [None]:
# Defining a function to plot the roc curve
def draw_roc( actual, probs ):
    """Plot the ROC curve of a set of predicted probabilities.

    Parameters
    ----------
    actual : array-like
        True binary labels.
    probs : array-like
        Predicted scores / probabilities for the positive class.

    Returns
    -------
    None
        The curve, with the AUC shown in the legend, is drawn with matplotlib.
    """
    # drop_intermediate=False keeps every threshold so the full curve is drawn
    fpr, tpr, thresholds = metrics.roc_curve( actual, probs,
                                              drop_intermediate = False )
    auc_score = metrics.roc_auc_score( actual, probs )
    plt.figure(figsize=(5, 5))
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    # Diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    # False positive rate = 1 - true negative rate (specificity)
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    return None

In [None]:
# Defining the variables to plot the curve
fpr, tpr, thresholds = metrics.roc_curve( y_train_sm_pred_final.Converted, y_train_sm_pred_final.Converted_prob, drop_intermediate = False )

In [None]:
# Plotting the curve for the obtained metrics
draw_roc(y_train_sm_pred_final.Converted, y_train_sm_pred_final.Converted_prob)

In [None]:
# Let's create columns with different probability cutoffs 
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
    y_train_sm_pred_final[i]= y_train_sm_pred_final.Converted_prob.map(lambda x: 1 if x > i else 0)
y_train_sm_pred_final.head()

In [None]:
# Accuracy, sensitivity and specificity for each probability cutoff.
cutoff_df = pd.DataFrame( columns = ['probability','accuracy','sensitivity','specificity'])
from sklearn.metrics import confusion_matrix

# Confusion matrix layout: [[TN, FP],
#                           [FN, TP]]
num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for cutoff in num:
    tn, fp, fn, tp = metrics.confusion_matrix(y_train_sm_pred_final.Converted,
                                              y_train_sm_pred_final[cutoff]).ravel()

    accuracy = (tn + tp) / (tn + fp + fn + tp)
    specificity = tn / (tn + fp)
    sensitivity = tp / (fn + tp)

    cutoff_df.loc[cutoff] = [cutoff, accuracy, sensitivity, specificity]
print(cutoff_df)

In [None]:
# plotting accuracy sensitivity and specificity for various probabilities calculated above.
cutoff_df.plot.line(x='probability', y=['accuracy','sensitivity','specificity'])
plt.show()

In [None]:
# Let's create columns with refined probability cutoffs 
numbers = [0.50,0.51,0.52,0.53,0.54,0.55,0.56,0.57,0.58,0.59]
for i in numbers:
    y_train_sm_pred_final[i]= y_train_sm_pred_final.Converted_prob.map(lambda x: 1 if x > i else 0)
y_train_sm_pred_final.head()

In [None]:
# Accuracy, sensitivity and specificity for each of the refined probability cutoffs.
cutoff_df = pd.DataFrame( columns = ['probability','accuracy','sensitivity','specificity'])
from sklearn.metrics import confusion_matrix

# Confusion matrix layout: [[TN, FP],
#                           [FN, TP]]
num = [0.50,0.51,0.52,0.53,0.54,0.55,0.56,0.57,0.58,0.59]
for cutoff in num:
    tn, fp, fn, tp = metrics.confusion_matrix(y_train_sm_pred_final.Converted,
                                              y_train_sm_pred_final[cutoff]).ravel()

    accuracy = (tn + tp) / (tn + fp + fn + tp)
    specificity = tn / (tn + fp)
    sensitivity = tp / (fn + tp)

    cutoff_df.loc[cutoff] = [cutoff, accuracy, sensitivity, specificity]
print(cutoff_df)

In [None]:
# Plot the three metrics from cutoff_df against the refined probability cutoff
metric_cols = ['accuracy', 'sensitivity', 'specificity']
cutoff_df.plot.line(x='probability', y=metric_cols)
plt.show()

In [None]:
# Final prediction column: 1 when the predicted probability exceeds the 0.52 cutoff, else 0
y_train_sm_pred_final['final_churn_pred'] = y_train_sm_pred_final["Converted_prob"].gt(0.52).astype("int64")

y_train_sm_pred_final.head()

In [None]:
# Overall accuracy of final_churn_pred against the actual labels
train_accuracy = metrics.accuracy_score(
    y_train_sm_pred_final["Converted"],
    y_train_sm_pred_final["final_churn_pred"],
)
print("The overall accuracy of the model now is:", train_accuracy)

In [None]:
# Confusion matrix of the actual labels versus final_churn_pred
confusion2 = metrics.confusion_matrix(
    y_train_sm_pred_final["Converted"],
    y_train_sm_pred_final["final_churn_pred"],
)
print(confusion2)

In [None]:
# Unpack the 2x2 confusion matrix:
#   first row  -> true negatives, false positives
#   second row -> false negatives, true positives
(TN2, FP2), (FN2, TP2) = confusion2

# Sensitivity: share of actual positives that were predicted positive
print("Sensitivity = ", TP2 / (TP2 + FN2))

# Specificity: share of actual negatives that were predicted negative
print("Specificity = ", TN2 / (TN2 + FP2))

# False positive rate: share of actual negatives that were predicted positive
print("False Positive Rate = ", FP2 / (TN2 + FP2))

# Positive predictive value
print("Precision = ", TP2 / (TP2 + FP2))

# Negative predictive value
print("True Negative Prediction Rate = ", TN2 / (TN2 + FN2))

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
# Precision and recall of the predicted probabilities at each threshold
p, r, thresholds = precision_recall_curve(
    y_train_sm_pred_final["Converted"],
    y_train_sm_pred_final["Converted_prob"],
)

# Plot precision (green) and recall (red) against the threshold; the last
# element of each curve is dropped so the lengths match `thresholds`.
for curve, line_format in ((p, "g-"), (r, "r-")):
    plt.plot(thresholds, curve[:-1], line_format)
plt.show()

##### Making Predictions on the test set

In [None]:
# Scale the numeric test columns with the existing scaler (transform only, no refit)
numeric_test = X_test[num_col]
X_test[num_col] = scaler.transform(numeric_test)
X_test.head()

In [None]:
# Feature selection: restrict the test set to the entries of rfe_columns_2
# (presumably the columns retained by the earlier RFE step — confirm).
X_test=X_test[rfe_columns_2]
X_test.head()

In [None]:
# Add the constant column to the test features via sm.add_constant
X_test_SM = sm.add_constant(X_test)

In [None]:
# Predict with `res` on the constant-augmented test features and print the first ten values
y_test_pred = res.predict(X_test_SM)
print("\n The first ten probability value of the prediction are:\n",y_test_pred[:10])

In [None]:
# Wrap the test predictions in a DataFrame
y_pred = pd.DataFrame(y_test_pred)
y_pred.head()

In [None]:
# Rename the column labelled 0 to "Conv_prob"
y_pred=y_pred.rename(columns = {0:"Conv_prob"})

In [None]:
# Wrap the actual test labels in a DataFrame
y_test_df = pd.DataFrame(y_test)
y_test_df.head()

In [None]:
# Put the actual labels and the predicted probabilities side by side
# (axis=1 concatenates column-wise, aligning rows on the index)
y_pred_final = pd.concat([y_test_df,y_pred],axis=1)
y_pred_final.head()

In [None]:
# Test-set prediction: 1 when the predicted probability exceeds the 0.54 cutoff, else 0
# NOTE(review): the training-set `final_churn_pred` column is built with a 0.52 cutoff —
# confirm that using a different cutoff on the test set is intentional.
y_pred_final['test_churn_pred'] = y_pred_final["Conv_prob"].gt(0.54).astype("int64")
y_pred_final.head()

In [None]:
# Overall accuracy of test_churn_pred against the actual churn labels
metrics.accuracy_score(y_pred_final["churn"], y_pred_final["test_churn_pred"])

In [None]:
# Confusion matrix of the actual churn labels versus test_churn_pred
confusion2_test = metrics.confusion_matrix(
    y_pred_final["churn"],
    y_pred_final["test_churn_pred"],
)
print("Confusion Matrix\n", confusion2_test)

In [None]:
# Unpack the 2x2 test confusion matrix:
#   first row  -> true negatives, false positives
#   second row -> false negatives, true positives
(TN3, FP3), (FN3, TP3) = confusion2_test

In [None]:
# Sensitivity: share of actual positives that were predicted positive
print("Sensitivity = ", TP3 / (TP3 + FN3))

# Specificity: share of actual negatives that were predicted negative
print("Specificity = ", TN3 / (TN3 + FP3))

# False positive rate: share of actual negatives that were predicted positive
print("False Positive Rate = ", FP3 / (TN3 + FP3))

# Positive predictive value
print("Precision = ", TP3 / (TP3 + FP3))

# Negative predictive value
print("True Negative Prediction Rate = ", TN3 / (TN3 + FN3))

In [None]:
# Headline test-set metrics, printed as percentages.
# The fraction is scaled to a percentage *before* rounding: rounding first and then
# multiplying by 100 reintroduces floating-point noise in the printed value
# (for example 0.57 * 100 evaluates to 56.99999999999999).
test_accuracy = metrics.accuracy_score(y_pred_final.churn, y_pred_final.test_churn_pred)
test_sensitivity = TP3 / float(TP3+FN3)
print("The accuracy of the predicted model is: ",round(test_accuracy*100,2),"%")
print("The sensitivity of the predicted model is: ",round(test_sensitivity*100,2),"%")

print("\nAs the model created is based on a sensitivity model, i.e. the True positive rate is given more importance as the actual and prediction of churn by a customer\n") 

In [None]:
# ROC curve for the test dataset

# ROC points for the test labels and predicted probabilities;
# drop_intermediate=False keeps every threshold.
fpr, tpr, thresholds = metrics.roc_curve(
    y_pred_final["churn"],
    y_pred_final["Conv_prob"],
    drop_intermediate=False,
)
# Draw the curve from the same labels and probabilities
draw_roc(y_pred_final["churn"], y_pred_final["Conv_prob"])

### Logistic Regression using PCA

In [None]:
# Split the dataset into train (70%) and test (30%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=1
)
print("Dimension of X_train:", X_train.shape)
print("Dimension of X_test:", X_test.shape)

# Min-max scale the numeric columns: the scaler is fitted on the training
# split only and then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train[num_col])
X_train[num_col] = scaler.transform(X_train[num_col])
X_test[num_col] = scaler.transform(X_test[num_col])

# Resample the training split with SMOTE to correct the class imbalance
# NOTE(review): this rebinds `sm` to the SMOTE object, while an earlier cell calls
# `sm.add_constant(...)`; re-running that cell after this one would presumably fail —
# confirm and consider a distinct name.
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
print("Dimension of X_train_sm Shape:", X_train_sm.shape)
print("Dimension of y_train_sm Shape:", y_train_sm.shape)

X_train_sm.head()

In [None]:
# Import PCA and create an instance with a fixed random_state; n_components is
# left at its default.
from sklearn.decomposition import PCA
pca = PCA(random_state=42)

# Fit the PCA on the resampled training features
pca.fit(X_train_sm)

In [None]:
# Project the resampled training features onto the principal components.
# Note: fit_transform fits `pca` again on X_train_sm before transforming.
X_train_sm_pca=pca.fit_transform(X_train_sm)
print("Dimension of X_train_sm_pca: ",X_train_sm_pca.shape)

# Project the test features with the components fitted on the training data (no refit)
X_test_pca=pca.transform(X_test)
print("Dimension of X_test_pca: ",X_test_pca.shape)

In [None]:
# View the fitted PCA components (the `components_` attribute of `pca`)
pca.components_

In [None]:
#### Performing Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Fit a default logistic regression on the PCA-projected training data
logreg_pca = LogisticRegression().fit(X_train_sm_pca, y_train_sm)

# Class predictions for the PCA-projected test data
y_pred = logreg_pca.predict(X_test_pca)

# Keep the predictions in a DataFrame as well and report its shape
y_pred_df = pd.DataFrame(y_pred)
print("Dimension of y_pred_df:", y_pred_df.shape)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of the actual test labels versus the predictions
pca_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matirx for y_test & y_pred\n", pca_confusion, "\n")

# Accuracy of the predictions on the test set
pca_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the logistic regression model with PCA: ", pca_accuracy)

In [None]:
# Bar chart of the explained-variance ratio of each principal component (numbered from 1)
variance_ratio = pca.explained_variance_ratio_
component_numbers = range(1, len(variance_ratio) + 1)
plt.bar(component_numbers, variance_ratio)
plt.show()

In [None]:
# Cumulative explained-variance ratio across the principal components
var_cumu = pca.explained_variance_ratio_.cumsum()

# Scree plot of the cumulative explained variance
fig = plt.figure(figsize=(12, 7))
plt.plot(var_cumu)
plt.xlabel('no of principal components')
plt.ylabel('explained variance - cumulative')
plt.show()

In [None]:
# Cumulative explained variance in percent (each ratio rounded to 3 decimals first)
(np.round(pca.explained_variance_ratio_, decimals=3) * 100).cumsum()

##### **90% of the variance in the data can be explained with 8 PCA components**

##### **Fitting the dataset with the 8 explainable components**

In [None]:
# Reduced PCA used for the second logistic-regression model.
# random_state is fixed, as for the full PCA fitted earlier in this notebook, so the
# projection is reproducible if scikit-learn selects a randomized SVD solver.
# NOTE(review): the surrounding headings and the `_8` suffix refer to 8 components,
# but 15 components are kept here — confirm which count is intended.
pca_8 = PCA(n_components=15, random_state=42)

# Fit on the resampled training features and project them
train_pca_8 = pca_8.fit_transform(X_train_sm)
print("Dimension for Train dataset using PCA: ", train_pca_8.shape)

# Project the test features with the components fitted on the training data (no refit)
test_pca_8 = pca_8.transform(X_test)
print("Dimension for Test dataset using PCA: ", test_pca_8.shape)

In [None]:
# Fit a default logistic regression on the reduced PCA training projection
logreg_pca_8 = LogisticRegression().fit(train_pca_8, y_train_sm)

# Class predictions for the reduced PCA test projection
y_pred_8 = logreg_pca_8.predict(test_pca_8)

# Keep the predictions in a DataFrame as well and report its shape
y_pred_df_8 = pd.DataFrame(y_pred_8)
print("Dimension of y_pred_df_8: ", y_pred_df_8.shape)

In [None]:
# Confusion matrix of the actual test labels versus the reduced-PCA predictions
pca_8_confusion = confusion_matrix(y_test, y_pred_8)
print("Confusion Matirx for y_test & y_pred\n", pca_8_confusion, "\n")

# Accuracy of the reduced-PCA predictions on the test set
pca_8_accuracy = accuracy_score(y_test, y_pred_8)
print("Accuracy of the logistic regression model with PCA: ", pca_8_accuracy)