# Team Epic 
# Telecommunication - Customer Churn Analysis

## Data Preparation

### Importing tha required liberaries <a class= "anchor" id="h0"></a>

In [65]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

### Loading the Data set

In [66]:
churn_df = pd.read_csv("Customer-Churn-Prediction.csv")
churn_df

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107.0,area_code_415,no,yes,26.0,161.6,123.0,27.47,195.5,103.0,16.62,254.4,103.0,11.45,13.7,3.0,3.70,1.0,no
1,NJ,137.0,area_code_415,no,no,0.0,243.4,114.0,41.38,121.2,110.0,10.30,162.6,104.0,7.32,12.2,5.0,3.29,0.0,no
2,OH,84.0,area_code_408,yes,no,0.0,299.4,71.0,50.90,61.9,88.0,5.26,196.9,89.0,8.86,6.6,7.0,1.78,2.0,no
3,OK,75.0,area_code_415,yes,no,0.0,166.7,113.0,28.34,148.3,122.0,12.61,186.9,121.0,8.41,10.1,3.0,2.73,3.0,no
4,MA,121.0,area_code_510,no,yes,24.0,218.2,88.0,37.09,348.5,108.0,29.62,212.6,118.0,9.57,7.5,7.0,2.03,3.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4245,MT,83.0,area_code_415,no,no,0.0,188.3,70.0,32.01,243.8,88.0,20.72,213.7,79.0,9.62,10.3,6.0,2.78,0.0,no
4246,WV,73.0,area_code_408,no,no,0.0,177.9,89.0,30.24,131.2,82.0,11.15,186.2,89.0,8.38,11.5,6.0,3.11,3.0,no
4247,NC,75.0,area_code_408,no,no,0.0,170.7,101.0,29.02,193.1,126.0,16.41,129.1,104.0,5.81,6.9,7.0,1.86,1.0,no
4248,HI,50.0,area_code_408,no,yes,40.0,235.7,127.0,40.07,223.0,126.0,18.96,297.5,116.0,13.39,9.9,5.0,2.67,2.0,no


### Feature description of dataset

In [67]:
churn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4250 entries, 0 to 4249
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   state                          4232 non-null   object 
 1   account_length                 4216 non-null   float64
 2   area_code                      4234 non-null   object 
 3   international_plan             4250 non-null   object 
 4   voice_mail_plan                4237 non-null   object 
 5   number_vmail_messages          4216 non-null   float64
 6   total_day_minutes              4240 non-null   float64
 7   total_day_calls                4248 non-null   float64
 8   total_day_charge               4242 non-null   float64
 9   total_eve_minutes              4215 non-null   float64
 10  total_eve_calls                4233 non-null   float64
 11  total_eve_charge               4242 non-null   float64
 12  total_night_minutes            4248 non-null   f

### Conversion of categorical variables into numerical variables  <a class= "anchor" id="h1"></a>

In [68]:
cat_col = ['international_plan', 'voice_mail_plan', 'area_code', 'state']
for i in cat_col:
    churn_df[i]= churn_df[i].astype('category').cat.codes

### Handling missing values  <a class= "anchor" id="h2"></a>

In [70]:
churn_df.isnull().sum()

state                             0
account_length                   34
area_code                         0
international_plan                0
voice_mail_plan                   0
number_vmail_messages            34
total_day_minutes                10
total_day_calls                   2
total_day_charge                  8
total_eve_minutes                35
total_eve_calls                  17
total_eve_charge                  8
total_night_minutes               2
total_night_calls                 5
total_night_charge                7
total_intl_minutes                5
total_intl_calls                 13
total_intl_charge                30
number_customer_service_calls     3
churn                            22
dtype: int64

### Replacing NULL values in numerical Columns using Median 

In [71]:
for col in churn_df.columns[~churn_df.columns.isin(['state','area_code','churn'])]:
    churn_df[col] =  churn_df[col].fillna(churn_df[col].median())

### Replacing NULL values in categorical columns with mode.

In [72]:
for col in churn_df[['state','area_code','churn']]:
    churn_df[col] =  churn_df[col].fillna(churn_df[col].mode()[0])

### Checking for other null values

In [73]:
churn_df.isnull().sum()

state                            0
account_length                   0
area_code                        0
international_plan               0
voice_mail_plan                  0
number_vmail_messages            0
total_day_minutes                0
total_day_calls                  0
total_day_charge                 0
total_eve_minutes                0
total_eve_calls                  0
total_eve_charge                 0
total_night_minutes              0
total_night_calls                0
total_night_charge               0
total_intl_minutes               0
total_intl_calls                 0
total_intl_charge                0
number_customer_service_calls    0
churn                            0
dtype: int64

### Imbalancing of data 

In [74]:
(churn_df['churn'].value_counts().index[0], churn_df['churn'].value_counts().values[0])

('no', 3656)

In [75]:
(churn_df['churn'].value_counts().index[1], churn_df['churn'].value_counts().values[1])

('yes', 594)

### Selection of Feature variables and target variable<a class= "anchor" id="h4"></a>

In [76]:
X = churn_df.drop(["churn"], axis=1)
y = churn_df["churn"]

### Converitng traget variable into numerical by Label Encoder

In [77]:
le=LabelEncoder()
y = le.fit_transform(y)

### Feature Scaling 

In [78]:
scale = StandardScaler()
X = scale.fit_transform(X)
X.shape

(4250, 19)

### Applying Principal Component Analysis


In [96]:
pca = PCA(random_state = 10, n_components = 10)
pca_df = pca.fit_transform(X, y)
pca_df.shape

(4250, 10)

In [97]:
pca.explained_variance_ratio_

array([0.10841386, 0.10650035, 0.10505692, 0.10294925, 0.10060388,
       0.05555349, 0.05527865, 0.05356955, 0.05264542, 0.05223979])

### Splitting tha balanced dataset using train_test_split.<a class= "anchor" id="h6"></a>

In [98]:
from sklearn.model_selection import train_test_split
X_train, y_train, X_test, y_test = train_test_split(pca_df, y, random_state= 20, test_size= 0.3)

In [99]:
print("Size of the traintest :", X_train.shape , y_train.shape)

Size of the traintest : (2975, 10) (1275, 10)


In [100]:
print("Size of the testset :", X_test.shape , y_test.shape)

Size of the testset : (2975,) (1275,)
