# **Exploratory Data Analysis**

## **Handling Missing Values**

In [4]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [5]:
# Location of the churn dataset. Set the CHURN_DATA_PATH environment variable to
# point at your own copy of the CSV; when it is unset, the original author's
# local path is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "CHURN_DATA_PATH",
    '/Users/adityakumbhar/Developer/Datasets/Churn_Modelling_EDA.csv',
)

# Load the raw data; `df` is kept unmodified and every later step derives a new frame from it.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,customerID,gender,age,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,44.0,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,53.0,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,38.0,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,56.0,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,45.0,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Summarise dtypes and non-null counts per column; columns whose non-null count
# is below the row count (here `gender` and `age`) contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            6950 non-null   object 
 2   age               6241 non-null   float64
 3   SeniorCitizen     7043 non-null   int64  
 4   Partner           7043 non-null   object 
 5   Dependents        7043 non-null   object 
 6   tenure            7043 non-null   int64  
 7   PhoneService      7043 non-null   object 
 8   MultipleLines     7043 non-null   object 
 9   InternetService   7043 non-null   object 
 10  OnlineSecurity    7043 non-null   object 
 11  OnlineBackup      7043 non-null   object 
 12  DeviceProtection  7043 non-null   object 
 13  TechSupport       7043 non-null   object 
 14  StreamingTV       7043 non-null   object 
 15  StreamingMovies   7043 non-null   object 
 16  Contract          7043 non-null   object 


### **Drop Columns and Rows**

In [6]:
# Strategy 1a: discard every column that holds at least one missing value.
# `df` itself is left untouched; the result is a new frame (22 -> 19 columns
# in the recorded output, all rows kept).
updated_df_c = df.dropna(axis="columns")
updated_df_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [7]:
# Strategy 1b: discard every row that holds at least one missing value.
# All columns are kept, but rows are lost (7043 -> 6228 in the recorded output).
# The original index labels are preserved, so the index is no longer contiguous.
updated_df_r = df.dropna(axis="index")
updated_df_r.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6228 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        6228 non-null   object 
 1   gender            6228 non-null   object 
 2   age               6228 non-null   float64
 3   SeniorCitizen     6228 non-null   int64  
 4   Partner           6228 non-null   object 
 5   Dependents        6228 non-null   object 
 6   tenure            6228 non-null   int64  
 7   PhoneService      6228 non-null   object 
 8   MultipleLines     6228 non-null   object 
 9   InternetService   6228 non-null   object 
 10  OnlineSecurity    6228 non-null   object 
 11  OnlineBackup      6228 non-null   object 
 12  DeviceProtection  6228 non-null   object 
 13  TechSupport       6228 non-null   object 
 14  StreamingTV       6228 non-null   object 
 15  StreamingMovies   6228 non-null   object 
 16  Contract          6228 non-null   object 
 17  

### **Imputing Values**
1. If the values are numerical, impute them with the mean or median.  
2. If the values are categorical, impute them with the mode.

In [6]:
# Impute on a copy so the raw frame `df` is never modified.
updated_df = df.copy()

# Candidate fill values for the numeric `age` column (both skip NaN by default).
# NOTE: the name `meadian_age` (sic) is kept as-is because the following cell reads it.
mean_age, meadian_age = updated_df['age'].mean(), updated_df['age'].median()

print(f"Mean Age: {mean_age} \nMedian Age: {meadian_age}")

Mean Age: 42.63178977727928 
Median Age: 43.0


In [7]:
# Replace the missing ages with the median computed above; `assign` returns a
# new frame with the `age` column swapped out, leaving every other column as is.
updated_df = updated_df.assign(age=updated_df['age'].fillna(meadian_age))
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            6950 non-null   object 
 2   age               7043 non-null   float64
 3   SeniorCitizen     7043 non-null   int64  
 4   Partner           7043 non-null   object 
 5   Dependents        7043 non-null   object 
 6   tenure            7043 non-null   int64  
 7   PhoneService      7043 non-null   object 
 8   MultipleLines     7043 non-null   object 
 9   InternetService   7043 non-null   object 
 10  OnlineSecurity    7043 non-null   object 
 11  OnlineBackup      7043 non-null   object 
 12  DeviceProtection  7043 non-null   object 
 13  TechSupport       7043 non-null   object 
 14  StreamingTV       7043 non-null   object 
 15  StreamingMovies   7043 non-null   object 
 16  Contract          7043 non-null   object 


In [8]:
# Most frequent value(s) of the categorical `gender` column.
# `Series.mode()` returns a Series (there can be ties), which is why the printed
# output shows an index and dtype rather than a bare string.
mode_gender = updated_df['gender'].mode()
print(f"Mode of gender: {mode_gender}")

Mode of gender: 0    Male
Name: gender, dtype: object


In [9]:
# Impute missing `gender` values with the most frequent category.
# The fill value is derived from the column itself instead of a hand-copied
# literal, so the cell stays correct if the data changes. `mode()` returns its
# values sorted, so `.iloc[0]` is the first one when several categories tie.
gender_mode = updated_df['gender'].mode().iloc[0]
updated_df['gender'] = updated_df['gender'].fillna(gender_mode)
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   age               7043 non-null   float64
 3   SeniorCitizen     7043 non-null   int64  
 4   Partner           7043 non-null   object 
 5   Dependents        7043 non-null   object 
 6   tenure            7043 non-null   int64  
 7   PhoneService      7043 non-null   object 
 8   MultipleLines     7043 non-null   object 
 9   InternetService   7043 non-null   object 
 10  OnlineSecurity    7043 non-null   object 
 11  OnlineBackup      7043 non-null   object 
 12  DeviceProtection  7043 non-null   object 
 13  TechSupport       7043 non-null   object 
 14  StreamingTV       7043 non-null   object 
 15  StreamingMovies   7043 non-null   object 
 16  Contract          7043 non-null   object 


### **Forward and Backward Fill**

1. Forward fill copies the last valid (previous) value forward into the missing records that follow it.  
2. Backward fill copies the next valid value backward into the missing records that precede it.

In [None]:
# Forward-fill the raw frame: each missing value takes the last valid value
# above it in the same column. A missing value in the very first row would stay
# missing because nothing precedes it (not the case in the recorded output).
updated_df_f = df.ffill()  # use df.bfill() instead for a backward fill
updated_df_f.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   age               7043 non-null   float64
 3   SeniorCitizen     7043 non-null   int64  
 4   Partner           7043 non-null   object 
 5   Dependents        7043 non-null   object 
 6   tenure            7043 non-null   int64  
 7   PhoneService      7043 non-null   object 
 8   MultipleLines     7043 non-null   object 
 9   InternetService   7043 non-null   object 
 10  OnlineSecurity    7043 non-null   object 
 11  OnlineBackup      7043 non-null   object 
 12  DeviceProtection  7043 non-null   object 
 13  TechSupport       7043 non-null   object 
 14  StreamingTV       7043 non-null   object 
 15  StreamingMovies   7043 non-null   object 
 16  Contract          7043 non-null   object 


## **Feature Scaling**

#### Importing Libraries

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns

### **Normalization**

In [10]:
# Preview the imputed frame (age and gender filled) that the scaling steps start from.
updated_df.head()

Unnamed: 0,customerID,gender,age,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,44.0,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,53.0,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,38.0,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,56.0,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,45.0,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [12]:
# Numeric features to be rescaled; they live on very different ranges
# (age in years, tenure in months), which is what scaling is meant to address.
scaling_columns = ['age', 'tenure']
new_df = updated_df[scaling_columns]
new_df.head()

Unnamed: 0,age,tenure
0,44.0,1
1,53.0,34
2,38.0,2
3,56.0,45
4,45.0,2


In [15]:
# Summary statistics of the two features; the min/max rows give the ranges
# that min-max normalization maps onto [0, 1].
new_df.describe()

Unnamed: 0,age,tenure
count,7043.0,7043.0
mean,42.673719,32.371149
std,9.751035,24.559481
min,25.0,0.0
25%,35.0,9.0
50%,43.0,29.0
75%,51.0,55.0
max,60.0,72.0


**Max age:** 60 and **Min age:** 25.  
**Max tenure:** 72 and **Min tenure:** 0.  


In [13]:
# Min-max normalization: each column is rescaled to [0, 1] via
# (x - min) / (max - min), e.g. age 44 -> (44 - 25) / (60 - 25) ~= 0.543.
scaler = MinMaxScaler()
scaler.fit(new_df)  # learn the per-column min and max

# The result is a NumPy array (column labels are dropped), ordered as [age, tenure].
normalized_df = scaler.transform(new_df)
normalized_df

array([[0.54285714, 0.01388889],
       [0.8       , 0.47222222],
       [0.37142857, 0.02777778],
       ...,
       [0.02857143, 0.15277778],
       [0.57142857, 0.05555556],
       [0.22857143, 0.91666667]], shape=(7043, 2))

### **Standardization**

In [16]:
# Standardization (z-score): each column becomes (x - mean) / std.
# StandardScaler uses the population standard deviation (ddof=0), so the divisor
# differs slightly from the sample `std` reported by `describe()`.
# NOTE(review): this rebinds `scaler` and `normalized_df`, replacing the min-max
# scaler and its output from the Normalization cell; despite the name, the array
# below holds standardized values. Consider distinct names if both are needed.
scaler = StandardScaler().fit(new_df)
normalized_df = scaler.transform(new_df)
normalized_df

array([[ 0.13602409, -1.27744458],
       [ 1.0590686 ,  0.06632742],
       [-0.47933892, -1.23672422],
       ...,
       [-1.71006494, -0.87024095],
       [ 0.23858459, -1.15528349],
       [-0.99214143,  1.36937906]], shape=(7043, 2))