## Feature Transformation using Sklearn with Artificial Neural Network (ANN)

In [1]:
# Importing required libraries

import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

In [2]:
# Read the churn dataset from disk and preview its first rows

csv_path = "dataset/ChurnPrediction.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Dimensions of the loaded dataset as (rows, columns)
data.shape

(10000, 14)

### Data Preprocessing

In [4]:
# Remove the columns this notebook treats as irrelevant for modelling

columns_to_drop = ['RowNumber', 'CustomerId', 'Surname']
data = data.drop(columns=columns_to_drop)
data.head(3)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [5]:
# Integer-encode the Gender column

# NOTE: re-running this cell would refit the encoder on the already-encoded integers,
# because `data['Gender']` is overwritten below.
label_encoder = LabelEncoder().fit(data['Gender'])  # learn the distinct labels first
data['Gender'] = label_encoder.transform(data['Gender'])  # then replace each label with its integer code
data.head(3)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1


In [6]:
# Frequency of each encoded Gender value (the preview above shows Female encoded as 0)
data['Gender'].value_counts()

Gender
1    5457
0    4543
Name: count, dtype: int64

In [7]:
# One-hot encode the Geography column

one_hot_encoder = OneHotEncoder()
geography_matrix = one_hot_encoder.fit_transform(data[['Geography']])  # double brackets select a 2-D DataFrame

# Convert the encoder output to a dense array so it can be printed and wrapped in a DataFrame
geography_encoder = geography_matrix.toarray()
print(geography_encoder)

[[1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [8]:
# Names of the output columns produced by the fitted encoder, using 'Geography' as the input feature name

one_hot_encoder.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [9]:
# Converting the encoded array to a dataframe

# Reuse data's index so that the later column-wise pd.concat aligns rows by label
# even if `data` no longer carries the default 0..n-1 index (a length mismatch
# between the array and the index raises immediately instead of producing NaNs).
geography_encoded_df = pd.DataFrame(
    geography_encoder,
    columns = one_hot_encoder.get_feature_names_out(['Geography']),
    index = data.index,
)
geography_encoded_df.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [10]:
# Swap the raw Geography column for its one-hot encoded columns:
# drop the original column, then append the encoded columns side by side.
data = pd.concat(
    [data.drop(columns="Geography"), geography_encoded_df],
    axis="columns",
)
data.head(4)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0


In [11]:
# Saving the fitted encoders for End to End prediction

# Make sure the output directory exists: open(..., 'wb') raises FileNotFoundError
# when 'models/' is missing (e.g. on a fresh checkout). exist_ok=True keeps
# re-runs of this cell harmless.
os.makedirs('models', exist_ok=True)

# Label encoder fitted on the Gender column
with open('models/gender_label_encoder.pkl', 'wb') as file:
    pickle.dump(label_encoder, file)

# One-hot encoder fitted on the Geography column
with open('models/geography_ohe.pkl', 'wb') as file:
    pickle.dump(one_hot_encoder, file)

### Train Test Split

In [12]:
# Separate the target column (Y) from the remaining feature columns (X)

target_column = 'Exited'
Y = data[target_column]
X = data.drop(columns=target_column)

In [13]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

`Feature Scaling` is a technique for standardizing the independent features in the data so that they share a comparable range. Without feature scaling, a machine learning algorithm tends to give more weight to features with larger numeric values and less weight to features with smaller values, regardless of the units in which they are measured.

In [14]:
# Standardize the features

scaler = StandardScaler().fit(X_train)  # scaling statistics are learned from the training split only
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)  # transform only: reuse the training statistics so no test information leaks into the scaler

In [15]:
# Persist the fitted scaler so the same scaling can be applied at prediction time

with open("models/standard_scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [16]:
# Persist the scaled feature splits and their targets (note: these file names carry no extension)

split_artifacts = {
    "models/X_train": X_train,
    "models/X_test": X_test,
    "models/Y_train": Y_train,
    "models/Y_test": Y_test,
}

# Pickle each split to its own file, in the order listed above
for artifact_path, artifact in split_artifacts.items():
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)