In [26]:
import pandas as pd
import numpy as np

In [27]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

# Importing pickle for serializing and de-serializing Python object structures
import pickle

In [28]:
# Replace 'your_file.csv' with the actual file name
df = pd.read_csv('Churn_Modelling.csv')

In [29]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [30]:
# Display basic information about the dataframe
df_info = df.info()
df_info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [31]:
# Display summary statistics of the dataframe
df_description = df.describe()
df_description

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [32]:
df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [33]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the Gender column
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Display the first few rows to verify the encoding
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [34]:
df['Gender'].value_counts()

Gender
1    5457
0    4543
Name: count, dtype: int64

In [35]:
df['Geography'].value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

### One Hot Encoding

One hot encoding is a technique used to convert categorical data into a format that can be provided to machine learning algorithms to improve predictions. This technique is particularly useful when dealing with categorical variables that have no ordinal relationship.

In one hot encoding, each category is converted into a new binary column. Each column corresponds to one possible category, and the value is 1 if the original category matches the column, and 0 otherwise.

For example, consider a categorical variable "Geography" with three possible values: "France", "Spain", and "Germany". One hot encoding will convert this variable into three binary columns:

| Geography_France | Geography_Spain | Geography_Germany |
|------------------|-----------------|-------------------|
| 1                | 0               | 0                 |
| 0                | 1               | 0                 |
| 1                | 0               | 0                 |
| 0                | 0               | 1                 |

This transformation allows the machine learning algorithm to understand the categorical data without assuming any ordinal relationship between the categories.

In [36]:
one_hot_encoder = OneHotEncoder()

In [38]:
geo_encoded = one_hot_encoder.fit_transform(df['Geography'].values.reshape(-1, 1)).toarray()
"""
Encodes the 'Geography' column of the DataFrame using one-hot encoding.

The 'Geography' column is reshaped and transformed into a one-hot encoded array.

Note:
    Ensure that 'one_hot_encoder' is an instance of a one-hot encoder (e.g., OneHotEncoder from sklearn) 
    and 'df' is a pandas DataFrame containing the 'Geography' column.

Returns:
    numpy.ndarray: A 2D array with one-hot encoded values of the 'Geography' column.
"""

"\nEncodes the 'Geography' column of the DataFrame using one-hot encoding.\n\nThe 'Geography' column is reshaped and transformed into a one-hot encoded array.\n\nNote:\n    Ensure that 'one_hot_encoder' is an instance of a one-hot encoder (e.g., OneHotEncoder from sklearn) \n    and 'df' is a pandas DataFrame containing the 'Geography' column.\n\nReturns:\n    numpy.ndarray: A 2D array with one-hot encoded values of the 'Geography' column.\n"

In [39]:
geo_encoded

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [42]:
one_hot_encoder.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [43]:
geo_encoded_df = pd.DataFrame(geo_encoded, columns=one_hot_encoder.get_feature_names_out(['Geography']))
geo_encoded_df.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [44]:
geo_encoded_df.shape

(10000, 3)

In [45]:
df = pd.concat([df, geo_encoded_df], axis=1)

In [46]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,France,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,France,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [47]:
df.drop('Geography', axis=1, inplace=True)
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [48]:
df.dtypes

CreditScore            int64
Gender                 int64
Age                    int64
Tenure                 int64
Balance              float64
NumOfProducts          int64
HasCrCard              int64
IsActiveMember         int64
EstimatedSalary      float64
Exited                 int64
Geography_France     float64
Geography_Germany    float64
Geography_Spain      float64
dtype: object

In [50]:
# Save the LabelEncoder for Gender
with open('gender_label_encoder.pkl', 'wb') as file:
    pickle.dump(label_encoder, file)

# Save the OneHotEncoder for Geography
with open('geography_one_hot_encoder.pkl', 'wb') as file:
    pickle.dump(one_hot_encoder, file)