### 1. Import Dependencies

In [34]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

### Encoding Types and Their Preferred Use

| Variable Type | Preferred Encoding | Why? |
|----------------|--------------------|------|
| **Nominal** | One-Hot Encoding | No inherent order → avoids implying false ordinal relationships |
| **Ordinal** | Label Encoding | Preserves order → small integers represent increasing levels |


Gender <Male, Female>

        Gender_Male, Gender_Female
Male    [1                  0]
Female  [0                  1]


(Assume Gender is ordinal);
    Male -> 1
    Female -> 0

Gender -> Nominal
Geography -> Nominal
CreditScoreBins -> Ordinary

In [35]:
df = pd.read_csv('data/processed/CEHHbInToW_Binning_Applied.csv')
df.head(10)

Unnamed: 0,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins
0,France,Female,42.0,2,0.0,1,1,1,101348.88,1,Fair
1,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,0,Fair
2,France,Female,42.0,8,159660.8,3,1,0,113931.57,1,Poor
3,France,Female,38.91,1,0.0,2,0,0,93826.63,0,Good
4,Spain,Female,43.0,2,125510.82,1,1,1,79084.1,0,
5,Spain,Male,44.0,8,113755.78,2,1,0,149756.71,1,Fair
6,France,Male,50.0,7,0.0,2,1,1,10062.8,0,Excellent
7,Germany,Female,29.0,4,115046.74,4,1,0,119346.88,1,Poor
8,France,Male,44.0,4,142051.07,2,0,1,74940.5,0,Poor
9,France,Male,27.0,2,134603.88,1,1,1,71725.73,0,Good


### 2. Encode Nominal Variables

In [36]:
nominal_variable = ['Geography','Gender']

geography_dummies = pd.get_dummies(df['Geography'],prefix = "Geography")
gender_dummies = pd.get_dummies(df['Gender'],prefix = "Gender")


df_encoded = pd.concat([df,geography_dummies],axis =1 )
del df_encoded['Geography']

df_encoded = pd.concat([df,gender_dummies],axis =1 )
del df_encoded['Gender']


df_encoded.head(10)

Unnamed: 0,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins,Gender_Female,Gender_Male
0,France,42.0,2,0.0,1,1,1,101348.88,1,Fair,True,False
1,Spain,41.0,1,83807.86,1,0,1,112542.58,0,Fair,True,False
2,France,42.0,8,159660.8,3,1,0,113931.57,1,Poor,True,False
3,France,38.91,1,0.0,2,0,0,93826.63,0,Good,True,False
4,Spain,43.0,2,125510.82,1,1,1,79084.1,0,,True,False
5,Spain,44.0,8,113755.78,2,1,0,149756.71,1,Fair,False,True
6,France,50.0,7,0.0,2,1,1,10062.8,0,Excellent,False,True
7,Germany,29.0,4,115046.74,4,1,0,119346.88,1,Poor,True,False
8,France,44.0,4,142051.07,2,0,1,74940.5,0,Poor,False,True
9,France,27.0,2,134603.88,1,1,1,71725.73,0,Good,False,True


### 3. Encode Ordinal variable

In [37]:
encode_dict_creditscore = {
                            'Poor' : 0,
                            'Fair' : 1,
                            'Good' : 2,
                            'Very Good' : 3,
                            'Excellent': 4
                            }
df_encoded['CreditScoreBins'] = df_encoded['CreditScoreBins'].map(encode_dict_creditscore)
df_encoded.head(10)

Unnamed: 0,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins,Gender_Female,Gender_Male
0,France,42.0,2,0.0,1,1,1,101348.88,1,1.0,True,False
1,Spain,41.0,1,83807.86,1,0,1,112542.58,0,1.0,True,False
2,France,42.0,8,159660.8,3,1,0,113931.57,1,0.0,True,False
3,France,38.91,1,0.0,2,0,0,93826.63,0,2.0,True,False
4,Spain,43.0,2,125510.82,1,1,1,79084.1,0,,True,False
5,Spain,44.0,8,113755.78,2,1,0,149756.71,1,1.0,False,True
6,France,50.0,7,0.0,2,1,1,10062.8,0,4.0,False,True
7,Germany,29.0,4,115046.74,4,1,0,119346.88,1,0.0,True,False
8,France,44.0,4,142051.07,2,0,1,74940.5,0,0.0,False,True
9,France,27.0,2,134603.88,1,1,1,71725.73,0,2.0,False,True


In [45]:
df.to_csv('data/processed/CEHHbInToW_Encoded.csv',index=False)

In [38]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [39]:
df['Geography'].values.reshape(1000, 10)


array([['France', 'Spain', 'France', ..., 'Germany', 'France', 'France'],
       ['France', 'Spain', 'France', ..., 'Spain', 'Spain', 'France'],
       ['France', 'Spain', 'Spain', ..., 'France', 'Germany', 'France'],
       ...,
       ['France', 'France', 'France', ..., 'France', 'France', 'France'],
       ['Spain', 'Germany', 'Germany', ..., 'Spain', 'France', 'Spain'],
       ['Germany', 'France', 'Spain', ..., 'France', 'Germany', 'France']],
      shape=(1000, 10), dtype=object)

In [40]:
df = pd.read_csv('data/processed/CEHHbInToW_Binning_Applied.csv')
df.head(10)

ohe_geography = OneHotEncoder()
ohe_gender = OneHotEncoder()

le_credit_score = LabelEncoder()

ohe_geography.fit(df['Geography'].values.reshape(1000, 10))
ohe_gender.fit(df['Gender'].values.reshape(1000, 10))

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [42]:
geography_ohe =  ohe_geography.transform(df['Geography'].values.reshape(1000, 10))
geography_ohe = geography_ohe.toarray()
geography_ohe

array([[1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 1., 0., 0.]], shape=(1000, 30))

In [44]:
gender_ohe = ohe_gender.transform(df['Gender'].values.reshape(10000, 10))
gender_ohe = gender_ohe.toarray()
gender_ohe

ValueError: cannot reshape array of size 10000 into shape (10000,10)