# Feature Scaling

In [1]:
# Basic imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Build a synthetic bank-churn style DataFrame to practise feature scaling on.
# (Missing values are injected in the next cell.)

N_ROWS = 10_000  # number of synthetic customers
SEED = 42        # fixed seed so Restart & Run All reproduces the same data

# Seed NumPy's global generator before any random draw below.
np.random.seed(SEED)

# Note: np.random.randint excludes the upper bound, so e.g. CreditScore is
# drawn from 300..849 and Age from 18..79.
data = {
    'RowNumber': range(1, N_ROWS + 1),
    'CustomerId': range(100000, 100000 + N_ROWS),
    'Surname': ['Surname'] * N_ROWS,
    'CreditScore': np.random.randint(300, 850, size=N_ROWS),
    'Geography': np.random.choice(['France', 'Spain', 'Germany'], size=N_ROWS),
    'Gender': np.random.choice(['Male', 'Female'], size=N_ROWS),
    'Age': np.random.randint(18, 80, size=N_ROWS),
    'Tenure': np.random.randint(0, 10, size=N_ROWS),
    'Balance': np.random.uniform(0, 250000, size=N_ROWS),
    'NumOfProducts': np.random.randint(1, 4, size=N_ROWS),
    'HasCrCard': np.random.randint(0, 2, size=N_ROWS),
    'IsActiveMember': np.random.randint(0, 2, size=N_ROWS),
    'EstimatedSalary': np.random.uniform(10000, 150000, size=N_ROWS),
    'Exited': np.random.randint(0, 2, size=N_ROWS)
}

df = pd.DataFrame(data)


In [4]:
# Inject missing values for practice. A locally seeded generator makes the
# chosen rows reproducible, so re-running this cell blanks out the same rows
# instead of adding new ones each time.
rng = np.random.default_rng(42)

# Introduce missing values in 'Gender'
missing_gender_indices = rng.choice(df.index.to_numpy(), size=54, replace=False)
df.loc[missing_gender_indices, 'Gender'] = np.nan

# Introduce missing values in 'Age'
# 'Age' is created from integers; cast to float first so that assigning NaN
# does not rely on pandas' deprecated silent int -> float upcast
# (pandas >= 2.1 emits a FutureWarning for it).
df['Age'] = df['Age'].astype('float64')
missing_age_indices = rng.choice(df.index.to_numpy(), size=300, replace=False)
df.loc[missing_age_indices, 'Age'] = np.nan


In [5]:
# Preview the first five rows; NaN entries are the missing values injected above.
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,100000,Surname,602,Spain,Male,42.0,4,70180.747932,3,0,1,124298.826495,1
1,2,100001,Surname,407,Germany,Male,35.0,5,191370.262927,3,1,1,70191.18292,0
2,3,100002,Surname,797,Germany,Male,,3,50321.843904,2,1,1,87849.574441,1
3,4,100003,Surname,697,Spain,Female,74.0,6,235903.227934,2,1,1,141794.803799,0
4,5,100004,Surname,719,Spain,Female,19.0,4,201399.886198,1,1,1,58252.025973,1


In [8]:
# Column dtypes and non-null counts: any count below the total row count
# marks a column that contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           9946 non-null   object 
 6   Age              9700 non-null   float64
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


### **Gender**: has 54 missing values
### **Age**: has 300 missing values

# Feature Scaling Methods:

## Normalization:

Normalization is a scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1. It is also known as **Min-Max scaling**.



<br>

## Standardization:

Standardization is another scaling technique where the values are centered around the mean with a unit standard deviation. This means that the mean of the attribute becomes zero and the resultant distribution has a unit standard deviation.

In [22]:
# Feature Scaling Methods:

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [13]:
# Summary statistics (rounded to 2 decimals). The numeric columns span very
# different ranges, which is what motivates feature scaling.
df.describe().round(2)

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,9700.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,104999.5,571.96,48.51,4.47,125414.87,1.99,0.49,0.49,80357.19,0.5
std,2886.9,2886.9,159.2,18.05,2.87,72634.86,0.82,0.5,0.5,40559.63,0.5
min,1.0,100000.0,300.0,18.0,0.0,11.02,1.0,0.0,0.0,10002.6,0.0
25%,2500.75,102499.75,434.0,32.0,2.0,62780.35,1.0,0.0,0.0,45167.6,0.0
50%,5000.5,104999.5,571.0,49.0,4.0,125707.24,2.0,0.0,0.0,80577.08,0.0
75%,7500.25,107499.25,708.0,64.0,7.0,188730.76,3.0,1.0,1.0,115357.63,1.0
max,10000.0,109999.0,849.0,79.0,9.0,249985.45,3.0,1.0,1.0,149993.1,1.0


# Task 1: Normalization

In [14]:
# Look at the raw frame again before picking the columns to scale.
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,100000,Surname,602,Spain,Male,42.0,4,70180.747932,3,0,1,124298.826495,1
1,2,100001,Surname,407,Germany,Male,35.0,5,191370.262927,3,1,1,70191.18292,0
2,3,100002,Surname,797,Germany,Male,,3,50321.843904,2,1,1,87849.574441,1
3,4,100003,Surname,697,Spain,Female,74.0,6,235903.227934,2,1,1,141794.803799,0
4,5,100004,Surname,719,Spain,Female,19.0,4,201399.886198,1,1,1,58252.025973,1


In [15]:
# Work on an independent copy of just the two numeric columns to be scaled.
# Label-based selection raises a KeyError on a misspelled column name, whereas
# pd.DataFrame(df, columns=[...]) would silently create an all-NaN column.
new_df = df[["Age", "Tenure"]].copy()

In [16]:
# Preview the selected columns; missing ages are still NaN at this point.
new_df.head()

Unnamed: 0,Age,Tenure
0,42.0,4
1,35.0,5
2,,3
3,74.0,6
4,19.0,4


In [17]:
# Replace the missing ages with the column mean
# (Series.mean skips NaN values by default).
mean_age = new_df['Age'].mean()
new_df['Age'] = new_df['Age'].fillna(mean_age)

In [18]:
# Confirm the imputation: 'Age' should now report as many non-null values as 'Tenure'.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     10000 non-null  float64
 1   Tenure  10000 non-null  int64  
dtypes: float64(1), int64(1)
memory usage: 156.4 KB


<br>
<br>

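### `scaler.fit_transform(new_df)`:

- With `MinMaxScaler`, this learns each column's minimum and maximum and then rescales every value as $(x - x_{min}) / (x_{max} - x_{min})$, so each column ends up in the range 0 to 1. (The **z-score** is what `StandardScaler` computes; see the Standardization section below.)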

In [19]:
# Min-max scale both columns. The result is a NumPy array (despite the `_df`
# suffix), which is why it prints as a bare 2-D array.
scaler = MinMaxScaler()
normalized_df = scaler.fit(new_df).transform(new_df)
print(normalized_df)

[[0.39344262 0.44444444]
 [0.27868852 0.55555556]
 [0.50010309 0.33333333]
 ...
 [0.37704918 1.        ]
 [0.86885246 0.44444444]
 [0.         0.77777778]]


## Let's Take an example

In [20]:
# Tiny worked example: one feature, five samples.
# The minimum (2) maps to 0, the maximum (6) maps to 1,
# and e.g. 3 -> (3 - 2) / (6 - 2) = 0.25.
# NOTE: this rebinds `scaler` and `normalized_df` from the previous cell.
x_array = np.array([2, 3, 5, 6, 6]).reshape(-1, 1)

scaler = MinMaxScaler()
normalized_df = scaler.fit(x_array).transform(x_array)
print(normalized_df)

[[0.  ]
 [0.25]
 [0.75]
 [1.  ]
 [1.  ]]


<br>

## Standardization



In [24]:
# Standardize both columns (z-score): each column is centred on its mean and
# divided by its standard deviation. Rows whose Age was mean-imputed therefore
# come out at approximately 0 in the first column.
scaler = StandardScaler()
standardized_df = scaler.fit(new_df).transform(new_df)
print(standardized_df)

[[-3.66078318e-01 -1.63735463e-01]
 [-7.59935456e-01  1.85231849e-01]
 [ 3.99789040e-16 -5.12702776e-01]
 ...
 [-4.22343624e-01  1.58110110e+00]
 [ 1.26561554e+00 -1.63735463e-01]
 [-1.71644565e+00  8.83166474e-01]]


## Let's take another small example

In [25]:

# Same five-sample example, now standardized as (x - mean) / std,
# with mean = 4.4 and population std (ddof=0) = sqrt(2.64) ~ 1.625,
# e.g. 2 -> (2 - 4.4) / 1.625 ~ -1.477.
x_array = np.array([2, 3, 5, 6, 6]).reshape(-1, 1)

scaler = StandardScaler()
normalized_arr_ss = scaler.fit(x_array).transform(x_array)
print(normalized_arr_ss)


[[-1.47709789]
 [-0.86164044]
 [ 0.36927447]
 [ 0.98473193]
 [ 0.98473193]]
