Standardization (also known as z-score normalization) centers the data by subtracting the mean from every value, then divides the result by the standard deviation: $z = (x - \mu) / \sigma$. After this transformation, the data set has a mean of zero and a standard deviation of one.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Starbucks customers CSV and keep the `age` column as a Series.
coffee = pd.read_csv('../data/starbucks_customers.csv')
ages = coffee.loc[:, 'age']
# Preview the first five ages.
ages.head(5)

0    52
1    35
2    29
3    28
4    28
Name: age, dtype: int64

In [3]:
# Summary statistics for the raw ages, cast to integers for a compact display.
age_summary = ages.describe()
age_summary.astype(int)

count    122
mean      27
std        9
min       13
25%       22
50%       26
75%       29
max       70
Name: age, dtype: int32

In [13]:
# Manual z-score: subtract the mean, then divide by the standard deviation.
# Note: Series.std() uses the sample standard deviation (ddof=1) by default,
# so these z-scores differ slightly from ones computed with the population
# standard deviation (ddof=0), e.g. by scikit-learn's StandardScaler.
age_standardized = ages.sub(ages.mean()).div(ages.std())

In [14]:
# Coarse sanity check that the standardized ages have mean ~0 and std ~1.
# The integer cast truncates toward zero, so fractional statistics (such as
# the quartiles) are displayed as 0.
standardized_summary = age_standardized.describe()
standardized_summary.astype(int)

count    122
mean       0
std        1
min       -1
25%        0
50%        0
75%        0
max        4
Name: age, dtype: int32

# Standardizing our Data with Sklearn

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
# Create a StandardScaler with its default settings; it is fitted to the
# ages in a later cell.
scaler = StandardScaler()

In [17]:
# scikit-learn transformers expect a 2-D array (samples x features), so copy
# the 1-D ages into a NumPy array and add a trailing axis to form one column.
ages_reshape = np.array(ages)[:, np.newaxis]
# Preview the first five rows of the column vector.
ages_reshape[:5]

array([[52],
       [35],
       [29],
       [28],
       [28]], dtype=int64)

In [18]:
# Learn the mean and standard deviation from the ages, then apply them to
# the same data to obtain the standardized values.
scaler.fit(ages_reshape)
ages_scaled = scaler.transform(ages_reshape)
# Preview the first five standardized ages.
ages_scaled[:5]

array([[2.6159116 ],
       [0.81285389],
       [0.17648058],
       [0.07041836],
       [0.07041836]])

In [19]:
# Verify the result: the mean should be zero and the standard deviation one,
# up to floating-point rounding. ndarray.std() uses the population formula
# (ddof=0) by default.
scaled_mean = ages_scaled.mean()
scaled_std = ages_scaled.std()
print('Mean:', scaled_mean)
print('Standard deviation:', scaled_std)

Mean: 1.7290358580227847e-16
Standard deviation: 0.9999999999999999
