# Data Pre-processing

## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the dataset

In [2]:
df = pd.read_csv("Data/Data.csv")

In [3]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


## Taking care of missing data

In [4]:
# Step 1: Is there any missing data?
# 以下两种方法会同时识别 NaN 和 None 作为缺失值

In [5]:
# Method 1.1: print info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [6]:
# Method 1.2: print number of null valyes
missing_values = df.isnull().sum()
print(missing_values)

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


In [7]:
# Step 2: Fill the missing data

In [9]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

imputer.fit(df.iloc[:, 1:3])          # Apply on all numerical columns
df.iloc[:, 1:3] = imputer.transform(df.iloc[:, 1:3])        # Replace

#### Notes:
- df[:, 1:3], 当df是一个numpy数组
- df.iloc[:, 1:3], 当df是一个dataframe

In [11]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes


## Encoding Categorical and Binary data

In [12]:
# Step 1: 自动识别binary columns和categorical columns
binary_columns = []
categorical_columns = []

for col in df.columns:
    if df[col].dtype == 'object' or df[col].dtype.name == 'category':
        if df[col].nunique() == 2:
            binary_columns.append(col)
        elif df[col].nunique() > 2:
            categorical_columns.append(col)

print("Binary Columns:", binary_columns)
print("Categorical Columns:", categorical_columns)

Binary Columns: ['Purchased']
Categorical Columns: ['Country']


In [13]:
# Step 2: categorical columns - One-hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_columns)],
    remainder='passthrough'
)

df_encoded = pd.DataFrame(ct.fit_transform(df))

In [14]:
# Step 3: Binary columns - Binary columns: Label encoding, 仅适用于单列binary的情况
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_encoded.iloc[:, -1] = le.fit_transform(df_encoded.iloc[:, -1])

In [None]:
# # Step 3: Binary columns - Binary columns: Binary columns: Ordinal Encoder, 可用在多列binary columns的情况
# from sklearn.preprocessing import OrdinalEncoder
# 
# ct = ColumnTransformer(
#     transformers=[('ordinal_encoder', OrdinalEncoder(), binary_columns)],
#     remainder='passthrough'
# )
# 
# # 将 DataFrame 转换为 NumPy 数组并进行编码
# df_encoded = np.array(ct.fit_transform(df_encoded))

In [15]:
df_encoded.head()

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.0,0.0,44.0,72000.0,0
1,0.0,0.0,1.0,27.0,48000.0,1
2,0.0,1.0,0.0,30.0,54000.0,0
3,0.0,0.0,1.0,38.0,61000.0,0
4,0.0,1.0,0.0,40.0,63777.777778,1


## X and y values

In [16]:
X = df_encoded.iloc[:, :-1].values
y = df_encoded.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

#### Note:
- random_state=27 和 random_state=0 的作用与 random_state=1 是一样的，都是为了控制随机性，使得数据分割结果是确定的。
- 唯一的区别在于，不同的 random_state 值会导致不同的分割结果。即 random_state=27 分割出的训练集和测试集会与 random_state=1 或 random_state=0 的分割结果不同。


## Feature Scaling

In [None]:
# # Feature Scaling
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
# X_test[:, 3:] = sc.transform(X_test[:, 3:])
# print(X_train)
# print(X_test)

#### Note:

#### Always do feature scaling after train/test data set split

Reason: the test set is something you're not supposed to work with the training set. 

The "test set" needs to be a "brand new" dataset.
Otherwise the test set will grab information from the training set (the mean and std dev will be of the whole dataset, instead of training/testing seperated), which it is not supposed to. This is called "information leakage".


** Also, not all ML model requires feature scaling, only some do.



#### Standardisation vs. Normalization
- Standardization: works well for all cases, always improve the training process
- Normalization: works well when it's close to normal distribution for most of your features
** Therefore, standardization is always preferred.


#### Do we need to apply feature scaling for dummy variables?

No. Standardization will make it worse.
Already in the range of -3 to 3.
0 and 1 are easier to tell what category the record belongs to.