In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Introduction to Preprocessing

This step comes after cleaning and EDA. 
- Categorical variables will need to be changed. 
- Datatypes will have to be checked. 

We can drop nans or use encoders.

In [5]:
mock = pd.DataFrame({
    'A': [np.nan, 1, 2, 3, np.nan],
    'B': [1, 2, 3, 4, 5]
})

Unnamed: 0,A,B
0,,1
1,1.0,2
2,2.0,3
3,3.0,4
4,,5


In [6]:
# Drop Nan
mock.dropna(axis = 1)

Unnamed: 0,B
0,1
1,2
2,3
3,4
4,5


In [8]:
# Counting nans
mock.isna().sum()

A    2
B    0
dtype: int64

In [7]:
# Drop Column with greater than 2 nans
mock.dropna(axis = 1, thresh = 2)

Unnamed: 0,A,B
0,,1
1,1.0,2
2,2.0,3
3,3.0,4
4,,5


# Standardizing Data

If there's lots of variance or various sized data we'll have to standardize it. If we use a model that involves linear distances we need to standardize.

In [12]:
wine_data = datasets.load_wine()

In [13]:
X = wine_data.data
y = wine_data.target

In [17]:
knn = KNeighborsClassifier()

## No Scaling

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)


0.7555555555555555

## Log Normalisation

Good to use when we want to maintain magnitude and use positive values. 

In [21]:
X_log = np.log(X)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_log, y)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.9777777777777777

## Scaling

In [23]:
scaler = StandardScaler()

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn.fit(X_train_scaled, y_train)
knn.score(X_test_scaled, y_test)

0.9555555555555556

# Feature Engineering

Creation of new features based on old features. 

## Encoding

In [53]:
cat_data = pd.DataFrame({
    'A': ['A', 'B', 'A', 'C'],
    'B': ['A', 'C', 'C', 'B'],
    'C': ['B', 'B', 'C', 'A']
})

In [54]:
# Using pandas - will make too many new columns. 
pd.get_dummies(cat_data)

Unnamed: 0,A_A,A_B,A_C,B_A,B_B,B_C,C_A,C_B,C_C
0,1,0,0,1,0,0,0,1,0
1,0,1,0,0,0,1,0,1,0
2,1,0,0,0,0,1,0,0,1
3,0,0,1,0,1,0,1,0,0


In [56]:
le = LabelEncoder()
cat_data['A Transform'] = le.fit_transform(cat_data['A'])

In [57]:
cat_data

Unnamed: 0,A,B,C,A Transform
0,A,A,B,0
1,B,C,B,1
2,A,C,C,0
3,C,B,A,2


## Engineering Numerical Features

In [69]:
temps = pd.DataFrame({
    'City': ['A', 'B', 'C'],
    'Day 1': np.linspace(60, 65, 3),
    'Day 2': np.linspace(63, 70, 3),
    'Day 3': np.linspace(57, 64, 3)
})

In [70]:
temps

Unnamed: 0,City,Day 1,Day 2,Day 3
0,A,60.0,63.0,57.0
1,B,62.5,66.5,60.5
2,C,65.0,70.0,64.0


In [74]:
cols = ['Day 1', 'Day 2', 'Day 3']
temps['mean'] = temps.apply(lambda row: row[cols].mean(), axis = 1)

In [75]:
temps

Unnamed: 0,City,Day 1,Day 2,Day 3,mean
0,A,60.0,63.0,57.0,60.0
1,B,62.5,66.5,60.5,63.166667
2,C,65.0,70.0,64.0,66.333333
