# Data Preprocessing

This lab covers normalization, cross validation, regularization and dimension reduction.

In [None]:
# Load the Iris dataset and hold out a test split
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# return_X_y=True returns the (features, labels) pair instead of a Bunch object.
X, y = load_iris(return_X_y=True)

# A fixed random_state keeps the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Preview the first three training rows.
X_train[:3]

### Normalization
Features are scaled to zero mean and unit variance:
$$z = \frac{x-\mu}{\sigma}$$
where $\mu$ and $\sigma$ are the mean and standard deviation of each feature.

In [None]:
# Standardize features
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split only,
# so the same fitted scaler can later be applied to held-out data.
sc = StandardScaler()
sc.fit(X_train)

# Apply the learned scaling to the training features.
X_scaled = sc.transform(X_train)

# Preview the first three standardized rows.
X_scaled[:3]

### Cross Validation
We use cross-validation to estimate generalization performance.

In [None]:
# Cross validation example
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

clf = LogisticRegression(max_iter=200)

# Scaling must happen inside each fold: a scaler fitted on the whole training set
# has already seen the rows that later act as the validation fold, which leaks
# information and makes the scores optimistic. Wrapping the scaler and the
# classifier in a pipeline lets cross_val_score re-fit the scaler on the training
# part of every fold. cross_val_score clones the estimator, so `clf` itself stays
# unfitted.
cv_model = make_pipeline(StandardScaler(), clf)

# Pass the unscaled training features; the pipeline does the standardization.
cv_scores = cross_val_score(cv_model, X_train, y_train, cv=5)
cv_scores

### Regularization
Penalize large coefficients to reduce overfitting.

In [None]:
# L2 regularization
# The Iris target holds class labels, so use the classifier variant of ridge.
# The plain Ridge regressor would treat the labels 0/1/2 as a continuous quantity
# and its .score() would report R^2 rather than classification accuracy.
from sklearn.linear_model import RidgeClassifier

# alpha is the strength of the L2 penalty on the coefficients (larger = stronger).
ridge = RidgeClassifier(alpha=1.0)
ridge.fit(X_scaled, y_train)

# Evaluate on the test split, scaled with the statistics learned from the
# training data; for a classifier, .score() is the mean accuracy.
ridge.score(sc.transform(X_test), y_test)

### Dimension Reduction
Principal Component Analysis (PCA) projects data to lower dimensions.

In [None]:
# PCA example
from sklearn.decomposition import PCA

# Request a two-component projection of the standardized training features.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Preview the first three projected samples.
X_pca[:3]