# Introduction to Machine Learning with scikit-learn

## Loading a dataset


In [None]:
from sklearn.datasets import load_iris

# load_iris() returns a dict-like bundle; its entries are also reachable as
# attributes, which is what is used here.
dataset = load_iris()

# Feature matrix and the label vector that goes with it.
X, y = dataset.data, dataset.target

# Names for the feature columns and for the target classes.
feature_names, target_names = dataset.feature_names, dataset.target_names

## Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the full feature matrix and scale that same matrix in a
# single step. `scaler` stays fitted and is reused by later cells.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Dimensionality reduction

In [None]:
from sklearn.decomposition import PCA

# fit() returns the estimator itself, so construction and fitting can be
# chained; no n_components is given, so PCA uses its default.
pca = PCA().fit(X_scaled)

# Project the scaled data onto the fitted principal components.
X_pc = pca.transform(X_scaled)

## Training an estimator

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with its default hyperparameters.
knn = KNeighborsClassifier()

# Train on the PCA-projected features. Left as the cell's trailing
# expression so the notebook displays the fitted estimator.
knn.fit(X=X_pc, y=y)

## Testing an estimator

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hold out 20% of the samples for testing; stratify on the labels and fix
# the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=8,
)

# Refit the existing scaler and PCA objects on the training split only...
X_train_scaled = scaler.fit_transform(X_train)
X_train_pc = pca.fit_transform(X_train_scaled)

# ...and apply those already-fitted transforms to the test split (no refit).
X_test_scaled = scaler.transform(X_test)
X_test_pc = pca.transform(X_test_scaled)

# Retrain the classifier on the training projection, then score the
# held-out samples.
knn.fit(X_train_pc, y_train)
y_pred = knn.predict(X_test_pc)

print(classification_report(y_test, y_pred))


## Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid over the k-NN hyperparameters:
# 10 * 2 * 4 * 9 * 3 = 2160 candidate settings.
# NOTE(review): `algorithm` and `leaf_size` presumably change only how the
# neighbour search is carried out, not which neighbours are found -- confirm
# before trimming them from the grid.
param_grid = {
    'n_neighbors': list(range(1, 11)),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': list(range(10, 100, 10)),
    'p': [1, 2, 3],
}

# n_jobs=-1 runs the fits in parallel on all available cores; verbose=1
# prints progress.
search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    verbose=1,
    n_jobs=-1,
)
search.fit(X_train_pc, y_train)

# Trailing expression: the notebook displays the winning parameter set.
search.best_params_

In [None]:
# Score the held-out test projection with the grid search's predict().
tuned_pred = search.predict(X_test_pc)
print(classification_report(y_test, tuned_pred))

## Putting it all together

In [None]:
from sklearn.pipeline import Pipeline

# Chain scaling -> PCA -> k-NN into one estimator. The step names below are
# the prefixes used in the parameter grid ("<step>__<parameter>").
steps = [
    ('scaling', StandardScaler()),
    ('pca', PCA()),
    ('knn', KNeighborsClassifier()),
]
pipeline = Pipeline(steps=steps)

# Grid over every stage of the pipeline:
# 2 * 2 * 3 * 2 * 10 * 2 * 4 * 9 * 3 = 51840 candidate settings.
param_grid = {
    'scaling__with_mean': [True, False],
    'scaling__with_std': [True, False],
    'pca__n_components': [1, 2, 3],
    'pca__whiten': [True, False],
    'knn__n_neighbors': list(range(1, 11)),
    'knn__weights': ['uniform', 'distance'],
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'knn__leaf_size': list(range(10, 100, 10)),
    'knn__p': [1, 2, 3],
}

find_best = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    verbose=1,
    n_jobs=-1,
)

# The search is given the raw (unscaled) training features; scaling and PCA
# happen inside the pipeline. Left as the trailing expression so the notebook
# displays the fitted search object.
find_best.fit(X_train, y_train)

In [None]:
import pandas as pd

# Tabulate the cross-validation results, one row per candidate setting.
cv_results = pd.DataFrame(find_best.cv_results_)

# Trailing expression: the notebook displays the table ordered by rank.
cv_results.sort_values(by='rank_test_score')

In [None]:
# Score the raw held-out features with the pipeline grid search's predict().
pipeline_pred = find_best.predict(X_test)
print(classification_report(y_test, pipeline_pred))