In [2]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [5]:
from sklearn.decomposition import KernelPCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Generate a nonlinear dataset
X, y = make_moons(n_samples=1000, noise=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build pipeline: kPCA -> Logistic Regression
kpca_logreg_pipeline = Pipeline([
    ('kpca', KernelPCA(n_components=2)),
    ('log_reg', LogisticRegression())
])

# Grid search for best kernel and gamma
param_grid = {
    'kpca__kernel': ['rbf', 'sigmoid', 'poly'],
    'kpca__gamma': [0.01, 0.1, 1, 10],
    'kpca__degree': [2, 3],  # only used for poly kernel
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(kpca_logreg_pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Test set performance
print("Test accuracy:", grid_search.score(X_test, y_test))


Best parameters: {'kpca__degree': 2, 'kpca__gamma': 10, 'kpca__kernel': 'rbf'}
Best cross-validation accuracy: 0.86875
Test accuracy: 0.865


## Question 9 

### Random Forest with 100 estimators on MNIST 

In [37]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784' , version = 1 , as_frame = False)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [38]:
X = mnist['data']
y = mnist['target']

In [39]:
X_train , X_test , y_train , y_test = X[:60000],X[60000:],y[:60000],y[60000:]

In [45]:
from sklearn.ensemble import RandomForestClassifier 
rnd_clf = RandomForestClassifier(n_estimators = 100 , random_state = 42 )


In [47]:
import time 
t0 = time.time()
rnd_clf.fit(X_train,y_train)
t1 = time.time()

In [55]:
print("Training Took",t1-t0,"s")

Training Took 23.84222412109375 s


In [59]:
from sklearn.metrics import accuracy_score
y_pred = rnd_clf.predict(X_test)
accuracy_score(y_pred , y_test)

0.9691

### Random Forest with 100 estimators on MNIST and PCA

In [95]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
X_train_reduced = pca.fit_transform(X_train)

In [97]:
rnd_clf2 = RandomForestClassifier(n_estimators=100, random_state=42)
t0 = time.time()
rnd_clf2.fit(X_train_reduced, y_train)
t1 = time.time()

In [98]:
print("Training took {:.2f}s".format(t1 - t0))

Training took 67.17s


In [103]:
X_test_reduced = pca.transform(X_test)

y_pred = rnd_clf2.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

0.9487

### Logistic Regression  on MNIST 

In [118]:
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression(multi_class="multinomial", solver="lbfgs", random_state=42)
t0 = time.time()
log_clf.fit(X_train, y_train)
t1 = time.time()

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [120]:
print("Time Taken" , t1-t0 , "s")

Time Taken 12.930645942687988 s


In [124]:
y_pred = log_clf.predict(X_test)
accuracy_score(y_pred , y_test)

0.9255

### Logistic Regression  on MNIST with PCA

In [130]:
log_clf2 = LogisticRegression(multi_class = "multinomial" , solver = "lbfgs" , random_state = 42)
t0 = time.time()
log_clf2.fit(X_train_reduced ,y_train)
t1 = time.time()

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [132]:
print("TIME TAKEN" , t1-t0 , "s")

TIME TAKEN 3.3431758880615234 s


In [136]:
y_pred = log_clf2.predict(X_test_reduced)
accuracy_score(y_pred , y_test
              )

0.9201

## Question 10 