# Classification with the Iris dataset

In [None]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#allow plots to appear within the notebook
%matplotlib inline

In [None]:
#load the iris dataset bundled with scikit-learn and check what kind of object load_iris returns
iris = load_iris()
type(iris)

In [None]:
#feature values: one row per observation, one column per feature
iris.data

In [None]:
#names of the feature columns
iris.feature_names

In [None]:
#show integers representing the species of each observation (aka response)
iris.target

In [None]:
#species names; the integers in iris.target index into this array
iris.target_names

In [None]:
#check the types of the features and response
#1st they should be separate objects
#2nd they should always be numbers
#3rd they should be stored as numpy arrays
print(type(iris.data))
#the response should always be numeric, whether it is a classification or a regression problem
print(type(iris.target))

In [None]:
#check the shape of the features (first dimension=number of observations, second dimension=number of features)
iris.data.shape

In [None]:
#check the shape of the response (single dimension matches the number of observations)
iris.target.shape

In [None]:
#store feature matrix in X (uppercase X for a matrix)
X = iris.data
#store response vector in y (lowercase y for a vector)
y = iris.target

In [None]:
#verify that X and y have the appropriate shapes (same number of observations in both)
print(X.shape)
print(y.shape)

## KNN (K=1)

In [None]:
#instantiate the estimator with K=1; every other parameter keeps its default value
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
#print the estimator to see the parameters it was created with
print(knn)

In [None]:
#fit the model with data (here the whole dataset)
knn.fit(X,y)

In [None]:
#now the model can predict a new observation.
#option 1: pass the data as a nested list, which will be interpreted as having shape (1, 4)
knn.predict([[3, 5, 4, 2]])

In [None]:
#option 2: explicitly change the shape to be (1, 4)
knn.predict(np.reshape([3, 5, 4, 2],[1, 4]))

In [None]:
#option 3: explicitly change the first dimension to be 1, let NumPy infer that the second dimension should be 4
knn.predict(np.reshape([3, 5, 4, 2],[1, -1]))

In [None]:
#look up the species name for class label 2 (presumably the label predicted above - confirm from the output)
iris.target_names[2]

In [None]:
#predict more than one observation at once: each inner list is one observation
X_new = [[3, 5, 4, 2], [1, 2, 1, 2]]
knn.predict(X_new)

In [None]:
#index target_names with the predicted labels to get the species names directly
iris.target_names[knn.predict(X_new)]

## KNN (K=5)

In [None]:
#instantiate the model with n_neighbors=5 (K=5)
knn = KNeighborsClassifier(n_neighbors = 5)

In [None]:
#fit on the whole dataset, replacing the K=1 model held in knn
knn.fit(X,y)

In [None]:
#species names predicted by the K=5 model for the same two observations in X_new
iris.target_names[knn.predict(X_new)]

## Logistic Regression

In [None]:
#using Logistic Regression, which is another model used for classification (despite its name)

In [None]:
#NOTE(review): newer scikit-learn releases deprecate the multi_class argument - confirm against the installed version
#fit on the whole dataset, then predict the class labels of the two observations in X_new
logreg = LogisticRegression(multi_class = 'auto', solver = 'liblinear')
logreg.fit(X, y)
logreg.predict(X_new)

## Evaluation Procedure: 1. Training accuracy

In [None]:
#FOR THE LOGISTIC REGRESSION
#train and test on the entire dataset
#predict the response values for the observations in X (the same data the model was fitted on)
logreg.predict(X)

In [None]:
#store the predicted values
y_pred = logreg.predict(X)
#check length: one prediction per observation
len(y_pred)

In [None]:
#compute classification accuracy for the model

In [None]:
#96% of the predictions are correct. This is the training accuracy
#accuracy_score takes the true labels first and the predicted labels second
metrics.accuracy_score(y, y_pred)

In [None]:
#FOR THE KNN(K=5)
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X, y)
y_pred = knn.predict(X)
#slightly better with 96.7%
metrics.accuracy_score(y, y_pred)

In [None]:
#FOR THE KNN(K=1)
knn = KNeighborsClassifier(n_neighbors = 1)
knn.fit(X, y)
y_pred = knn.predict(X)
#we have 100% accuracy for K=1 because KNN has memorized the training set and it's finding the exact same observation
#a very low value of K creates a high-complexity model which overfits the training data!
metrics.accuracy_score(y, y_pred)

## Evaluation Procedure: 2. Train/Test split

In [None]:
#hold out 40% of the observations for testing (test_size = 0.4) and train on the rest
#random_state = 4 fixes the shuffle so the dataset is split the exact same way every time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 4)

In [None]:
#check the shapes of the four objects produced by the split, one per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
#FOR LOGISTIC REGRESSION
#instantiating the model and fitting it on the training portion only
logreg = LogisticRegression(multi_class = 'auto', solver = 'liblinear')
logreg.fit(X_train, y_train)

In [None]:
#making predictions on the held-out test portion
y_pred = logreg.predict(X_test)
#compare the actual response values (y_test) with the predicted ones (y_pred)
metrics.accuracy_score(y_test, y_pred)

In [None]:
#FOR KNN(K=5)
#same procedure: fit on the training set, score on the test set
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

In [None]:
#FOR KNN(K=1)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
metrics.accuracy_score(y_test, y_pred)
#Out of the above scores we would say that KNN(K=5) is the best model to use for this dataset

### Making a loop to find out if other Ks perform better

In [None]:
#testing accuracy of KNN on the held-out split for every K from 1 to 25
scores = []
k_range = range(1, 26)
for num in k_range:
    #fit returns the estimator itself, so knn is the fitted model for this K
    knn = KNeighborsClassifier(n_neighbors=num).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores += [metrics.accuracy_score(y_test, y_pred)]

In [None]:
#plot the testing accuracy collected in scores against the K values in k_range
plt.plot(k_range, scores)
plt.xlabel('K value')
plt.ylabel('Accuracy metrics')
#Remember that testing accuracy penalizes models that are too complex or not complex enough! Typical example here

### Making predictions on out-of-sample data

In [None]:
#retrain with the chosen K=11 on ALL the data (not just the training split) before predicting a new observation
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X, y)
knn.predict([[3, 5, 4, 2]])

In [None]:
'''Because the train/test split evaluation procedure has the downside of different results every time another part of the
dataset is being used, K-fold cross-validation can come to the rescue. This above procedure though, is flexible and 
quick'''

## Evaluation Procedure: 3. Cross-Validation

In [None]:
#10-fold cross validation with K=5 for KNN
#cross_val_score is given the unfitted estimator together with the full X and y
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
#one accuracy value per fold
print(scores)

In [None]:
#use average accuracy as an estimate of out-of-sample accuracy
print(scores.mean())

In [None]:
#search again for an optimal K, this time scoring each K from 1 to 30 by its mean 10-fold accuracy
k_scores = []
k_range = range(1, 31)
for num in k_range:
    knn = KNeighborsClassifier(n_neighbors=num)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    #keep only the average over the folds for this K
    k_scores += [np.mean(scores)]
print(k_scores)

In [None]:
#visualize the mean cross-validated accuracy (k_scores) for each K in k_range
plt.plot(k_range, k_scores, color='green', marker='o', markersize=8)
plt.xlabel('Value of K')
plt.ylabel('Cross-validated accuracy')

### Compare the models' accuracy

In [None]:
#Higher values of K produce lower-complexity models, so among similarly scoring values a higher K such as K=20 is the better choice
#Compare the mean cross-validated accuracy of KNN and Logistic Regression to choose which model fits this dataset better
knn = KNeighborsClassifier(n_neighbors=20)
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())

In [None]:
#same 10-fold procedure and metric for logistic regression, so the two means are directly comparable
logreg = LogisticRegression(multi_class = 'auto', solver = 'liblinear')
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())

### Parameter tuning with cross-validation 
The K parameter

In [None]:
#GridSearchCV replaces the manual for loop and provides additional functionality
#list the candidate values (K = 1..30) and map the parameter name to them in a parameter grid
k_range = [*range(1, 31)]
param_grid = {'n_neighbors': k_range}
print(param_grid)

In [None]:
#instantiate the grid: 10-fold cross-validated accuracy for every value in param_grid
#pass a fresh estimator instead of whatever an earlier cell left in the global knn;
#the grid sets n_neighbors for every candidate, so the scores do not change, but the cell
#no longer depends on hidden notebook state or on the order cells were run in
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring='accuracy')

In [None]:
#fit the grid with data: every parameter setting in the grid is cross-validated on X, y
grid.fit(X, y)

In [None]:
#convert the search results into a pandas DataFrame
results = pd.DataFrame(grid.cv_results_)

In [None]:
#view the mean and standard deviation of the test scores for each set of parameters
#while we pay attention to the mean, we must also check whether the std is high,
#because a high std means the cross-validated estimate of the accuracy may not be very reliable
results[['mean_test_score', 'std_test_score', 'params']]

In [None]:
#how to examine the results of a single set of parameters, e.g. the first one (row 0)
print(results['params'][0])
print(results['std_test_score'][0])
print(results['mean_test_score'][0])

In [None]:
#plot the results for a better view
#presumably the rows of results follow the order of k_range, so the two line up - confirm
plt.plot(k_range, results['mean_test_score'], color='green', marker='o')
plt.xlabel('Values of K for KNN')
plt.ylabel('Cross validated accuracy')

In [None]:
#examine the best model: its mean cross-validated score, its parameters and the estimator itself
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

### Parameter tuning with cross-validation
The weights parameter

In [None]:
#the default value for weights is 'uniform' for KNN but it can be changed to 'distance'
#define the parameter values that should be searched: K from 1 to 30 and both weighting schemes
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']

In [None]:
#build the parameter grid as before, now with two parameters:
#every K in k_range will be combined with every option in weight_options
param_grid = {'n_neighbors': k_range, 'weights': weight_options}
print(param_grid)

In [None]:
#instantiate and fit the grid over both n_neighbors and weights
#a fresh estimator is used rather than the global knn left over from an earlier cell:
#the grid overrides both searched parameters, so the scores are the same, but the cell
#no longer depends on which cells were run before it
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)

In [None]:
#view the results of the two-parameter search as a DataFrame
results = pd.DataFrame(grid.cv_results_)
print(results)

In [None]:
#examine the best model: best mean cross-validated score and the parameters that achieved it
print(grid.best_score_)
print(grid.best_params_)

Train the model with the best parameters

In [None]:
#train on all the data with the parameters the grid search selected
#best_params_ is unpacked instead of hard-coding the values, so this cell stays in sync
#with the search if the parameter grid, the data or the library version changes
knn = KNeighborsClassifier(**grid.best_params_)
knn.fit(X, y)
knn.predict([[3, 5, 4, 2]])

In [None]:
#This shortcut can be used: by default GridSearchCV refits the best model on all the data,
#so the fitted grid can make predictions directly
grid.predict([[3, 5, 4, 2]])

In [None]:
#species name for class label 1 (presumably the label predicted above - confirm from the output)
iris.target_names[1]

### RandomizedSearchCV
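When there are too many parameter combinations to try, an exhaustive grid search becomes computationally expensive. RandomizedSearchCV addresses this by evaluating only a fixed number of randomly sampled combinations.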

In [None]:
#Specify parameter distributions rather than a parameter grid
#A continuous parameter should be given a continuous distribution instead of a list of values;
#both parameters here are discrete, so the same lists as before are used
param_dist = {'n_neighbors': k_range, 'weights': weight_options}

In [None]:
#n_iter controls how many parameter combinations are sampled and evaluated (10 here)
#random_state fixes the sampling so the search is reproducible
#a fresh estimator is passed instead of the global knn from an earlier cell: both of its
#tuned parameters are overridden by param_dist anyway, and the cell no longer depends
#on hidden notebook state
rand = RandomizedSearchCV(KNeighborsClassifier(), param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5)
rand.fit(X, y)

In [None]:
#one row per sampled parameter combination; show only the first rows
results = pd.DataFrame(rand.cv_results_)
print(results.head())

In [None]:
#the best score is reported by the author to be the same as with GridSearchCV - compare with the output above
print(rand.best_score_)
print(rand.best_params_)