# Machine learning intro

Using the Iris dataset, code based on tutorials by [Kevin Markham](https://github.com/justmarkham/scikit-learn-videos) and [Jason Brownlee](https://machinelearningmastery.com/machine-learning-with-python/)

In [2]:
# Basic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Prepare data
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_new = [[3,5,4,2], [5,4,3,2]]

## Data exploration

In [4]:
df_iris = pd.DataFrame(X, columns = iris.feature_names)
df_iris['class'] = y
df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


### Univariate plots

In [5]:
# histogram
# df_iris.hist(figsize=(10,10))

df_iris_features = df_iris.iloc[:,0:-1]
shared_plot_params = dict(subplots=True, layout=(2,2), sharex=False, sharey=False, figsize=(10,10))

# density plot of all 4 features
# df_iris_features.plot(kind='density', **shared_plot_params)

# box and whisker plots
# df_iris_features.plot(kind='box', **shared_plot_params)

### Multivariate plots

In [6]:
correlations = df_iris_features.corr()
plt.figure(figsize=(10,10))

# correlation matrix / heatmap
# sns.heatmap(correlations)

# scatter plot
# sns.pairplot(df_iris, hue='class')

<matplotlib.figure.Figure at 0x1180a5668>

<matplotlib.figure.Figure at 0x1180a5668>

## Create model (estimator)
... and perform actual predictions

In [7]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X,y)
knn.predict(X_new)

array([2, 1])

In [8]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X,y)
print ('New prediction for X_new: {}\n'.format(logreg.predict(X_new)))

New prediction for X_new: [2 0]



In [9]:
# get coefficients and intercept from logreg
print ('Logistic regression coefficients: \n{}\n'.format(logreg.coef_))
print ('Logistic regression intercept: \n{}'.format(logreg.intercept_))

Logistic regression coefficients: 
[[ 0.41498833  1.46129739 -2.26214118 -1.0290951 ]
 [ 0.41663969 -1.60083319  0.57765763 -1.38553843]
 [-1.70752515 -1.53426834  2.47097168  2.55538211]]

Logistic regression intercept: 
[ 0.26560617  1.08542374 -1.21471458]


## Accuracy metrics

In [10]:
# testing on entire dataset
from sklearn import metrics
y_pred = logreg.predict(X) 
accuracy = metrics.accuracy_score(y, y_pred)
print ('Testing on entire dataset accuracy: {:.2f}' .format(accuracy))

Testing on entire dataset accuracy: 0.96


In [11]:
# train-test-split
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)

# train on training set
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print ('Train-test-split accuracy: {}'.format(accuracy))

Train-test-split accuracy: 0.95


In [12]:
# cross validation
from sklearn.cross_validation import cross_val_score
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy') # produces array with 10 items
print('Accuracy for 10-fold cross val: {:.2f}'.format(scores.mean()))

Accuracy for 10-fold cross val: 0.97


In [13]:
# confusion matrix
confusion = metrics.confusion_matrix(y_test, y_pred)
print ('Confusion matrix:\n')
print (confusion)

Confusion matrix:

[[25  0  0]
 [ 0 15  2]
 [ 0  1 17]]


In [14]:
# Alternatives for accuracy score
print ('Misclassification rate: {:.2f}'.format(1 - accuracy))
print ('Sensitivity / True Positive Rate / Recall: {:.2f}'.format(metrics.recall_score(y_test, y_pred, average='weighted')))
print ('Precision: {:.2f}'.format(metrics.precision_score(y_test, y_pred, average='weighted')))

Misclassification rate: 0.05
Sensitivity / True Positive Rate / Recall: 0.95
Precision: 0.95


In [15]:
# ROC curve and AUC - only works for binary outcome

# fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
# plt.plot(fpr, tpr)

## Optimizing accuracy using grid search

In [16]:
from sklearn.grid_search import GridSearchCV
# example with multiple parameters at once
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
param_grid = dict(n_neighbors = k_range, weights = weight_options)
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X,y)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [17]:
grid.grid_scores_[:5]

[mean: 0.96000, std: 0.05333, params: {'n_neighbors': 1, 'weights': 'uniform'},
 mean: 0.96000, std: 0.05333, params: {'n_neighbors': 1, 'weights': 'distance'},
 mean: 0.95333, std: 0.05207, params: {'n_neighbors': 2, 'weights': 'uniform'},
 mean: 0.96000, std: 0.05333, params: {'n_neighbors': 2, 'weights': 'distance'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 3, 'weights': 'uniform'}]

In [18]:
print ('Best score is: {} \n- with parameters: {}'.
       format(grid.best_score_, grid.best_params_))

Best score is: 0.98 
- with parameters: {'n_neighbors': 13, 'weights': 'uniform'}


In [19]:
# GridSearchCV can use grid.best_estimator_ directly for predictions
grid.predict(X_new)

array([1, 1])

### RandomizedSearchCV
reduces computational load

In [20]:
from sklearn.grid_search import RandomizedSearchCV
# use parameter distribition rather than grid
param_dist = dict(n_neighbors = k_range, weights = weight_options)
rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10)
rand.fit(X,y)
rand.grid_scores_[:3]

[mean: 0.98000, std: 0.03055, params: {'n_neighbors': 27, 'weights': 'distance'},
 mean: 0.96667, std: 0.04472, params: {'n_neighbors': 7, 'weights': 'uniform'},
 mean: 0.96000, std: 0.05333, params: {'n_neighbors': 1, 'weights': 'distance'}]

In [21]:
print ('Best score is: {} \n- with parameters: {}'.
       format(rand.best_score_, rand.best_params_))

Best score is: 0.98 
- with parameters: {'n_neighbors': 27, 'weights': 'distance'}


In [22]:
# going even further - repeat this 20 times and record best score
best_scores = []
for _ in range(20):
    rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=4)
    rand.fit(X,y)
    best_scores.append(round(rand.best_score_, 3))
print (best_scores)

[0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998, 0.97299999999999998]


In [23]:
max_score = max(best_scores)
max_index = best_scores.index(max_score)
print ('Max score is: {:.3f} at index: {}'.format(max_score, max_index))

Max score is: 0.973 at index: 0
