In [None]:
import numpy as np
import pandas as pd

from bokeh.plotting import figure,show,output_notebook
from bokeh.models import Range1d

from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.cross_validation import cross_val_score
import matplotlib.pyplot as plt

# Send Bokeh output to the notebook so that later show(...) calls render in the cell output.
output_notebook()

# The imports above bring in numpy, pandas, bokeh, scikit-learn and matplotlib.

In [None]:
# Load the Iris dataset that ships with scikit-learn.
sk_iris = datasets.load_iris()
Names = sk_iris.target_names

# One row per flower: the measured features plus a `target` column.
iris = pd.DataFrame(data=sk_iris.data, columns=sk_iris.feature_names)
iris['target'] = sk_iris.target
iris.head()

# `target` stores one integer code per row; `Names` maps each code to its
# species name. There are 3 different categories.

In [None]:
# Number of rows for each target class.
iris['target'].value_counts()

In [None]:
# Plot the iris dataset

#Create a plot with custom colors
%matplotlib inline
from matplotlib.colors import ListedColormap
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# plot petal length vs petal width: color by species(target)
iris.plot(kind='scatter', x='petal length (cm)', y='petal width (cm)', c='target', colormap=cmap_bold)

In [None]:
# Sepal length against sepal width, one color per species (target).
iris.plot(
    x='sepal length (cm)',
    y='sepal width (cm)',
    kind='scatter',
    c='target',
    colormap=cmap_bold,
)


In [None]:
# Separate the label column from the feature columns, then hold out 20% of
# the rows as a test set.
y = iris['target']
X = iris.drop('target', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=0,  # fixed seed so the split is the same on every run
)

In [None]:
# Fit a KNN classifier with 5 neighbors on the training split, then score it
# on the held-out test split.
# The train/test split is defined once, in the previous cell. It used to be
# repeated here with the same arguments and seed, which produced the identical
# split and only obscured where the data partition comes from.
myknn = KNeighborsClassifier(n_neighbors=5)
myknn.fit(X_train, y_train)
myknn.score(X_test, y_test)

In [None]:
# KNN classifier: how does the test score change as the number of neighbors grows?
#   1. List the neighbor counts to try.
#   2. For each count, fit a KNN model on the training split and score it on
#      the test split (both splits come from the earlier train/test cell).
#   3. Record the score against the neighbor count that produced it.
#   4. Plot the results.

n_neighbors = range(1, 51)

scores = []
for n in n_neighbors:
    clf = KNeighborsClassifier(n_neighbors=n)
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))

# Index the scores by the neighbor count itself. With the default 0-based
# index the x-axis was shifted by one (the k=1 score was drawn at x=0).
knn_scores_df = pd.DataFrame(scores, columns=["Accuracy"], index=list(n_neighbors))
ax = knn_scores_df.plot(figsize=(10,8), title="N-Neighbor Parameter Accuracy")
ax.set_xlabel("N-Neighbors")
ax.set_ylabel("Accuracy")

In [None]:
# Let's take a closer look at the metrics: sklearn.metrics.classification_report
# gives a more informative, per-class picture than a single score.
# precision = fraction of selected items that are relevant:
#             true positives / (true positives + false positives)
# recall    = fraction of relevant items that are selected:
#             true positives / (true positives + false negatives)
# F-score   = harmonic mean of precision and recall:
#             2 * (precision * recall) / (precision + recall)

from sklearn import metrics

# Translate the integer class codes into species names so the report rows are
# labelled with names rather than numbers.
species = sk_iris['target_names']
actual_names = [species[label] for label in y_test]
predicted_names = [species[label] for label in myknn.predict(X_test)]

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(metrics.classification_report(actual_names, predicted_names))

In [None]:
# Logistic regression on a two-class subset of Iris
from sklearn.linear_model import LogisticRegression

# Keep only the rows whose target is not 0, leaving two classes to separate.
irislog = iris[iris['target'] != 0]
target = irislog['target']
features = irislog.drop('target', axis=1)

# Evaluate a C=1 logistic regression with 5-fold cross-validation and show
# the mean score across the folds.
model_lr = LogisticRegression(C=1)
cv_scores = cross_val_score(model_lr, features, target, cv=5)
cv_scores.mean()


In [None]:
# Determine feature importance: fit the model on all of the two-class data and
# inspect the coefficient learned for each feature.
model_lr = LogisticRegression(C=1).fit(features, target)
x = np.arange(len(features.columns))  # one integer position per feature; reused by the plotting cells
names = features.columns              # feature labels; reused for the plot legends
# Single-argument print(...) produces the same output on Python 2 and Python 3,
# unlike the bare print statement, which is a syntax error on Python 3.
print(names)
print(model_lr.coef_)


In [None]:
# Bar chart of the fitted coefficients: one colored bar per feature.
bar_colors = ['red','orange','green','purple']
coefs = model_lr.coef_.ravel()

p = figure(title="Model Coefficients")
for pos in x:
    # Each bar spans [pos+0.2, pos+0.8] horizontally and runs from 0 to the coefficient.
    p.quad(left=pos+0.2, right=pos+0.8,
           bottom=0, top=coefs[pos],
           color=bar_colors[pos],
           legend=names[pos])

# Pad the y-range slightly below the smallest coefficient and more generously
# above the largest one (presumably to leave room for the legend).
p.y_range = Range1d(coefs.min()-0.1, coefs.max()+1.5)
show(p)

In [None]:
# The logistic regression above was fit on unscaled features. Standardize them
# and compare both the cross-validated score and the coefficients.
from sklearn.pipeline import make_pipeline

# Model fit on the standardized data; its coefficients are plotted in the next cell.
n_features = StandardScaler().fit_transform(features)
new_model_lr = LogisticRegression(C=1).fit(n_features, target)

# Bug fix: the score used to be computed on the raw `features`, so it simply
# repeated the unscaled result (cross_val_score refits the estimator on the
# data it is given). Scaling now happens inside a pipeline, so each fold fits
# its scaler on that fold's training rows only and nothing leaks from the
# held-out rows.
scaled_lr = make_pipeline(StandardScaler(), LogisticRegression(C=1))
print(cross_val_score(scaled_lr, features, target, cv=5).mean())
print(new_model_lr.coef_)

In [None]:
# Bar chart of the coefficients from the model fit on standardized features.
scaled_bar_colors = ['red','orange','green','purple']
scaled_coefs = new_model_lr.coef_.ravel()

p = figure(title="Model Coefficients Normalized")
for pos in x:
    # Each bar spans [pos+0.2, pos+0.8] horizontally and runs from 0 to the coefficient.
    p.quad(left=pos+0.2, right=pos+0.8,
           bottom=0, top=scaled_coefs[pos],
           color=scaled_bar_colors[pos],
           legend=names[pos])

# Pad the y-range slightly below the smallest coefficient and more generously
# above the largest one (presumably to leave room for the legend).
p.y_range = Range1d(scaled_coefs.min()-0.1, scaled_coefs.max()+1.5)
show(p)

In [None]:
# Run the unsupervised K-means model on the Iris features.
from sklearn.cluster import KMeans

# Fit 3 clusters, as there are 3 different species. K-means initialisation is
# randomised, so the seed is pinned (0, matching the train/test split) to get
# the same clusters on every Restart & Run All.
km = KMeans(n_clusters=3, random_state=0)
km.fit(X)

In [None]:
# Cluster centers of the fitted K-means model, kept in `centers` and shown as the cell output.
centers = km.cluster_centers_
centers

In [None]:
# Compare silhouette scores as the number of clusters increases.
from sklearn.metrics import silhouette_score

my_ks = range(2,16)
silhouette = []
for k in my_ks:
    # Seeded so the curve is identical on every run; unseeded K-means can
    # converge to different clusterings and change the scores between runs.
    temp_km = KMeans(n_clusters=k, random_state=0)
    temp_km.fit(X)
    temp_labels = temp_km.labels_
    new_score = silhouette_score(X,temp_labels,metric='euclidean')
    silhouette.append(new_score)

p = figure(title='Silhouette',
           x_axis_label='Number of clusters (k)',
           y_axis_label='Silhouette score')
# list(...) hands bokeh a concrete sequence; on Python 3 range() is lazy.
p.line(list(my_ks),silhouette)
show(p)