# Iris Flower 

In [None]:
# Load libraries
import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Default size, in inches, of figures created after this point: [width, height].
plt.rcParams['figure.figsize'] = [10, 5]

Loading the dataset: 

In [None]:
# Column labels handed to read_csv through its `names` argument
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
# Remote location of the iris CSV file
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
# Read the CSV into a DataFrame labelled with the columns above
dataset = pandas.read_csv(filepath_or_buffer=url, names=names)

## Summarize the Dataset

1. Dimensions of the dataset
2. Peek at the data 
3. Statistical summary of all attributes
4. Breakdown of the data by the class variable

### Dimensions of dataset:

In [None]:
# Dimensions of the dataset, printed as (instances, attributes)
n_instances, n_attributes = dataset.shape
print((n_instances, n_attributes))

### Peek at the data

In [None]:
# Show the first 20 rows of the data
first_rows = dataset.head(n=20)
print(first_rows)

### Statistical Summary 

In [None]:
# Summary statistics as reported by DataFrame.describe()
summary = dataset.describe()
print(summary)

### Class Distribution 
(There are 50 instances of each class of flower)

In [None]:
# Count how many rows carry each value of the 'class' column
rows_per_class = dataset.groupby('class').size()
print(rows_per_class)

## Data Visualization 

1. Univariate plots to better understand each attribute
2. Multivariate plots to better understand the relationship between attributes

### Univariate Plots


In [None]:
# Box-and-whisker plots drawn as subplots on a 2x2 grid,
# with neither the x nor the y axis shared between panels
dataset.plot.box(subplots=True, layout=(2, 2), sharex=False, sharey=False)
plt.show()

In [None]:
# histograms
# DataFrame.hist() draws a histogram for each numeric column of the dataset;
# plt.show() then displays the resulting figure.
dataset.hist()
plt.show()

Looks like two of the input variables have a Gaussian distribution. This is useful to note as we can use algorithms that can exploit this assumption.

### Multivariate Plots
Now we can look at the interactions between variables


In [None]:
# scatter plot matrix
# scatter_matrix() plots every pair of numeric columns against each other
# in a grid; plt.show() then displays the resulting figure.
scatter_matrix(dataset)
plt.show()

Note the diagonal grouping of some pairs of attributes. This suggests a high correlation and a predictable relationship.

## Evaluate Some Algorithms

1. Separate out a validation dataset.
2. Set-up the test harness to use 10-fold cross validation
3. Build 6 different models to predict species from flower measurements
4. Select best model

### Create a Validation Dataset 

We need to know that the model we created is any good.
We will use statistical methods to estimate the accuracy of the models that we create on unseen data.
To do this we need to hold back some data that the algorithms will not get to see. We will split the dataset into two parts: 80% of which we will use to train our models and 20% that we will hold back as a validation dataset.

In [None]:
# Split out a validation dataset
# Fraction of rows to hold back, and the seed that makes the split repeatable
validation_size = 0.20
seed = 7
array = dataset.values
# Columns 0-3 are the inputs; column 4 is the target
X, Y = array[:, 0:4], array[:, 4]
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed
)

## Test harness
We will use 10-fold cross validation to estimate accuracy. This will split our dataset into 10 parts, train on 9 and test on 1 and repeat for all combinations of train-test splits.

In [None]:
# Cross-validation settings: the random seed and the name of the scoring metric
seed, scoring = 7, 'accuracy'

We are using the metric of 'accuracy' to evaluate models. This is the ratio of the number of correctly predicted instances to the total number of instances in the dataset, multiplied by 100 to give a percentage (e.g. 95% accurate); note that scikit-learn reports it as a fraction between 0 and 1. We will be using the `scoring` variable when we build and evaluate each model next.

## Build Models 
Let’s evaluate 6 different algorithms:

- Logistic Regression (LR)
- Linear Discriminant Analysis (LDA)
- K-Nearest Neighbors (KNN)
- Classification and Regression Trees (CART)
- Gaussian Naive Bayes (NB)
- Support Vector Machines (SVM)


In [None]:
# Spot check algorithms
# Each candidate estimator is paired with a short label; the label is used in
# the printed report and collected into `names` alongside the scores.
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases; it is left unchanged here -- confirm against the installed version.
models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]
# One 10-fold splitter is shared by every model. shuffle=True is required for
# random_state to be accepted: scikit-learn >= 0.24 raises ValueError when
# random_state is set while shuffle is False. With an integer seed, each call
# to split() produces the same folds, so all models are scored on identical
# train/test partitions.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
# evaluate each model in turn
results = []  # one array of per-fold scores per model
names = []    # model labels, in the same order as `results`
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report as "label: mean (standard deviation)" of the fold scores
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Compare algorithms: one box per entry in `results`, with the x-axis
# ticks labelled from `names`
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

### Make Predictions

In [None]:
# Fit a k-nearest-neighbours classifier on the training split, then predict
# labels for the validation split
knn = KNeighborsClassifier().fit(X_train, Y_train)
predictions = knn.predict(X_validation)
# Print accuracy, the confusion matrix, and the classification report, in that order
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))