<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Libraries" data-toc-modified-id="Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Libraries</a></span></li><li><span><a href="#Load-data" data-toc-modified-id="Load-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load data</a></span><ul class="toc-item"><li><span><a href="#Explore-de-data" data-toc-modified-id="Explore-de-data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Explore de data</a></span></li><li><span><a href="#Data-Visualization" data-toc-modified-id="Data-Visualization-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Data Visualization</a></span></li></ul></li><li><span><a href="#Predictive-model" data-toc-modified-id="Predictive-model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Predictive model</a></span><ul class="toc-item"><li><span><a href="#Run-different-algorithms-on-training-dataset" data-toc-modified-id="Run-different-algorithms-on-training-dataset-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Run different algorithms on training dataset</a></span></li><li><span><a href="#Comparing-Algorithms" data-toc-modified-id="Comparing-Algorithms-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Comparing Algorithms</a></span></li><li><span><a href="#Make-predictions-on-Validation-dataset" data-toc-modified-id="Make-predictions-on-Validation-dataset-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Make predictions on Validation dataset</a></span></li></ul></li><li><span><a href="#Save-the-model" data-toc-modified-id="Save-the-model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Save the model</a></span></li></ul></div>

# Libraries 

In [None]:
import pandas as pd
from sklearn import datasets
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import matplotlib.pyplot as plt 
import pickle

# Load data

In [None]:
# Fetch the iris dataset that ships with scikit-learn; the returned object
# exposes the feature matrix (.data), the labels (.target) and the
# feature names (.feature_names) used below.
data = datasets.load_iris()

# One DataFrame column per feature name.
df = pd.DataFrame(data=data.data, columns=data.feature_names)

# Preview the first rows (last expression of the cell -> rendered output).
df.head()

In [None]:
# Single-column frame holding the class labels under the name "target".
df2 = pd.Series(data.target, name="target").to_frame()

In [None]:
# Attach the label column to the feature columns. Neither frame was given an
# explicit index, so the rows line up one-to-one on the default index.
final = df.join(df2, how="left")

## Explore de data

In [None]:
# Descriptive statistics (count, mean, spread, quantiles) for every column.
print("\n\nStatistical Summary\n")
summary_stats = final.describe()
display(summary_stats)
print('\n')

## Data Visualization 

In [None]:
# Show the column labels available for plotting.
final.columns

In [None]:
# Univariate view: one histogram per column.
final.hist(edgecolor='black', linewidth=1.2)
plt.show()

# Box-and-whisker plots, one panel per measurement.
# `final` holds the four measurement columns plus "target"; five panels do
# not fit the 2x2 grid requested here, so the label column is left out.
final.drop(columns=["target"]).plot(
    kind='box',
    subplots=True,
    layout=(2, 2),
    sharex=False,
    sharey=False,
    title="Boxplot(Class vs cm)",
)
plt.show()

# Bivariate view: sepal measurements against each other.
final.plot(kind="scatter", x="sepal length (cm)", y="sepal width (cm)")
plt.title("sepal length (cm) vs sepal width (cm)")
plt.show()

# Bivariate view: petal measurements against each other.
final.plot(kind="scatter", x="petal length (cm)", y="petal width (cm)")
plt.title("petal length (cm) vs petal width (cm)")
plt.show()

# Predictive model

In [None]:
# Preview the first rows of the frame that feeds the model.
final.head()

In [None]:
# Separate the label column (y) from the feature columns (X).
y = final["target"]
X = final.drop(columns=["target"])

# Hold out 20% of the rows for the final evaluation.
# random_state pins the shuffle so a Restart & Run All reproduces the same
# split; stratify=y keeps the class proportions equal in both partitions.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Run different algorithms on training dataset
 

In [None]:
# Candidate classifiers: linear (LR) and non-linear (RDF, KNN, CART, NB, SVM).
# The tree-based models draw random numbers while fitting, so they are seeded
# to make the scores below repeatable.
# NOTE(review): `multi_class` is reported as deprecated in recent scikit-learn
# releases - confirm against the version this notebook is pinned to.
models = [
    ('LR', LogisticRegression(multi_class='ovr')),
    ('RDF', RandomForestClassifier(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# One shared, seeded splitter so every model is scored on the same ten folds
# and each fold keeps the class proportions of the training set.
kfold = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Evaluate each model in turn; `results` and `names` are kept in matching
# order so they can be plotted together afterwards.
results = []
names = []
print("Model\t", "Mean\t\t", "Std")
for name, model in models:
    cv_results = model_selection.cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring='accuracy'
    )
    results.append(cv_results)
    names.append(name)
    msg = "%s:\t %f\t (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


## Comparing Algorithms


In [None]:
# Side-by-side box plots of the per-fold scores collected for each algorithm.
# The author's run reportedly ranked SVM highest; the ranking depends on the
# split, so confirm it against the table printed by the evaluation loop.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

## Make predictions on Validation dataset
 

In [None]:
# Refit the SVM with the same setting that was cross-validated (gamma='auto')
# so the hold-out score describes the configuration that was actually compared.
model = SVC(gamma='auto')
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

# accuracy_score returns a fraction in [0, 1]; scale it to match the "%" sign.
accuracy_pct = round(100 * accuracy_score(Y_test, predictions), 2)
print("Accuracy=", accuracy_pct, "%\n")
print("Confusion Matrix=\n", confusion_matrix(Y_test, predictions), "\n")
print("Classification Report=\n", classification_report(Y_test, predictions))

# Save the model

In [None]:
# Train the estimator before persisting it: pickling a freshly constructed
# RandomForestClassifier would store an unfitted object that cannot predict.
# NOTE(review): the hold-out evaluation in this notebook uses an SVC - confirm
# which estimator is meant to be shipped as the "best" model.
best_model = RandomForestClassifier(random_state=42)
best_model.fit(X_train, Y_train)

# save the model to disk; the context manager closes the file handle
with open("mi_mejor_modelo", 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
# load the model from disk; the context manager closes the file handle.
# Only unpickle files from a trusted source: pickle.load can execute
# arbitrary code embedded in the file.
with open("mi_mejor_modelo", 'rb') as model_file:
    loaded_model = pickle.load(model_file)


In [None]:
# Display the restored estimator to confirm the pickle round trip.
loaded_model