<h3>Importing other libraries together</h3>

In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pandas.plotting import parallel_coordinates
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

<h3> Load Dataset</h3>

In [15]:
# Load dataset
url = "E:/Downloads/Data Science/iris/iris.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = read_csv(url, names=names)

In [16]:
dataset

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,sepal_length,sepal_width,petal_length,petal_width,species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,virginica
147,6.3,2.5,5.0,1.9,virginica
148,6.5,3.0,5.2,2.0,virginica
149,6.2,3.4,5.4,2.3,virginica


<h3> Dimensions of Dataset</h3>

In [18]:
dataset.describe()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
count,151.0,151.0,151.0,151.0,151
unique,36.0,24.0,44.0,23.0,4
top,5.0,3.0,1.5,0.2,setosa
freq,10.0,26.0,14.0,28.0,50


In [19]:
# shape
print(dataset.shape)

(151, 5)


In [29]:
dataset.groupby('class').size()

class
setosa        50
species        1
versicolor    50
virginica     50
dtype: int64

<h3> Peek at the Data</h3>

In [30]:
# head
print(dataset.head(20))

    sepal-length  sepal-width  petal-length  petal-width    class
0   sepal_length  sepal_width  petal_length  petal_width  species
1            5.1          3.5           1.4          0.2   setosa
2            4.9          3.0           1.4          0.2   setosa
3            4.7          3.2           1.3          0.2   setosa
4            4.6          3.1           1.5          0.2   setosa
5            5.0          3.6           1.4          0.2   setosa
6            5.4          3.9           1.7          0.4   setosa
7            4.6          3.4           1.4          0.3   setosa
8            5.0          3.4           1.5          0.2   setosa
9            4.4          2.9           1.4          0.2   setosa
10           4.9          3.1           1.5          0.1   setosa
11           5.4          3.7           1.5          0.2   setosa
12           4.8          3.4           1.6          0.2   setosa
13           4.8          3.0           1.4          0.1   setosa
14        

<h3>Statistical Summary</h3>

In [31]:
# descriptions
print(dataset.describe())

       sepal-length sepal-width petal-length petal-width   class
count           151         151          151         151     151
unique           36          24           44          23       4
top             5.0         3.0          1.5         0.2  setosa
freq             10          26           14          28      50


<h3>Class Distribution</h3>

In [32]:
# class distribution
print(dataset.groupby('class').size())

class
setosa        50
species        1
versicolor    50
virginica     50
dtype: int64


<h3>Data Visualization</h3>
<h4> Univariate Plots</h4>

In [10]:
# box and whisker plots
#dataset.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
#plt.show()

In [45]:
X_train, y_test = train_test_split(dataset, test_size = 0.4, stratify = dataset['class'], random_state = 42)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [34]:
# histograms
dataset.hist()
#plt.show()

ValueError: hist method requires numerical or datetime columns, nothing to plot.

<h3>Multivariate Plots</h3>

In [None]:
# scatter plot matrix
scatter_matrix(dataset)
plt.show()

<h3>The Complete Code for the above</h3>

In [None]:
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot as plt
# Load dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = read_csv(url, names=names)
# box and whisker plots
dataset.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
plt.show()
# histograms
dataset.hist()
plt.show()
# scatter plot matrix
scatter_matrix(dataset)
plt.show()

<h3>Evaluate Some Algorithms</h3>
<h4>Create a Validation Dataset</h4>

In [None]:
# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
y = array[:,4]
X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size=0.20, random_state=1)


<h3>build and evaluate our models</h3>

In [None]:
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
for name, model in models:
 kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
 cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
 results.append(cv_results)
 names.append(name)
 print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

<h3>Select Best Model</h3>
<h4>Comparing spread and mean</h4>

In [None]:
# Compare Algorithms
plt.boxplot(results, labels=names)
plt.title('Algorithm Comparison')
plt.show()

<h3>The Codes/acripts together Complete </h3>

In [None]:
from pandas import read_csv
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Load dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = read_csv(url, names=names)
# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
y = array[:,4]
X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size=0.20, random_state=1, shuffle=True)
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
for name, model in models:
 kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
 cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
 results.append(cv_results)
 names.append(name)
 print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Compare Algorithms
plt.boxplot(results, labels=names)
plt.title('Algorithm Comparison')
plt.show()

<h3>Make Predictions</h3>

In [None]:
# Make predictions on validation dataset
model = SVC(gamma='auto')
model.fit(X_train, Y_train)
predictions = model.predict(X_validation)

<h3> Evaluate Predictions</h3>

In [None]:
# Evaluate predictions
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))