In [2]:
# Imports section
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [3]:
# Read the Iris table from the UCI repository. The file has no header row, so
# columns are addressed by position.
dataFrame = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header=None,
)

# The first four columns are used as features and the fifth as the label.
data = dataFrame.values
features, labels = data[:, :4], data[:, 4]
print(features.shape)

# Second extraction of the same table, used by the hand-written Adaline cell:
# IA is the full array, Cl the label column, IC the sorted distinct labels.
IA = dataFrame.values
Cl = IA[:, 4]
IC = np.unique(Cl)

# Accuracy logs: `first` collects the scores of models fitted on all of the data,
# `second` collects (train, test) score pairs from the split-data cells.
first = []
second = []

(150, 4)


In [4]:
# Adaline Model
class AdalineGD(object):
    """Adaptive linear neuron (Adaline) trained with full-batch gradient descent.

    Parameters
    ----------
    eta : float
        Learning rate applied to every weight update.
    n_iter : int
        Number of passes over the training data.

    Attributes
    ----------
    w_ : ndarray
        Weight vector; index 0 is the bias term, the remaining entries are the
        per-feature weights. Re-initialised to zeros on every call to ``fit``.
    cost_ : list
        Half the sum of squared errors recorded once per pass in ``fit``.
    """

    def __init__(self, eta=0.01, n_iter=50):
        self.eta, self.n_iter = eta, n_iter
        self.w_ = np.zeros(1)
        self.cost_ = []

    def fit(self, x, y):
        """Fit the weights to ``x`` (samples x features) and targets ``y``; returns ``self``."""
        n_features = x.shape[1]
        self.w_ = np.zeros(n_features + 1)
        self.cost_ = []
        for _ in range(self.n_iter):
            # Residuals are taken against the weights *before* this pass's update.
            residuals = y - self.net_input(x)
            self.w_[1:] += self.eta * x.T.dot(residuals)
            self.w_[0] += self.eta * residuals.sum()
            self.cost_.append((residuals ** 2).sum() / 2.0)
        return self

    def net_input(self, x):
        """Return the weighted sum of the features plus the bias."""
        bias, weights = self.w_[0], self.w_[1:]
        return np.dot(x, weights) + bias

    def activation(self, x):
        """Linear (identity) activation: simply the net input."""
        return self.net_input(x)

    def predict(self, x):
        """Return 1 where the activation is non-negative and -1 elsewhere."""
        is_positive = self.activation(x) >= 0.0
        return np.where(is_positive, 1, -1)

    def accuracy(self, x, y):
        """Return the fraction of correct predictions.

        With targets of -1/+1 every wrong prediction contributes |y - y_hat| = 2,
        hence the division by ``2 * y.size``.
        """
        total_abs_error = abs(y - self.predict(x)).sum()
        return 1.0 - total_abs_error / (2.0 * y.size)

# Build a two-class problem for Adaline from the Iris table and fit it on all selected rows.
cdx = np.array([1,2]) # Focusing on the classifications of "versicolor" and "virginica", class indexes 1 and 2
fdx = np.array([0,1,2,3]) # all features

# Row masks over the label column: the first chosen class becomes the +1 target,
# the second the -1 target; every other row is dropped. Original row order is kept.
is_positive = Cl == IC[cdx[0]]
is_negative = Cl == IC[cdx[1]]
selected = is_positive | is_negative
numCl = int(selected.sum())

# NC holds the +1/-1 targets, NF the chosen feature columns as floats
# (IA is a mixed-type array, so the values are cast on assignment).
NC = np.where(is_positive[selected], 1.0, -1.0)
NF = np.zeros((numCl, len(fdx)))
NF[:] = IA[selected][:, fdx]

# Standardise each feature column to zero mean and unit (population) standard deviation.
NF_Std = (NF - NF.mean(axis=0)) / NF.std(axis=0)

# Fit on every selected row and log the accuracy measured on those same rows.
ada = AdalineGD(n_iter=50, eta=0.003)
ada.fit(NF_Std, NC)
first.append(ada.accuracy(NF_Std, NC))
print(first[-1])

0.97


In [None]:
# Logistic regression, fitted and scored on the full data set (no hold-out).
lr = LogisticRegression()
lr.fit(features, labels)
first.append(lr.score(features, labels))
print(first[-1])

In [None]:
# Support vector machine with a linear kernel, fitted and scored on the full data set.
linear_svm = svm.SVC(kernel='linear')
linear_svm.fit(features, labels)
first.append(linear_svm.score(features, labels))
print(first[-1])

In [None]:
# Support vector machine with a polynomial kernel, fitted and scored on the full data set.
poly_svm = svm.SVC(kernel='poly')
poly_svm.fit(features, labels)
first.append(poly_svm.score(features, labels))
print(first[-1])

In [None]:
# Support vector machine with an RBF kernel, fitted and scored on the full data set.
rbf_svm = svm.SVC(kernel='rbf')
rbf_svm.fit(features, labels)
first.append(rbf_svm.score(features, labels))
print(first[-1])

In [None]:
# Decision tree, fitted and scored on the full data set.
decision_tree = tree.DecisionTreeClassifier()
decision_tree.fit(features, labels)
first.append(decision_tree.score(features, labels))
print(first[-1])

In [None]:
# Ada Boost Model with Decision Trees as base classifier
# The weak learner is deliberately left at its default: scikit-learn documents that
# default as a DecisionTreeClassifier initialized with max_depth=1, which is what we need.
# It is not spelled as `base_estimator=None` because that keyword was renamed to
# `estimator` in scikit-learn 1.2 and removed in 1.4, where passing it raises a TypeError.
ada_boost = AdaBoostClassifier().fit(features, labels)
first.append(ada_boost.score(features, labels))
print(first[-1])

In [None]:
# Random forest, fitted and scored on the full data set.
random_forest = RandomForestClassifier()
random_forest.fit(features, labels)
first.append(random_forest.score(features, labels))
print(first[-1])

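From here on out we will test with a 2/3 train / 1/3 test split: 100 training samples and 50 test samples for the full 150-sample data set (the two-class Adaline subset is split in the same proportion).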

In [None]:
# Adaline model on a 2/3 train / 1/3 test split of the standardized two-class data.
# random_state pins the shuffle so that re-running the notebook reproduces the same scores.
# NOTE(review): NF_Std was standardized with the mean/std of all rows before splitting,
# so the test rows influenced the scaling - confirm this is acceptable for the comparison.
xa_train, xa_test, ya_train, ya_test = train_test_split(
    NF_Std, NC, train_size=(2/3), test_size=(1/3), random_state=42)
adaline = AdalineGD(n_iter=50, eta=0.003).fit(xa_train, ya_train)

# Log the train accuracy first, then the test accuracy.
second.append(adaline.accuracy(xa_train, ya_train))
print(second[-1])
second.append(adaline.accuracy(xa_test, ya_test))
print(second[-1])

In [None]:
# All the models have been tested. Now we will test all the models again but with the data split by sklearn
# We will utilize the train_test_split to split the 150 iris samples into 100 training samples and 50 test samples
# The sizes are given as absolute sample counts so the 100/50 split does not depend on
# floating-point rounding of 100/150 and 50/150, and random_state pins the shuffle so
# that re-running the notebook reproduces the same split.
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, train_size=100, test_size=50, random_state=42)

In [None]:
# Logistic regression on the split data: the train accuracy is logged first, then the test accuracy.
lr = LogisticRegression(max_iter=120)
lr.fit(x_train, y_train)
second.extend([lr.score(x_train, y_train), lr.score(x_test, y_test)])
print(second[-2])
print(second[-1])

In [None]:
# Linear-kernel support vector machine on the split data: train accuracy is logged first, then test accuracy.
linear_svm = svm.SVC(kernel='linear')
linear_svm.fit(x_train, y_train)
second.extend([linear_svm.score(x_train, y_train), linear_svm.score(x_test, y_test)])
print(second[-2])
print(second[-1])

In [None]:
# Polynomial-kernel support vector machine on the split data: train accuracy is logged first, then test accuracy.
poly_svm = svm.SVC(kernel='poly')
poly_svm.fit(x_train, y_train)
second.extend([poly_svm.score(x_train, y_train), poly_svm.score(x_test, y_test)])
print(second[-2])
print(second[-1])

In [None]:
# RBF-kernel support vector machine on the split data: train accuracy is logged first, then test accuracy.
rbf_svm = svm.SVC(kernel='rbf')
rbf_svm.fit(x_train, y_train)
second.extend([rbf_svm.score(x_train, y_train), rbf_svm.score(x_test, y_test)])
print(second[-2])
print(second[-1])

In [None]:
# Decision tree on the split data: train accuracy is logged first, then test accuracy.
decision_tree = tree.DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)
second.extend([decision_tree.score(x_train, y_train), decision_tree.score(x_test, y_test)])
print(second[-2])
print(second[-1])

In [None]:
# Ada Boost Model with Decision Trees as base classifier
# The weak learner is deliberately left at its default: scikit-learn documents that
# default as a DecisionTreeClassifier initialized with max_depth=1, which is what we need.
# It is not spelled as `base_estimator=None` because that keyword was renamed to
# `estimator` in scikit-learn 1.2 and removed in 1.4, where passing it raises a TypeError.
ada_boost = AdaBoostClassifier().fit(x_train, y_train)

# Log the train accuracy first, then the test accuracy.
second.append(ada_boost.score(x_train, y_train))
print(second[-1])
second.append(ada_boost.score(x_test, y_test))
print(second[-1])

In [None]:
# Random forest on the split data: train accuracy is logged first, then test accuracy.
random_forest = RandomForestClassifier()
random_forest.fit(x_train, y_train)
second.extend([random_forest.score(x_train, y_train), random_forest.score(x_test, y_test)])
print(second[-2])
print(second[-1])

In [None]:
# Summary table: one row per model, one column per evaluation setting.
cases = np.array([
    "Adaline",
    "Logistic Regression",
    "Support Vector Machine (linear)",
    "Support Vector Machine (Polynomial)",
    "Support Vector Machine (RBF)",
    "Decision Trees",
    "Ada Boost",
    "Random Forest",
])

# `second` alternates train and test scores per model, so the even positions are
# the train accuracies and the odd positions the test accuracies.
train_scores = second[0::2]
test_scores = second[1::2]
df = pd.DataFrame({
    "Regular": first,
    "Train_Test_Split (x_train)": train_scores,
    "Train_Test_Split (x_test)": test_scores,
})

# Draw the frame as a matplotlib table on an otherwise invisible figure and axes.
fig, ax = plt.subplots(dpi=200, figsize=(10, 2))
fig.patch.set_visible(False)
ax.axis('off')
ax.axis('tight')
ax.table(
    cellText=df.values,
    rowLabels=cases,
    colLabels=df.columns,
    cellLoc='center',
    loc='center',
)
fig.tight_layout()
plt.show()

In order to clearly describe and analyze the results of the machine learning models, we will first draw
conclusions about each model by itself and then draw conclusions by considering all the models in unison.

Adaline:  
The Adaline model had a 97% accuracy by itself, which shows that the model is very capable by itself. In addition,
after using train_test_split with the Adaline model the _train accuracy increased to 98.48% while the _test accuracy
was 94%. There is some overfitting in this particular simulation since there is a difference of about 4% between _train and _test accuracy.


Logistic Regression:  
The Logistic Regression model had an accuracy of 97.3% by itself, which means that the model
is very capable by itself since the score is near 100%. After utilizing the train_test_split method to
create the _train and _test samples, the _train accuracy of the model increased to 98%, while the _test
accuracy was at 98%. Since the _test accuracy (98%) is equal to the _train accuracy (98%), we can say that in this simulation there wasn't any overfitting.

Support Vector Machine (Linear):
The Support Vector Machine when using kernel='linear' has an accuracy of 99.3% by itself, which means
the model is very capable by itself since the score is near 100%. After utilizing the train_test_split method to split the data into _train and _test, the _train data had an accuracy of 99% and the _test data had an accuracy of 98%. Clearly, the _train accuracy is higher than the _test accuracy by only 1%, which means that there is little to no overfitting in this particular simulation.

Support Vector Machine (Polynomial):
The Support Vector Machine when using kernel='poly' has an accuracy of 97.3% by itself, which means
the model is very capable by itself since the score is near 100%. After utilizing the train_test_split
method to split the data into _train and _test, the _train data had an accuracy of 99% and the test data
had an accuracy of 98%. This model somewhat benefitted from splitting the data because the _train accuracy
increased from 97.3% to 99%. Also, the _test accuracy is 98% which is only 1% different than the _train accuracy
so we can say that there was very little or no over fitting in this particular simulation.

Support Vector Machine (RBF):
The Support Vector Machine when using kernel='rbf' has an accuracy of 97.3% by itself, which means
the model is very capable by itself since the score is near 100%. After utilizing the train_test_split
method to split the data into _train and _test, the _train data had an accuracy of 96% and the test data
had an accuracy of 98%. The _test accuracy is 98% which is higher than the 96% _train accuracy
so we can say that there was no over fitting in this particular simulation.

Decision Trees:
The Decision Tree model has an accuracy of 100%, which means that this model is very accurate and capable.
After utilizing the train_test_split method to split the data into _train and _test, the _train data had an accuracy 
of 100% and the _test data had an accuracy of 96%. Clearly, the _test accuracy is 4% less than the _train accuracy and therefore,
we can say there is some over fitting to the data but not too much since the difference was only 4%. 

Ada Boost: 
The Ada Boost model using Decision Trees as base classifier gave us an accuracy of 96% by itself, which means that
it is capable by itself because the score is near 100%. After utilizing the train_test_split method to split the data into _train and _test, the _train data had an accuracy of 98% and the _test data had an accuracy of 96%. This shows signs of 
slight over fitting for our particular simulation because there was 2% difference in _train accuracy and _test accuracy.
Also, the model was capable of benefitting from the splitting because the split data led to a 2% increase in accuracy
in the _train results.

Random Forest:
The Random Forest model gave us an accuracy of 100% by itself, which shows it is very capable and accurate. After utilizing 
the train_test_split method to split the data into _train and _test, the _train data had an accuracy of 100% and the 
_test accuracy was 96%. Clearly, the 4% difference between the _train and _test accuracies shows that there is some slight overfitting in this particular simulation.


In Summary:
The most capable models by themselves seem to be Decision Trees, Random Forest and Support Vector Machine (Linear) because
they have the top 3 highest accuracies by themselves. After utilizing the train_test_split method the models usually
stayed the same or benefitted from splitting the data. Every model was around 98% to 100%, which is a 1%-2% difference after splitting the data, except for the Adaline model and the Support Vector Machine (RBF) model. The models with the most overfitting were the Adaline model, Decision Trees model, and Random Forest model because they gave the highest difference, which was 4% between _train and _test scores.




