In [1]:
##### CodSoft Internship Programme for Data Science (1 October 2023 to 31 October 2023) #####
##### Name:- Deepak Gupta #####
##### Task-3 IRIS Flower Classification ##### 

In [6]:
##### Import Some Important Libraries:-
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", color_codes=True)
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")
import os

In [3]:
##### Import the dataset:-
##### Read IRIS.csv (a relative path, so it is looked up in the current working directory) into the `iris` DataFrame.
##### Later cells reference the columns Id, SepalLengthCm, SepalWidthCm, PetalLengthCm and Species.
iris = pd.read_csv("IRIS.csv")

In [None]:
##### Let's see what's in the iris data:-
##### Preview the first rows to check the column names and the shape of the values
iris.head()

In [None]:
##### Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
iris.describe()

In [None]:
##### Analysing the data visually:-
##### At the outset, let us look at a simple scatter plot to get a visual feel for the data. (We are going to view a host of
##### plots.)

In [None]:
##### Scatterplot of sepal length against sepal width, drawn through the pandas plotting accessor
iris.plot.scatter(x="SepalLengthCm", y="SepalWidthCm")

In [None]:
##### A seaborn jointplot shows bivariate scatterplots and univariate histograms in the same figure
##### The figure-size keyword is `height`; seaborn 0.9 renamed it from `size`, and current releases reject `size`
sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, height=5)

In [None]:
##### We'll use seaborn's FacetGrid to color the scatterplot by species
##### The facet size keyword is `height` (renamed from `size` in seaborn 0.9; `size` is rejected by current releases)
sns.FacetGrid(iris, hue="Species", height=5) \
   .map(plt.scatter, "SepalLengthCm", "SepalWidthCm") \
   .add_legend()

In [None]:
##### Secondly, let us look at the box plot of the dataset, which gives a visual representation of how our data is
##### spread out. A box plot is a percentile-based graph, which divides the data into four quartiles of 25%
##### each. This method is used in statistical analysis to understand measures such as the median, the quartiles and the spread.

In [None]:
##### We can look at an individual feature in Seaborn through a boxplot
##### One box per species (x axis), summarising the distribution of petal length (y axis)
sns.boxplot(x="Species", y="PetalLengthCm", data=iris)

In [None]:
## Extend the boxplot by layering the individual observations on top of it with seaborn's stripplot

## jitter=True spreads the points sideways so they don't all fall on a single vertical line per species
## Both layers share one axes: the axes returned by boxplot are handed to stripplot explicitly via ax=

ax = sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
ax = sns.stripplot(x="Species", y="PetalLengthCm", data=iris, jitter=True, edgecolor="gray", ax=ax)

In [None]:
## This is a special plot called a violin plot

## A violin plot combines the benefits of the previous two plots and simplifies them: denser regions of the data are fatter,
## and sparser regions thinner
## `size` is not a violinplot parameter (it never controlled the figure size here), so it is not passed
sns.violinplot(x="Species", y="PetalLengthCm", data=iris)

In [None]:
## Next is a visual based on probability density, called a kernel density plot (KDE plot)

## A final seaborn plot useful for looking at univariate relations is the kdeplot, which creates and visualizes a kernel
## density estimate of the underlying feature
## The facet size keyword is `height` (renamed from `size` in seaborn 0.9; `size` is rejected by current releases)

sns.FacetGrid(iris, hue="Species", height=6) \
   .map(sns.kdeplot, "PetalLengthCm") \
   .add_legend()

In [None]:
##### Another useful seaborn plot is a hybrid plot called pairplot, which shows the bivariate relation between each pair of
##### features. Let's take a look:-

In [None]:
## From the pairplot, we'll see that the Iris-setosa species is separated from the other two across all feature combinations
## The Id column is dropped so only the measurements are paired; the per-facet size keyword is `height`
## (renamed from `size` in seaborn 0.9; `size` is rejected by current releases)

sns.pairplot(iris.drop("Id", axis=1), hue="Species", height=3)

In [None]:
## Grid of box plots: one panel per measurement column, grouped by species (Id column excluded)
iris.drop(columns="Id").boxplot(by="Species", figsize=(12, 6))

In [7]:
##### And now, let's see some special visuals! One cool, more sophisticated technique pandas has available is called Andrews
##### Curves.

In [None]:
## Andrews Curves use each sample's attributes as the coefficients of a Fourier series and plot the resulting curve,
## coloured here by species (Id column excluded)

from pandas.plotting import andrews_curves
andrews_curves(iris.drop(columns="Id"), class_column="Species")

In [None]:
## Parallel coordinates is another multivariate technique in pandas: each feature gets its own vertical axis and every
## sample is drawn as a line connecting its values across those axes (Id column excluded)

from pandas.plotting import parallel_coordinates
parallel_coordinates(iris.drop(columns="Id"), class_column="Species")

In [None]:
##### A final multivariate visualization technique

In [None]:
## radviz places each feature as an anchor point on a 2D plane and positions every sample as if it were attached to
## those anchors by springs weighted by its relative value for each feature (Id column excluded)

from pandas.plotting import radviz
radviz(iris.drop(columns="Id"), class_column="Species")

In [None]:
##### INTO THE REALM OF MACHINE LEARNING:-

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [None]:
## Separating the data into independent (X) and dependent (y) variables
## Columns are selected by name rather than position. "Id" is excluded from the features: the plotting cells drop it
## as well, and it is presumably just a row identifier -- if the rows are ordered by species it would leak the label.
X = iris.drop(columns=["Id", "Species"]).values
y = iris["Species"].values

## Splitting the dataset into the Training set (80%) and Test set (20%); random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
##### Training the model:-
##### Using some of the commonly used algorithms, we will train our model to check how accurate each algorithm is. We will
##### be implementing these algorithms to compare:-
##### 1. Logistic Regression
##### 2. K – Nearest Neighbour (KNN)
##### 3. Support Vector Machine (SVM)
##### 4. Decision Trees
##### 5. Naive Bayes classifier

##### Let us start building our model and predicting accuracy of every algorithm used. We can also check which gives the best 
##### result.

In [None]:
##### Start with the first algorithm Logistic Regression:-

##### LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# The default iteration cap (max_iter=100) can stop the solver before it converges on unscaled features, and the
# notebook silences all warnings, so a convergence warning would go unnoticed. A higher cap lets the fit finish.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

##### Summary of the predictions made by the classifier
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

##### Accuracy score (true labels first, matching the scikit-learn signature)
print('accuracy is', accuracy_score(y_test, y_pred))

In [None]:
##### Second algorithm: K-Nearest Neighbours, using the 8 nearest training samples for each prediction
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training are chained
classifier = KNeighborsClassifier(n_neighbors=8).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

##### Per-class report followed by the confusion matrix for the test-set predictions
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep="\n")

##### Accuracy score (fraction of matching labels, so the argument order does not affect the value)
print('accuracy is', accuracy_score(y_test, y_pred))

In [None]:
##### Third algorithm: Support Vector Machine (SVC with its default settings)
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# fit() returns the estimator itself, so construction and training are chained
classifier = SVC().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

##### Per-class report followed by the confusion matrix for the test-set predictions
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep="\n")

##### Accuracy score (fraction of matching labels, so the argument order does not affect the value)
print('accuracy is', accuracy_score(y_test, y_pred))

In [None]:
##### Next, is my favorite, decision trees!

##### Decision Trees
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# The tree builder permutes features randomly when searching for splits, so an unseeded tree can differ between
# runs. random_state=0 (the same seed as the train/test split) makes the reported scores reproducible.
classifier = DecisionTreeClassifier(random_state=0)

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

##### Summary of the predictions made by the classifier
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

##### Accuracy score (true labels first, matching the scikit-learn signature)
print('accuracy is', accuracy_score(y_test, y_pred))

In [None]:
##### Last family: Naive Bayes classifiers (several variants follow), starting with Gaussian Naive Bayes
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained
classifier = GaussianNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

##### Per-class report followed by the confusion matrix for the test-set predictions
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep="\n")

##### Accuracy score (fraction of matching labels, so the argument order does not affect the value)
print('accuracy is', accuracy_score(y_test, y_pred))

In [None]:
##### Naive Bayes variant: Multinomial
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training are chained
classifier = MultinomialNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

##### Per-class report followed by the confusion matrix for the test-set predictions
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep="\n")

##### Accuracy score (fraction of matching labels, so the argument order does not affect the value)
print('accuracy is', accuracy_score(y_test, y_pred))

In [None]:
##### Naive Bayes variant: Bernoulli
##### NOTE(review): BernoulliNB binarizes its inputs (default threshold 0.0 per the scikit-learn docs), so strictly
##### positive measurements would all map to 1 -- confirm this variant is meaningful for this data.
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB

# fit() returns the estimator itself, so construction and training are chained
classifier = BernoulliNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

##### Per-class report followed by the confusion matrix for the test-set predictions
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep="\n")

##### Accuracy score (fraction of matching labels, so the argument order does not affect the value)
print('accuracy is', accuracy_score(y_test, y_pred))

In [None]:
##### Naive Bayes variant: Complement
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import ComplementNB

# fit() returns the estimator itself, so construction and training are chained
classifier = ComplementNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

##### Per-class report followed by the confusion matrix for the test-set predictions
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep="\n")

##### Accuracy score (fraction of matching labels, so the argument order does not affect the value)
print('accuracy is', accuracy_score(y_test, y_pred))

In [None]:
##### Fit every Naive Bayes variant on the training split, score it on the test split and collect the results in `log`
from sklearn.metrics import accuracy_score, log_loss
classifiers = [
    GaussianNB(),
    MultinomialNB(),
    BernoulliNB(),
    ComplementNB(),
]

##### Logging for Visual Comparison
log_cols = ["Classifier", "Accuracy", "Log Loss"]
# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
log_rows = []

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print("="*30)
    print(name)

    print('****Results****')
    # These are predictions on the held-out test split
    test_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, test_predictions)
    print("Accuracy: {:.4%}".format(acc))

    # Real log loss from the predicted class probabilities (previously a hard-coded placeholder value);
    # labels=clf.classes_ keeps the probability columns aligned even if a class is absent from y_test
    test_probabilities = clf.predict_proba(X_test)
    loss = log_loss(y_test, test_probabilities, labels=clf.classes_)

    log_rows.append([name, acc*100, loss])

    print("="*30)

log = pd.DataFrame(log_rows, columns=log_cols)

In [None]:
##### Horizontal bar chart of the accuracy (in percent) recorded for each classifier in `log`
sns.set_color_codes("muted")
ax = sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")

# Label through the Axes object returned by seaborn instead of the pyplot state machine
ax.set_xlabel('Accuracy %')
ax.set_title('Classifier Accuracy')
plt.show()