In [None]:
# بسم الله الرحمن الرحيم
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns


from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm  
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import graphviz
from sklearn.tree import export_graphviz

![](http://editor.analyticsvidhya.com/uploads/51518iris%20img1.png)

## Explore dataset

In [None]:
# Load the Iris CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='/kaggle/input/iris-flower-dataset/IRIS.csv')
# Preview the first rows (head() shows 5 by default)
df.head(n=5)

`df.info()` provides a concise summary of the DataFrame's structure, including the number of rows and columns, the data type of each column, and the number of non-null values per column (which reveals missing values).

In [None]:
# Print the structural summary of the frame (column dtypes and non-null counts)
df.info()

### `df.describe()` generates summary statistics for each numerical column in the DataFrame. These statistics include:

1. Count: The number of non-null (non-missing) values in each column.
2. Mean: The average value of each column.
3. Std: The standard deviation, which measures the amount of variation or dispersion in each column.
4. Min: The minimum value in each column.
5. 25%: The 25th percentile value, also known as the first quartile.
6. 50%: The 50th percentile value, also known as the median or second quartile.
7. 75%: The 75th percentile value, also known as the third quartile.
8. Max: The maximum value in each column.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Distinct labels in the 'species' column together with how many rows carry each label
uni, count = np.unique(df['species'], return_counts=True)
sns.set_color_codes(palette="pastel")
# Horizontal bars: one bar per species, bar length = number of rows
sns.barplot(x=count, y=uni, palette='icefire', width=0.8)
# Remove the left and bottom spines for a cleaner chart
sns.despine(bottom=True, left=True)


## Correlation Matrix

In [None]:
# Pairwise correlation of the numeric columns only (the text 'species' column is skipped)
correlation_matrix = df.corr(numeric_only=True)
# Annotated heatmap so each coefficient can be read directly from its cell
ax = sns.heatmap(data=correlation_matrix, annot=True)
ax.set_title(label='Correlation Matrix')

* petal_length and petal_width are strongly correlated. 
* sepal_length is also strongly correlated with both petal_length and petal_width.

## Visualizations

In [None]:
# Scatter plot for every pair of numeric features, coloured by species.
# With `hue` set, seaborn's default diag_kind="auto" draws per-species density (KDE)
# curves on the diagonal; pass diag_kind="hist" to get histograms instead.
ax = sns.pairplot(data=df, hue='species')

In [None]:
# Joint view of the two sepal measurements: bivariate scatter plot in the centre,
# univariate histograms on the margins.
# `height` sets the (square) figure size in inches; the legacy `size` keyword was
# renamed to `height` in seaborn 0.9 and is no longer accepted as the figure size.
ax = sns.jointplot(data=df, x="sepal_length", y="sepal_width", height=5)


In [None]:
# Same sepal joint plot, now split by species via `hue` so each class gets its own colour
# NOTE(review): `color='b'` appears to have no visible effect once `hue` is set - confirm
ax = sns.jointplot(data=df, x="sepal_length", y="sepal_width", hue='species', color='b')

## Find Outliers

In [None]:
# Collects the row index labels flagged as outliers by the cells below,
# so that all of them can be removed from df in a single step later
outliers_indexes = list()

In [None]:
# show outlier in sepal_length column
# Per-species box plot of sepal_length; points beyond the whiskers are outlier candidates
ax = sns.boxplot(x='species', y='sepal_length', data=df)

* Based on sepal_length, there is an outlier in the Iris-virginica data.

In [None]:
# Iris-virginica rows with sepal_length under 5.5
# (threshold presumably read off the box plot above - confirm)
is_virginica = df['species'] == 'Iris-virginica'
outliers = df[is_virginica & (df['sepal_length'] < 5.5)]

# Record the row labels of the sepal_length outliers for later removal
outliers_indexes.extend(outliers.index.tolist())

outliers

In [None]:
# show outlier in sepal_width column
# Per-species box plot of sepal_width; points beyond the whiskers are outlier candidates
ax = sns.boxplot(data=df, y="sepal_width", x="species")

* Based on sepal_width, there are outliers in the Iris-virginica data.


In [None]:
# Iris-virginica rows whose sepal_width lies outside the 2.5 - 3.6 band
sepal_width_condition = (
    "species == 'Iris-virginica' and "
    "(sepal_width < 2.5 or sepal_width > 3.6)"
)
outliers = df.query(sepal_width_condition)

# Record the row labels of the sepal_width outliers for later removal
outliers_indexes.extend(outliers.index.tolist())
outliers

In [None]:
# show outlier in petal_length column
# Per-species box plot of petal_length; points beyond the whiskers are outlier candidates
ax = sns.boxplot(x='species', y='petal_length', data=df)

* Based on petal_length, there are outliers in the Iris-setosa and Iris-versicolor data.


In [None]:
# Two species show petal_length outliers:
#   - Iris-versicolor: values below 3.3
#   - Iris-setosa: values above 1.8 or below 1.2
versicolor_short = (df['species'] == 'Iris-versicolor') & (df['petal_length'] < 3.3)
setosa_extreme = (df['species'] == 'Iris-setosa') & (
    (df['petal_length'] > 1.8) | (df['petal_length'] < 1.2)
)
outliers = df[versicolor_short | setosa_extreme]
# Record the row labels of the petal_length outliers for later removal
outliers_indexes.extend(outliers.index.tolist())
outliers

In [None]:
# show outlier in petal_width column
# Per-species box plot of petal_width; points beyond the whiskers are outlier candidates
ax = sns.boxplot(x='species', y='petal_width', data=df)

* Based on petal_width, there are outliers in the Iris-setosa data.

In [None]:
# Iris-setosa rows with petal_width above 0.4
is_setosa = df['species'] == 'Iris-setosa'
outliers = df[is_setosa & (df['petal_width'] > 0.4)]
# Record the row labels of the petal_width outliers for later removal
outliers_indexes.extend(outliers.index.tolist())
outliers

## Removing outliers


In [None]:
# Show every flagged row label; a row flagged for several features is listed more than once
print(outliers_indexes)
# De-duplicate the labels and ignore any that are already gone, so re-running this cell
# no longer raises KeyError. Rebinding `df` replaces the previous in-place drop.
df = df.drop(index=sorted(set(outliers_indexes)), errors='ignore')

In [None]:
# Number of rows currently held in df
print(f'number of rows : {len(df)}')

In [None]:
# Re-draw the sepal_length box plot to check the effect of removing the flagged rows
ax = sns.boxplot(x='species', y='sepal_length', data=df)

## Modeling

## Now we will use petals and sepals separately

In [None]:
# Sepal-only feature matrix and its target column
X_sepal = df[['sepal_length', 'sepal_width']]
Y_sepal = df['species']

# Petal-only feature matrix and its target column (same target, different features)
X_petal = df[['petal_length', 'petal_width']]
Y_petal = df['species']

In [None]:
# 80/20 train/test split of the sepal features (seeded so the split is reproducible)
(X_trian_sepal, X_test_sepal,
 y_trian_sepal, y_test_sepal) = train_test_split(
    X_sepal, Y_sepal, random_state=42, test_size=0.2)

# Same split settings for the petal features
(X_trian_petal, X_test_petal,
 y_trian_petal, y_test_petal) = train_test_split(
    X_petal, Y_petal, random_state=42, test_size=0.2)


## DecisionTreeClassifier

### Apply Decision Tree Classifier On Sepals Data

In [None]:
# Decision tree trained on the two sepal features.
# A fixed random_state makes the fitted tree, and therefore the reported accuracy,
# repeatable across runs (the tree builder shuffles features when choosing splits).
model = DecisionTreeClassifier(random_state=42)
model.fit(X_trian_sepal, y_trian_sepal)
prediction = model.predict(X_test_sepal)
# accuracy_score takes (y_true, y_pred); scaled to a percentage
sepal_dec_t_accuracy = accuracy_score(y_test_sepal, prediction) * 100
print(f'The accuracy of the Decision Tree using Sepals is: {sepal_dec_t_accuracy:.2f}')

### Apply Decision Tree Classifier On Petals Data

In [None]:
# Decision tree trained on the two petal features.
# A fixed random_state makes the fitted tree, and therefore the reported accuracy,
# repeatable across runs (the tree builder shuffles features when choosing splits).
model = DecisionTreeClassifier(random_state=42)
model.fit(X_trian_petal, y_trian_petal)
prediction = model.predict(X_test_petal)
# accuracy_score takes (y_true, y_pred); scaled to a percentage
petal_dec_t_accuracy = accuracy_score(y_test_petal, prediction) * 100
print(f'The accuracy of the Decision Tree using Petals is: {petal_dec_t_accuracy:.2f}')

## SVM

### Apply SVM Classifier On Sepals Data

In [None]:
# Support-vector classifier with default settings, trained on the sepal features
model = svm.SVC()
model.fit(X=X_trian_sepal, y=y_trian_sepal)
prediction = model.predict(X=X_test_sepal)
# Share of correctly classified test rows, as a percentage
sepal_svm_accuracy = accuracy_score(y_true=y_test_sepal, y_pred=prediction) * 100
print(f'The accuracy of the SVM using Sepal is: {sepal_svm_accuracy:.2f}')

### Apply SVM Classifier On Petals Data

In [None]:
# Support-vector classifier with default settings, trained on the petal features
model = svm.SVC()
model.fit(X=X_trian_petal, y=y_trian_petal)
prediction = model.predict(X=X_test_petal)
# Share of correctly classified test rows, as a percentage
petal_svm_accuracy = accuracy_score(y_true=y_test_petal, y_pred=prediction) * 100
print(f'The accuracy of the SVM using Petals is: {petal_svm_accuracy:.2f}')

In [None]:
# One row per (classifier, feature subset) pair with its test accuracy in percent
accuracy = pd.DataFrame({
    "classifier": ['decision tree', 'decision tree', 'SVM', 'SVM'],
    "data": ['sepal dataset', 'petal dataset', 'sepal dataset', 'petal dataset'],
    "accuracy": [
        sepal_dec_t_accuracy,
        petal_dec_t_accuracy,
        sepal_svm_accuracy,
        petal_svm_accuracy,
    ],
})
# Grouped bar chart: classifiers on the x-axis, one bar per feature subset
g = sns.catplot(
    data=accuracy,
    kind="bar",
    x="classifier",
    y="accuracy",
    hue="data",
    palette="dark",
    alpha=.9,
    height=5,
)

This was expected as we saw in the heatmap above that the correlation between the Sepal Width and Length was very low whereas the correlation between Petal Width and Length was very high.

## AdaBoost classifier using all the features of the iris data frame

In [None]:
# The petal features alone already gave high accuracy, so the sepal columns are not
# strictly needed; all four are used here to see how AdaBoost does with every feature.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = df[feature_columns]
Y = df['species']

# Hold out 20% of the rows for testing (seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42, test_size=0.2)

In [None]:
# Weak base learner: a depth-1 decision tree (a "stump").
# An unrestricted tree typically fits the training data perfectly, and AdaBoost stops
# boosting on a perfect fit, so the ensemble would collapse to a single tree.
base_classifier = DecisionTreeClassifier(max_depth=1, random_state=42)

# Boost up to 50 stumps; random_state keeps the ensemble reproducible
adaboost_classifier = AdaBoostClassifier(base_classifier, n_estimators=50, random_state=42)

# Fit the ensemble on the training split
adaboost_classifier.fit(X_train, y_train)

# Predict the species of the held-out rows
predictions = adaboost_classifier.predict(X_test)

In [None]:
# First fitted base tree of the AdaBoost ensemble
stump_classifier = adaboost_classifier.estimators_[0]

# Export the tree to Graphviz DOT text.
# Feature names come from the training frame X (the columns the model was fitted on)
# rather than from a positional slice of df; both name lists are passed as plain lists.
dot_data = export_graphviz(
    stump_classifier,
    out_file=None,
    feature_names=list(X.columns),
    class_names=list(np.unique(Y)),
    filled=True,
    rounded=True
)

# graphviz renders the DOT source itself, so no matplotlib figure is needed here
graph = graphviz.Source(dot_data)

graph

In [None]:
# Overall accuracy of the AdaBoost predictions on the held-out rows
accuracy = accuracy_score(y_test, predictions)
# Precision, recall and F1, each averaged over the classes weighted by class support
precision, recall, f1 = (
    scorer(y_test, predictions, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
# Report every metric as a percentage with three decimals
for metric_name, metric_value in (
    ("accuracy", accuracy),
    ("precision", precision),
    ("recall", recall),
    ("F1", f1),
):
    print(f"{metric_name}: {metric_value*100:.3f}")

## Confusion Matrix

In [None]:
# Fix the class order once and pass it to confusion_matrix explicitly, so the matrix
# rows/columns are guaranteed to line up with the labels used for the frame below
# (even if some class is missing from the test split).
class_labels = np.unique(df['species'])
conf_matrix = confusion_matrix(y_test, predictions, labels=class_labels)
conf_matrix = pd.DataFrame(conf_matrix, columns=class_labels, index=class_labels)
# Rows are the true species, columns the predicted species
ax = sns.heatmap(conf_matrix, annot=True, linewidth=.5)
ax.set(xlabel='Predicted species', ylabel='True species')

In [None]:
# classification_report takes (y_true, y_pred). Passing the predictions first swaps
# per-class precision and recall and reports support for the predicted labels
# instead of the true ones.
class_report = classification_report(y_test, predictions, target_names=np.unique(df['species']))
print("\n       ********** Classification Report **********\n\n", class_report)

## Observations:
* Using Petals over Sepal for training the data gives a much better accuracy.
* This was expected as we saw in the heatmap above that the correlation between the Sepal Width and Length was very low whereas the correlation between Petal Width and Length was very high.

Thus we have implemented some of the common machine learning algorithms. Since the dataset is small and has very few features, I didn't cover some concepts that only become relevant when there are many features.

I have compiled a notebook covering some advanced ML concepts using a larger dataset.

### Don't forget to upvote ^
## Thank You :)