In [69]:
# 1. Import necessary libraries
import pandas as pd  # for data handling
from sklearn.model_selection import train_test_split  # to split data
from sklearn.naive_bayes import GaussianNB  # Naive Bayes classifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,classification_report

In [3]:
# 2. Read the Iris measurements from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="iris.csv")

In [5]:
# Preview the first five rows to check that the file was parsed as expected.
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [11]:
# Summary statistics for every column.
# With include="all", describe() reports on numeric and non-numeric (categorical)
# columns alike; a plain df.describe() would cover only the numeric columns.
df.describe(include="all")

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
count,150.0,150.0,150.0,150.0,150.0,150
unique,,,,,,3
top,,,,,,Iris-setosa
freq,,,,,,50
mean,75.5,5.843333,3.054,3.758667,1.198667,
std,43.445368,0.828066,0.433594,1.76442,0.763161,
min,1.0,4.3,2.0,1.0,0.1,
25%,38.25,5.1,2.8,1.6,0.3,
50%,75.5,5.8,3.0,4.35,1.3,
75%,112.75,6.4,3.3,5.1,1.8,


In [13]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [15]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(150, 6)

In [19]:
# Distinct class labels present in the target column.
df.Species.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [21]:
# Number of missing values per column (isna is the same check as isnull).
df.isna().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [23]:
# Look at the first five rows again before choosing feature and target columns.
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [25]:
# 3. Select the four measurement columns as features and Species as the target.
# Columns are picked by name rather than by position: a positional slice such as
# df.iloc[:, 1:5] would silently select the wrong columns if the CSV layout changed
# (for example if the Id column were missing), whereas a missing name raises KeyError.
feature_columns = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
X = df[feature_columns]   # DataFrame holding the four numeric measurements
y = df["Species"]         # Series holding the class label of each row

In [105]:
# 4. Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [107]:
# 5. Instantiate a Gaussian Naive Bayes classifier and fit it on the training split
naive_bayes = GaussianNB()
naive_bayes.fit(X=X_train, y=y_train)

In [109]:
# 6. Predict the species of every row in the held-out test split
y_pred = naive_bayes.predict(X=X_test)

In [111]:
# 10. Compute the confusion matrix
# The labels argument fixes the row/column order of the matrix.
cm = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    labels=['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'],
)
print("Confusion Matrix:\n", cm)  # show the matrix below a heading line


Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


The `labels` argument specifies the order of the class labels when generating the confusion matrix.
When we use:
confusion_matrix(y_test, y_pred, labels=['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
We are telling the confusion matrix to arrange rows and columns in this order:
Iris-setosa
Iris-versicolor
Iris-virginica
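Each row corresponds to the actual class and each column to the predicted class, so the matrix shows how many samples of each class were:
Correctly predicted (diagonal).
Incorrectly predicted as another class (off-diagonal).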
Example Interpretation:
If you see this output:
Confusion Matrix:
[[10  0  0]
 [ 0  9  1]
 [ 0  2  8]]
That means:
Setosa: All 10 were predicted correctly.
Versicolor: 9 correct, 1 wrongly predicted as Virginica.
Virginica: 8 correct, 2 wrongly predicted as Versicolor.



In [114]:
# 11. Compute one-vs-rest counts for the positive class from the confusion matrix
# cm is a 3x3 multiclass matrix (rows = actual class, columns = predicted class), so the
# counts must include every "other" class. Reading single cells as if cm were a 2x2
# binary matrix would ignore all samples that involve the third class.
# Index 1 is kept as the positive class; with the label order passed to
# confusion_matrix above, that is 'Iris-versicolor'.
positive_idx = 1
TP = cm[positive_idx, positive_idx]      # True Positive: actual positive, predicted positive
FP = cm[:, positive_idx].sum() - TP      # False Positive: predicted positive, actually another class
FN = cm[positive_idx, :].sum() - TP      # False Negative: actual positive, predicted as another class
TN = cm.sum() - TP - FP - FN             # True Negative: neither actual nor predicted positive

In [116]:
# 12. Accuracy: fraction of test rows whose predicted label equals the actual label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


In [118]:
# 13. Error rate: the complement of accuracy, i.e. the misclassified fraction
error_rate = 1.0 - accuracy
print(f"Error Rate: {error_rate}")

Error Rate: 0.0


In [120]:
# 14. Precision
# Macro averaging: precision (TP / (TP + FP)) is computed for each class separately and the
# per-class values are then averaged with equal weight, regardless of how many items each class has.
precision = precision_score(y_true=y_test, y_pred=y_pred, average="macro")
print(f"Precision: {precision}")

Precision: 1.0


In [122]:
# 15. Recall
# Macro-averaged recall: TP / (TP + FN) per class, then the unweighted mean over classes.
recall = recall_score(y_true=y_test, y_pred=y_pred, average="macro")
print(f"Recall: {recall}")

Recall: 1.0


In [124]:
# Per-class precision, recall, F1 and support, followed by the averaged rows.
print(classification_report(y_true=y_test, y_pred=y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      1.00      1.00         9
 Iris-virginica       1.00      1.00      1.00        11

       accuracy                           1.00        30
      macro avg       1.00      1.00      1.00        30
   weighted avg       1.00      1.00      1.00        30



In [126]:
# Manual metrics computed from the TP / TN / FP / FN counts defined in an earlier cell.
# NOTE: this cell overwrites the `precision`, `recall` and `accuracy` values that the
# earlier cells obtained from scikit-learn.
# Every ratio falls back to 0.0 when its denominator is zero (e.g. the positive class is
# never predicted, or never occurs in the test split) instead of producing NaN / an error.
predicted_positive = TP + FP      # everything the model labelled as positive
actual_positive = TP + FN         # everything that really is positive
total_counted = TP + TN + FP + FN

# Precision: share of positive predictions that are correct
precision = TP / predicted_positive if predicted_positive else 0.0

# Recall: share of actual positives that were found
recall = TP / actual_positive if actual_positive else 0.0

# F1 Score: harmonic mean of precision and recall
pr_sum = precision + recall
f1_score = 2 * (precision * recall) / pr_sum if pr_sum else 0.0

# Accuracy: share of all counted samples that were classified correctly
accuracy = (TP + TN) / total_counted if total_counted else 0.0

# Print metrics
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Accuracy: {accuracy}")

Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Accuracy: 1.0
