# Iris Flower Classification Project

In [None]:
# Blueprint to be followed:
# 1. Import libraries => Will be doing it as we go along
# 2. Load the dataset
# 3. Preprocessing: drop first column

# 4. EDA
    # → Histograms
    # → Pairplot
    # → Covariance matrix

# 5. Train-test split (use sklearn's train_test_split instead)

# 6. Model 1: Logistic Regression
    # → Train
    # → Predict
    # → Accuracy
    # → Confusion matrix

# 7. Model 2 (Optional): Try KNN, SVM

# 8. Summary or comparison

# 9. Conclusion


## Reading the CSV File & Dropping the First Column

In [1]:
import pandas as pd

# Load the CSV and discard its first column by position in one chain.
# NOTE(review): the first column is presumably a row-id column — confirm against Iris.csv.
df = pd.read_csv('Iris.csv').pipe(
    lambda frame: frame.drop(columns=frame.columns[0])
)

# Preview the first rows of the remaining columns
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## EDA (Exploratory Data Analysis)

### Histograms

### PairPlot

### Covariance Matrix

## Splitting the data into Train and Test

In [2]:
# Manual, position-based split (no shuffling). Rows at position 100 and beyond
# are not placed in either set.

# Training data: rows 0–39 and rows 50–89 (80 rows in total)
train_df = pd.concat(
    (df.iloc[:40], df.iloc[50:90]),
    ignore_index=True,  # renumber the combined rows from 0
)

# Test data: rows 40–49 and rows 90–99 (20 rows in total)
test_df = pd.concat(
    (df.iloc[40:50], df.iloc[90:100]),
    ignore_index=True,  # renumber the combined rows from 0
)


## Model 1: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Separate features and target
X_train = train_df.iloc[:, :-1]  # All columns except the last
y_train = train_df.iloc[:, -1]   # Last column as target

# Create and fit the logistic regression model
# (max_iter=200 raises the solver's iteration cap above the library default)
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Get parameters (theta)
theta = model.coef_
intercept = model.intercept_

# Display the fitted parameters so that a fresh Restart & Run All reproduces
# the output recorded below this cell.
print("Theta (coefficients):", theta)
print("Intercept:", intercept)

Theta (coefficients): [[ 0.46658794 -0.78047597  2.18189051  0.8804814 ]]
Intercept: [-6.86848547]


### Testing the Model on the Test Set

In [8]:
# Features are every column but the last; the last column holds the true labels
X_test, y_test = test_df.iloc[:, :-1], test_df.iloc[:, -1]

# Run the fitted model on the held-out features
y_pred = model.predict(X_test)

# Side-by-side table of predicted vs. actual labels.
# `.values` strips y_test's index so both columns line up by position.
predictions_table = pd.DataFrame(
    {'Predicted': y_pred, 'Actual': y_test.values}
)

# Count rows where the prediction differs from the true label
num_mismatches = predictions_table['Predicted'].ne(predictions_table['Actual']).sum()
print(f"Number of mismatches: {num_mismatches}")
print(predictions_table)

Number of mismatches: 0
          Predicted           Actual
0       Iris-setosa      Iris-setosa
1       Iris-setosa      Iris-setosa
2       Iris-setosa      Iris-setosa
3       Iris-setosa      Iris-setosa
4       Iris-setosa      Iris-setosa
5       Iris-setosa      Iris-setosa
6       Iris-setosa      Iris-setosa
7       Iris-setosa      Iris-setosa
8       Iris-setosa      Iris-setosa
9       Iris-setosa      Iris-setosa
10  Iris-versicolor  Iris-versicolor
11  Iris-versicolor  Iris-versicolor
12  Iris-versicolor  Iris-versicolor
13  Iris-versicolor  Iris-versicolor
14  Iris-versicolor  Iris-versicolor
15  Iris-versicolor  Iris-versicolor
16  Iris-versicolor  Iris-versicolor
17  Iris-versicolor  Iris-versicolor
18  Iris-versicolor  Iris-versicolor
19  Iris-versicolor  Iris-versicolor


### Testing Model Accuracy

In [7]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 1.00
