# In [None]:
# ============================================================
# IRIS FLOWER CLASSIFICATION - OIBSIP TASK 1
# Author: Anshuman Agrawal
# ============================================================

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import kagglehub
import os
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# ============================================================
# DOWNLOAD DATASET FROM KAGGLE
# ============================================================

# Fetch the dataset through kagglehub; the returned value is used below as the
# local directory that holds the downloaded files.
path = kagglehub.dataset_download("saurabh00007/iriscsv")
print("Dataset downloaded to:", path)
print("Files:", os.listdir(path))

# Load the CSV from the downloaded folder. os.path.join is used instead of
# manual "/" concatenation so the path is built with the platform separator.
df = pd.read_csv(os.path.join(path, "IRIS.csv"))

# ============================================================
# BASIC EXPLORATION
# ============================================================

# Quick look at the loaded frame: first rows, dimensions, column summary,
# descriptive statistics, and the number of rows per species label.
print(df.head())
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print(df.describe())
print(df['species'].value_counts())

# ============================================================
# DATA VISUALIZATION
# ============================================================

# Pairwise scatter plots of every feature pair, coloured by species.
sns.pairplot(df, hue="species")
plt.show()

# Correlation heatmap. The frame still contains the string-valued "species"
# column, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0,
# so the correlation is computed on the numeric columns only.
plt.figure(figsize=(10,6))
numeric_features = df.select_dtypes(include="number")
sns.heatmap(numeric_features.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# Box plot of the frame's columns; x tick labels are rotated for readability.
plt.figure(figsize=(8,6))
sns.boxplot(data=df)
plt.title("Boxplot Distribution")
plt.xticks(rotation=45)
plt.show()

# ============================================================
# SPLITTING DATA
# ============================================================

# Features are every column except the target. Selecting by name (instead of
# "all but the last column") keeps X correct even if the column order changes,
# and matches how the target is selected below.
X = df.drop(columns="species")
y = df["species"]

# Hold out 20% of the rows for evaluation. stratify=y keeps the species
# proportions the same in both splits, so the small test set is not skewed
# towards one class; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ============================================================
# FEATURE SCALING
# ============================================================

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so the test rows never influence it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# ============================================================
# MODEL 1 — Logistic Regression
# ============================================================

# Train a logistic regression with default settings on the scaled training
# split, predict the held-out rows, and report the test accuracy.
lr = LogisticRegression().fit(X_train, y_train)
pred_lr = lr.predict(X_test)
print(
    "Logistic Regression Accuracy:",
    accuracy_score(y_test, pred_lr),
)

# ============================================================
# MODEL 2 — SVM
# ============================================================

# Train a linear-kernel support vector classifier on the scaled training
# split, predict the held-out rows, and report the test accuracy.
svm = SVC(kernel="linear").fit(X_train, y_train)
pred_svm = svm.predict(X_test)
print(
    "SVM Accuracy:",
    accuracy_score(y_test, pred_svm),
)

# ============================================================
# MODEL 3 — KNN
# ============================================================

# Train a 5-nearest-neighbours classifier on the scaled training split,
# predict the held-out rows, and report the test accuracy.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
pred_knn = knn.predict(X_test)
print(
    "KNN Accuracy:",
    accuracy_score(y_test, pred_knn),
)

# ============================================================
# COMPARE ACCURACIES
# ============================================================

# Print the test accuracy of the three fitted models side by side, in the
# order they were trained.
print("\n====== ACCURACY SUMMARY ======")
for summary_label, predictions in (
    ("Logistic Regression:", pred_lr),
    ("SVM:", pred_svm),
    ("KNN:", pred_knn),
):
    print(summary_label, accuracy_score(y_test, predictions))

# ============================================================
# SAVE BEST MODEL (SVM)
# ============================================================

# The SVM is the model chosen for persistence.
best_model = svm

# Write the pickle inside a context manager so the file handle is flushed and
# closed deterministically (the bare open() passed to pickle.dump was never
# closed explicitly).
# NOTE(review): only the classifier is saved. The StandardScaler fitted
# earlier is not persisted, so anything loading iris_model.pkl must apply the
# same scaling to its inputs before calling predict.
with open("iris_model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

print("\nModel saved successfully as iris_model.pkl")
