In [None]:
"""
IRIS ML PROJECT — FULL PIPELINE
--------------------------------
1. Problem Definition
2. Load Data
3. Basic EDA
4. Advanced EDA (visual)
5. Data Cleaning
6. Label Encoding
7. Train-Test Split
8. Model Training (4 Models)
9. Model Comparison
10. Feature Importance
11. Final Prediction Function (User Input)
12. Project Insights
"""

: 

In [None]:
# 1. Problem Definition

"""
Goal:
    Given flower measurements (sepal length, sepal width, petal length, petal width),
    predict the species (Setosa, Versicolor, Virginica).

Type:
    Multiclass Classification (3 classes)

Input Features (X):
    - SepalLengthCm
    - SepalWidthCm
    - PetalLengthCm
    - PetalWidthCm

Target (y):
    Species

Success Metric:
    Accuracy (primary)
    Confusion Matrix + Classification Report (secondary)
"""

In [None]:
# 2. Load Dataset

import pandas as pd

# the notebook runs from the workspace root, so the CSV lives under data/
# original path was incorrect and caused FileNotFoundError

# Read the Iris CSV into a DataFrame; the path is relative to the workspace root.
df = pd.read_csv(filepath_or_buffer="data/Iris.csv")

# Preview the first five rows as this cell's output.
df.head(n=5)

In [None]:
# 3. Basic EDA
#
# A notebook cell only renders its *last* bare expression, so the intermediate
# summaries are printed explicitly; otherwise they are computed and discarded.

print("Shape (rows, columns):", df.shape)
print("Columns:", list(df.columns))
df.info()                       # writes dtypes and non-null counts to stdout itself
print(df.describe())            # statistical summary of the numeric columns

df['Species'].value_counts()    # class distribution (rendered as the cell output)

In [None]:
# 4.1 Histograms
import matplotlib.pyplot as plt
# One histogram per column that pandas can plot, on a 10x8 inch figure.
df.hist(figsize=(10, 8))
plt.show()

# 4.2 Key Scatter Plot
# Petal length against petal width; each point is coloured by the integer
# category code of its species.
plt.scatter(
    x=df['PetalLengthCm'],
    y=df['PetalWidthCm'],
    c=df['Species'].astype('category').cat.codes,
)
plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.show()

In [None]:
# 4.3 Pairplot
import seaborn as sns
# Pairwise scatter/distribution grid over the frame's columns, coloured by species.
sns.pairplot(data=df, hue="Species")

In [None]:
# 4.4 Correlation Heatmap
# The frame still holds the string 'Species' column at this point, and
# pandas >= 2.0 raises ValueError when DataFrame.corr() meets non-numeric data.
# Restrict to numeric columns first so the cell runs on any pandas version.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, cmap='coolwarm')

In [None]:
# 5. Data Cleaning

# Remove the 'Id' column so it cannot end up among the model features.
# errors='ignore' makes the cell idempotent: re-running it after 'Id' is already
# gone is a no-op instead of a KeyError.
df = df.drop(columns=['Id'], errors='ignore')
df.head()

In [None]:
# 6. Label Encoding

from sklearn.preprocessing import LabelEncoder
# Encode the string species labels as integers in a new 'target' column, then
# remove the original text column.
# The guard keeps the cell idempotent: once 'Species' has been dropped, a re-run
# leaves the existing 'target' column and the already-fitted encoder untouched
# instead of failing with a KeyError on df['Species'].
if 'Species' in df.columns:
    le = LabelEncoder()
    df['target'] = le.fit_transform(df['Species'])
    df = df.drop(columns=['Species'])

df.head()

In [None]:
# 7. Train-Test Split

from sklearn.model_selection import train_test_split

# Features are every column except the encoded label; the label is the target.
X = df.drop(columns=['target'])
y = df['target']

# Hold out 20% of the rows for testing.
# stratify=y keeps the class proportions the same in both partitions; with a
# dataset this small, an unstratified split can leave a class under-represented
# in the test set and skew the reported accuracy.
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# 8. Train multiple ML models

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Candidate classifiers, keyed by the display name used in the comparison.
# The tree-based models are randomised, so they get a fixed random_state (the
# same value the train/test split uses). Without it, accuracies and feature
# importances change on every Restart & Run All.
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# Fit every model in place on the training partition.
for name, model in models.items():
    model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 9. Compare accuracy of all models

# Test-set accuracy of every fitted model, keyed by model name.
results = {
    name: accuracy_score(y_test, model.predict(X_test))
    for name, model in models.items()
}

# Show the accuracy table and pick the top scorer (the first one wins on ties).
print(results)
best = max(results, key=lambda model_name: results[model_name])
print("Best model:", best, "with accuracy", results[best])

# Detailed diagnostics for the best model: accuracy, confusion matrix and
# per-class precision/recall report.
y_pred = models[best].predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
for summary in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(summary)

In [None]:
# 10. Feature Importance

# Importance scores exposed by the fitted Random Forest, one per input column.
rf = models["Random Forest"]
importances = rf.feature_importances_

# Print one "feature: score" line per column, in column order.
for idx, feat in enumerate(X.columns):
    print(f"{feat}: {importances[idx]}")

In [None]:
# Bar plot: one bar per feature column, bar height = Random Forest importance.
ax = plt.gca()
ax.bar(X.columns, importances)
ax.set_title("Feature Importance (Random Forest)")
plt.show()

In [None]:
# 11. Prediction Function

import numpy as np

def predict_species(sl, sw, pl, pw, model=None):
    """Predict the Iris species for a single flower.

    Parameters
    ----------
    sl, sw, pl, pw : float
        Sepal length, sepal width, petal length and petal width, passed to the
        model in that order.
    model : object with a ``predict`` method, optional
        Fitted classifier to use. Defaults to ``models["Random Forest"]`` from
        the notebook, which is what this function always used before the
        parameter existed.

    Returns
    -------
    str
        ``"Setosa"``, ``"Versicolor"`` or ``"Virginica"``.

    Raises
    ------
    KeyError
        If the model predicts a label other than 0, 1 or 2.
    """
    if model is None:
        model = models["Random Forest"]

    sample = np.array([[sl, sw, pl, pw]])

    # Estimators fitted on a DataFrame remember its column names and warn when
    # later given a bare array. Re-attach those names so the input matches what
    # the model was trained on.
    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is not None:
        sample = pd.DataFrame(sample, columns=list(feature_names))

    pred = model.predict(sample)[0]

    # Maps the integer codes written to the 'target' column back to readable names.
    species_map = {0: "Setosa", 1: "Versicolor", 2: "Virginica"}
    return species_map[pred]

# Example test
predict_species(5.1, 3.5, 1.4, 0.2)

In [None]:
"""
INSIGHTS:

1. Petal length & petal width are the strongest features.
2. Dataset is clean & perfectly balanced.
3. The best-performing model is usually:
      RandomForest or SVM (95%–98% accuracy)
4. Logistic Regression also performs well because the dataset is clean.
5. Iris is a perfect beginner dataset to understand ML end-to-end.
6. You now have:
    - Full pipeline
    - EDA
    - Model comparison
    - Feature importance
    - Prediction function
    - Professional project structure
"""