<a href="https://colab.research.google.com/github/EleniHaylemeskel/Week-3-AI-Tools-Assignment/blob/main/iris_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# PART 2 - TASK 1: Classical Machine Learning with Scikit-learn
# Dataset: Iris Species
# Goal: Preprocess → Train Decision Tree → Evaluate


# --- Import Libraries ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix

# --- Step 1: Load Dataset ---
# Read the CSV, discard every column whose header starts with "Unnamed",
# and trim surrounding whitespace from the remaining column names.
df = (
    pd.read_csv("Iris.csv")
    .loc[:, lambda frame: ~frame.columns.str.contains("^Unnamed")]
    .rename(columns=str.strip)
)

# Confirmation message followed by a preview of the first five rows.
print("Dataset Loaded Successfully!", df.head(), sep="\n")

# --- Step 2: Check & Handle Missing Values ---
# Report the number of nulls per column before any imputation.
print("\nMissing Values:\n", df.isnull().sum())

# Fill missing numeric values with the column median (if any).
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col in num_cols:
    if df[col].isnull().any():
        # Assign the filled Series back to the frame. Calling
        # `df[col].fillna(..., inplace=True)` operates on an intermediate
        # object: pandas 2.x warns about it, and with Copy-on-Write enabled
        # the original DataFrame is left unchanged.
        df[col] = df[col].fillna(df[col].median())
        print(f"Filled missing values in column: {col}")

# --- Step 3: Encode Label Column ---
# Learn the set of class names from the target column, then overwrite that
# column with the corresponding integer codes. The fitted encoder is kept in
# `le` so the original class names remain available via `le.classes_`.
label_col = "Species"
le = LabelEncoder().fit(df[label_col])
df[label_col] = le.transform(df[label_col])

print("\nLabel Encoding Completed!")
print("Classes:", le.classes_)

# --- Step 4: Prepare Features and Labels ---
# Every column other than the target ("Species") and the row identifier
# ("Id", skipped only when it exists) is used as a model input.
feature_cols = [name for name in df.columns if name != "Species" and name != "Id"]

# X: feature matrix in original column order; y: encoded target column.
X = df.loc[:, feature_cols]
y = df.loc[:, label_col]

# --- Step 5: Split Train & Test ---
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

print("\nData Split Completed!")
print("Training Shape:", X_train.shape)
print("Test Shape:", X_test.shape)

# --- Step 6: Train the Decision Tree Classifier ---
# `fit` returns the estimator itself, so construction and training can be
# chained. The fixed random_state keeps the fitted tree reproducible.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

print("\nModel Training Completed!")

# --- Step 7: Predictions ---
# Predicted (integer-encoded) class for every held-out row.
y_pred = clf.predict(X_test)

# --- Step 8: Evaluation ---
# Overall accuracy, plus precision and recall averaged with equal weight per
# class (average="macro").
accuracy = accuracy_score(y_test, y_pred)
precision, recall = (
    metric(y_test, y_pred, average="macro")
    for metric in (precision_score, recall_score)
)

# Headline numbers, one per line.
print(
    "\n=== MODEL PERFORMANCE ===",
    f"Accuracy: {accuracy:.4f}",
    f"Precision (Macro): {precision:.4f}",
    f"Recall (Macro): {recall:.4f}",
    sep="\n",
)

# Per-class breakdown, labelled with the original class names from the encoder.
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=le.classes_),
    sep="\n",
)

# Matrix of true labels (first argument) against predicted labels.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

print("\nTask 1 Completed Successfully! ðŸŽ‰")

Dataset Loaded Successfully!
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa

Missing Values:
 Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

Label Encoding Completed!
Classes: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']

Data Split Completed!
Training Shape: (120, 4)
Test Shape: (30, 4)

Model Training Completed!

=== MODEL PERFORMANCE ===
Accuracy: 0.9333
Precision (Macro): 0.9333
Recall (Macro): 0.9333

Classification Report:
                 precision    recall  f1-score   support

    Iri