<a href="https://colab.research.google.com/github/Anngladys/AGPLP/blob/main/Classical_ML_with_Scikit_learn_(Iris_Species_Dataset).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import numpy as np

# --- 1. Load the Iris Species Dataset ---
# The Iris dataset is bundled with scikit-learn, so nothing has to be downloaded.
# It contains 150 samples of iris flowers, with 4 features and 3 possible species.
iris = load_iris()

# Wrap the feature matrix ('data') in a DataFrame so that every column carries
# its readable name from 'feature_names'.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Store the target in two forms:
# - 'species':      the numerical class labels (0, 1, 2) taken from 'target'.
# - 'species_name': the matching names (setosa, versicolor, virginica).
df['species'] = iris.target
# Index the name array with the whole label array in one vectorized step rather
# than calling a Python lambda once per row; the column holds the same strings.
df['species_name'] = iris.target_names[iris.target]

print("--- Dataset Loaded Successfully ---")
print("First 5 rows of the dataset:")
print(df.head())
print("\nDataset Info:")
df.info()  # info() writes its summary to stdout itself, so no print() wrapper

# --- 2. Preprocess the Data ---

# A. Handle Missing Values:
# Iris normally ships without gaps, but real-world data should always be
# checked before modelling, so the per-column null counts are reported here.
print("\n--- Checking for Missing Values ---")
missing_per_column = df.isnull().sum()
print(missing_per_column)
# Typical remedies when gaps do show up:
# - Imputation (e.g., df.fillna(df.mean(), inplace=True) for numerical)
# - Removing rows/columns (e.g., df.dropna(inplace=True))
if missing_per_column.sum() != 0:
    print("Missing values found. Please handle them before proceeding.")
else:
    print("No missing values found in the dataset. Good to go!")


# B. Encode Labels (if not already numerical):
# No encoding step is needed: 'species' already holds the integer labels 0, 1, 2.
# A string-valued target would first go through LabelEncoder or OneHotEncoder.
y = df['species']  # Target: the numerical species label
X = df.drop(columns=['species', 'species_name'])  # Features: every remaining column

print("\nFeatures (X) shape:", X.shape)
print("Target (y) shape:", y.shape)
print("Target classes:", iris.target_names)


# --- 3. Split Data into Training and Testing Sets ---
# Holding out a test set lets the model be scored on samples it never saw.
# - test_size=0.3   -> 30% of the rows go to the test set, 70% to training.
# - random_state=42 -> the same split is produced on every run.
# - stratify=y      -> the split is stratified on the class labels in y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")
# Relative class frequencies in each partition.
print(f"Distribution of classes in training set:\n{y_train.value_counts(normalize=True)}")
print(f"Distribution of classes in test set:\n{y_test.value_counts(normalize=True)}")


# --- 4. Train a Decision Tree Classifier ---
# Build the classifier. Capping the depth is a common guard against
# overfitting on small datasets; the fixed seed keeps the fitted tree repeatable.
dt_classifier = DecisionTreeClassifier(max_depth=5, random_state=42)

print("\n--- Training Decision Tree Classifier ---")
# Fit on the training partition only; the test partition stays untouched.
dt_classifier.fit(X=X_train, y=y_train)
print("Decision Tree Classifier trained successfully.")


# --- 5. Evaluate the Model ---
# Make predictions on the held-out test set.
y_pred = dt_classifier.predict(X_test)

print("\n--- Model Evaluation ---")

# A. Accuracy: Proportion of correctly classified samples.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# B. Precision: Ability of the classifier not to label as positive a sample that is negative.
# With more than two classes, the `average` argument decides how the per-class
# scores are combined:
# - 'macro'    computes the metric for each label and takes the unweighted mean.
# - 'weighted' computes the metric for each label and averages weighted by support.
# - None       (the Python object, not the string 'none') returns the score of
#              each class separately instead of a single number.
precision = precision_score(y_test, y_pred, average='weighted')
print(f"Precision (weighted): {precision:.4f}")

# C. Recall: Ability of the classifier to find all the positive samples.
# The same 'weighted' averaging is used so the two numbers are comparable.
recall = recall_score(y_test, y_pred, average='weighted')
print(f"Recall (weighted): {recall:.4f}")

# D. Classification Report: Provides a comprehensive report of precision, recall,
# f1-score, and support for each class; target_names labels the rows with the
# species names instead of 0/1/2.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# --- Optional: Visualize the Decision Tree (requires graphviz) ---
# This part is commented out by default as it requires additional installations
# If you want to visualize, uncomment the lines below and run:
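# pip install graphviz
# (The graphviz Python package only wraps the Graphviz system binaries, so those
#  must be installed as well, e.g. `apt-get install graphviz` or `brew install graphviz`.
#  export_graphviz itself ships with scikit-learn; there is no `scikit-learn[tree]` extra.)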
# import graphviz
# from sklearn.tree import export_graphviz
#
# dot_data = export_graphviz(dt_classifier, out_file=None,
#                            feature_names=iris.feature_names,
#                            class_names=iris.target_names,
#                            filled=True, rounded=True,
#                            special_characters=True)
# graph = graphviz.Source(dot_data)
# graph.render("iris_decision_tree", view=True) # Saves to a file and opens it

# Closing banner: both lines are emitted by one call, separated by a newline.
print(
    "\n--- Analysis Complete ---",
    "This script demonstrates data preprocessing, decision tree training, and evaluation for the Iris dataset.",
    sep="\n",
)

--- Dataset Loaded Successfully ---
First 5 rows of the dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   species species_name  
0        0       setosa  
1        0       setosa  
2        0       setosa  
3        0       setosa  
4        0       setosa  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    f