In [1]:
# ===============================
# Step 1: Import Required Libraries
# ===============================

# Pandas: for handling dataset
import pandas as pd  

# Numpy: for numerical operations like arrays
import numpy as np  

# scikit-learn (sklearn): machine learning library
from sklearn.model_selection import train_test_split   # to split dataset into train and test
from sklearn.preprocessing import StandardScaler       # to scale/normalize features
from sklearn.linear_model import LogisticRegression    # our ML model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # evaluation tools

# ===============================
# Step 2: Load Dataset
# ===============================

# The iris table (150 flowers) is read as CSV straight from the seaborn-data
# repository. It carries four measurement columns -- sepal_length, sepal_width,
# petal_length, petal_width -- used as features, plus a `species` column that
# serves as the label.
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Heading line followed by a preview of the first five rows.
print("First 5 rows of the dataset:", data.head(), sep="\n")

# ===============================
# Step 3: Split Features (X) and Target (y)
# ===============================

# Target (output): the species label of each flower.
y = data["species"]

# Features (inputs): every remaining column, i.e. the sepal/petal measurements.
X = data.drop(columns=["species"])

print("\nFeature columns:", list(X.columns))
print("Target classes:", y.unique())

# ===============================
# Step 4: Train-Test Split
# ===============================

# Hold back part of the data so the model is scored on rows it never saw
# during training.
# - test_size=0.2   -> 20% of the rows go to the test set, 80% to training
# - random_state=42 -> fixed seed, so the split is the same on every run
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("\nTraining samples:", len(X_train))
print("Testing samples:", len(X_test))

# ===============================
# Step 5: Feature Scaling
# ===============================

# Logistic regression is fitted by numerical optimization, which behaves
# better when all features share a comparable scale. StandardScaler rescales
# each feature to mean 0 and standard deviation 1.
#
# The scaling statistics are learned from the training rows only; the test
# rows are transformed with those same statistics so that no information
# from the test set leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ===============================
# Step 6: Train Logistic Regression Model
# ===============================

# Despite the word "regression" in its name, logistic regression is a
# classifier: it estimates class probabilities and predicts a category.
#
# max_iter=200 sets the upper bound on solver iterations. It is a cap, not a
# guarantee -- scikit-learn emits a ConvergenceWarning if the solver has not
# converged by then.
model = LogisticRegression(max_iter=200).fit(X_train_scaled, y_train)

print("\nModel training complete!")

# ===============================
# Step 7: Evaluate Model on Test Data
# ===============================

# Predict labels for the held-out (scaled) test rows.
y_pred = model.predict(X_test_scaled)

# Compute every metric first, then print them together.
test_accuracy = accuracy_score(y_test, y_pred)        # fraction of correct predictions (0.0-1.0)
test_report = classification_report(y_test, y_pred)   # per-class precision / recall / f1 / support
test_confusion = confusion_matrix(y_test, y_pred)     # rows = true class, columns = predicted class

print("\nModel Evaluation:")
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)
print("Confusion Matrix:\n", test_confusion)

# ===============================
# Step 8: Make Predictions on New Samples
# ===============================

# Example new flowers (unseen data).
# Each row = [sepal_length, sepal_width, petal_length, petal_width]
sample_data = np.array([
    [4.9, 3.0, 1.2, 0.1],   # measurements typical of setosa
    [6.7, 3.1, 4.7, 1.5],   # measurements typical of versicolor
    [5.8, 2.7, 5.1, 1.9]    # measurements typical of virginica
])

# The scaler was fitted on a DataFrame, so it remembers the feature names.
# Handing it a bare NumPy array makes scikit-learn warn
# "X does not have valid feature names". Label the columns with the same
# names (and order) as the training features before transforming.
sample_frame = pd.DataFrame(sample_data, columns=X.columns)

# Scale the new samples with the statistics learned from the training data.
sample_scaled = scaler.transform(sample_frame)

# Predict species (the model itself was trained on the scaled NumPy array,
# so passing the scaled array here is consistent).
predictions = model.predict(sample_scaled)

print("\nPredictions for new samples:", predictions)

# ===============================
# Step 9: Predict a Single Sample
# ===============================

# A single flower must still be 2-D (1 row x 4 features), hence the reshape.
single_sample = np.array([5.1, 3.5, 1.4, 0.2]).reshape(1, -1)

# The scaler was fitted on a DataFrame and remembers its feature names;
# transforming a bare array triggers scikit-learn's
# "X does not have valid feature names" warning. Attach the training
# column names (same order as the features) before scaling.
single_frame = pd.DataFrame(single_sample, columns=X.columns)
single_sample_scaled = scaler.transform(single_frame)

# predict() returns an array with one label per row; take the only entry.
single_prediction = model.predict(single_sample_scaled)

print("\nPrediction for single sample [5.1, 3.5, 1.4, 0.2]:", single_prediction[0])


First 5 rows of the dataset:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa

Feature columns: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
Target classes: ['setosa' 'versicolor' 'virginica']

Training samples: 120
Testing samples: 30

Model training complete!

Model Evaluation:
Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

