In [1]:


import os
import glob
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

cleaned_folder = "../data/cleaned"
cleaned_files = glob.glob(os.path.join(cleaned_folder, "*.csv"))

if not cleaned_files:
    raise FileNotFoundError("No cleaned CSV found. Run Notebook 03 first.")

latest_cleaned = max(cleaned_files, key=os.path.getmtime)
print(" Using cleaned dataset:", latest_cleaned)

df = pd.read_csv(latest_cleaned)
print("Shape:", df.shape)

# Detect target column

possible_targets = [c for c in df.columns if c.lower() in ["target", "label", "class", "species", "outcome"]]

if len(possible_targets) > 0:
    target = possible_targets[0]
else:
    # fallback: assume last column is target
    target = df.columns[-1]

print(" Target column detected:", target)

#type of model
if df[target].dtype in ['int64', 'float64']:
    task_type = "regression"
elif df[target].dtype == 'object' or df[target].dtype.name == 'category':
    task_type = "classification"
else:
    # numeric but only few unique values => classification
    if df[target].nunique() < 20:
        task_type = "classification"
    else:
        task_type = "regression"

print(" Task type detected:", task_type)


 Using cleaned dataset: ../data/cleaned\cleaned_dataset_20251127_225256.csv
Shape: (150, 5)
 Target column detected: Species
 Task type detected: classification


In [2]:
import sys
!{sys.executable} -m pip install scikit-learn==1.3.2





[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from sklearn.model_selection import train_test_split
print("sklearn import successful!")


sklearn import successful!


In [4]:

X = df.drop(columns=[target])
y = df[target]

# Train/test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # stratify keeps class balance
)

print("Training shape:", X_train.shape, y_train.shape)
print("Testing shape:", X_test.shape, y_test.shape)


Training shape: (120, 4) (120,)
Testing shape: (30, 4) (30,)


In [5]:


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


try:
    from xgboost import XGBClassifier
    xgb_available = True
except:
    xgb_available = False
    print(" XGBoost not installed. Skipping XGBClassifier.")

# Model dictionary
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=200),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(probability=True)
}

if xgb_available:
    models["XGBoost"] = XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=4,
        eval_metric="mlogloss"
    )

print(" Models available for training:")
models


 Models available for training:


{'LogisticRegression': LogisticRegression(max_iter=1000),
 'RandomForest': RandomForestClassifier(n_estimators=200),
 'KNN': KNeighborsClassifier(),
 'SVC': SVC(probability=True),
 'XGBoost': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='mlogloss',
               feature_types=None, feature_weights=None, gamma=None,
               grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=0.1, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=4, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=200, n_jobs=None,
               num_parallel_tree=None, ...)}

In [6]:
import sys
!{sys.executable} -m pip install xgboost





[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

results = {}           # model -> accuracy
trained_models = {}    # store fitted models

# Create label encoder for XGBoost only , since it needs 0,1,2 as inputs 
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

for name, model in models.items():
    print(f" Training {name}...")

    # XGBoost needs encoded labels
    if "XGBoost" in name:
        model.fit(X_train, y_train_enc)
        y_pred = model.predict(X_test)
        # Evaluate using encoded labels
        acc = accuracy_score(y_test_enc, y_pred)
    else:
        # Other models handle string labels fine
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

    results[name] = acc
    trained_models[name] = model

    print(f"    Accuracy: {acc:.4f}")

print("\n Model performance summary:")
results


 Training LogisticRegression...
    Accuracy: 0.9667
 Training RandomForest...
    Accuracy: 0.9333
 Training KNN...
    Accuracy: 1.0000
 Training SVC...
    Accuracy: 0.9667
 Training XGBoost...
    Accuracy: 0.9333

 Model performance summary:


{'LogisticRegression': 0.9666666666666667,
 'RandomForest': 0.9333333333333333,
 'KNN': 1.0,
 'SVC': 0.9666666666666667,
 'XGBoost': 0.9333333333333333}

In [8]:

import pickle
import datetime

# Finding best model based on accuracy
best_model_name = max(results, key=results.get)
best_model = trained_models[best_model_name]
best_score = results[best_model_name]

print(f" Best Model: {best_model_name} (Accuracy: {best_score:.4f})")

# Create timestamped filename
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
model_path = f"../results/models/{best_model_name}_{timestamp}.pkl"

# Saving as pkl to later load in phase 2
with open(model_path, "wb") as f:
    pickle.dump(best_model, f)

print(" Model saved at:")
print(model_path)


 Best Model: KNN (Accuracy: 1.0000)
 Model saved at:
../results/models/KNN_20251202_223130.pkl


In [9]:
# overall report for all models :-

import json
import datetime

training_summary = {
    "task_type": task_type,
    "target_column": target,
    "model_performance": results,
    "best_model": best_model_name,
    "best_score": best_score,
}

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
log_path = f"../results/logs/training_summary_{timestamp}.json"

# Save log
with open(log_path, "w") as f:
    json.dump(training_summary, f, indent=4)

print(" Training summary saved at:")
print(log_path)


 Training summary saved at:
../results/logs/training_summary_20251202_223130.json


In [10]:

import numpy as np
import pandas as pd

feature_importance = {}

for name, model in trained_models.items():
    try:
        # RandomForest, XGBoost
        if hasattr(model, "feature_importances_"):
            importances = model.feature_importances_
            feature_importance[name] = dict(zip(X.columns, importances))
            print(f" Feature importance extracted for: {name}")
    except:
        pass

# If no model supports feature importance
if len(feature_importance) == 0:
    print(" No models with feature importance available.")

feature_importance


 Feature importance extracted for: RandomForest
 Feature importance extracted for: XGBoost


{'RandomForest': {'SepalLengthCm': 0.12426685779780329,
  'SepalWidthCm': 0.01867383268253318,
  'PetalLengthCm': 0.4443348638796532,
  'PetalWidthCm': 0.4127244456400104},
 'XGBoost': {'SepalLengthCm': 0.021314733,
  'SepalWidthCm': 0.022769462,
  'PetalLengthCm': 0.519456,
  'PetalWidthCm': 0.43645984}}

In [11]:

import json
import datetime

# Convert all numpy.float32 â†’ python float
fi_cleaned = {}

for model_name, fi_dict in feature_importance.items():
    fi_cleaned[model_name] = {k: float(v) for k, v in fi_dict.items()}

# Save JSON
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
fi_path = f"../results/models/feature_importance_{timestamp}.json"

with open(fi_path, "w") as f:
    json.dump(fi_cleaned, f, indent=4)

print(" Feature importance saved at:")
print(fi_path)


 Feature importance saved at:
../results/models/feature_importance_20251202_223130.json
