In [18]:


import pandas as pd
import numpy as np
import os
import glob

print(" Loading dataset...")

# Change this path to load any dataset
raw_data_path = "../data/raw/raw_dataset.csv"

# For demonstration: if no raw dataset, use latest cleaned one
if not os.path.exists(raw_data_path):
    print(" Raw dataset not found. Using latest cleaned dataset instead.")
    cleaned_files = glob.glob("../data/cleaned/*.csv")
    raw_data_path = max(cleaned_files, key=os.path.getmtime)

df = pd.read_csv(raw_data_path)

print(" Dataset loaded:", raw_data_path)
print("Shape:", df.shape)
df.head()


 Loading dataset...
 Raw dataset not found. Using latest cleaned dataset instead.
 Dataset loaded: ../data/cleaned\cleaned_dataset_20251127_225256.csv
Shape: (150, 5)


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-Setosa
1,4.9,3.0,1.4,0.2,Iris-Setosa
2,4.7,3.2,1.3,0.2,Iris-Setosa
3,4.6,3.1,1.5,0.2,Iris-Setosa
4,5.0,3.6,1.4,0.2,Iris-Setosa


In [19]:

import re

def detect_column_types(df):
    types = {
        "numeric": [],
        "categorical": [],
        "datetime": []
    }
    
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            types["numeric"].append(col)
        elif pd.api.types.is_datetime64_any_dtype(df[col]):
            types["datetime"].append(col)
        else:
            types["categorical"].append(col)
    
    return types


def detect_id_columns(df):
    id_cols = []
    for col in df.columns:
        # safe ID detection
        if re.fullmatch(r".*(id|Id|ID)$", col):
            id_cols.append(col)
        # to detect columns with all unique values
        elif df[col].nunique() == len(df[col]):
            id_cols.append(col)
    return id_cols


def detect_target_column(df):
    possible = ["target", "label", "class", "species", "outcome"]
    for col in df.columns:
        if col.lower() in possible:
            return col
    return df.columns[-1]  # fallback: last column


column_types = detect_column_types(df)
id_columns = detect_id_columns(df)
target = detect_target_column(df)

schema = {
    "column_types": column_types,
    "id_columns": id_columns,
    "target": target
}

print(" Schema detected:")
schema


 Schema detected:


{'column_types': {'numeric': ['SepalLengthCm',
   'SepalWidthCm',
   'PetalLengthCm',
   'PetalWidthCm'],
  'categorical': ['Species'],
  'datetime': []},
 'id_columns': [],
 'target': 'Species'}

In [20]:


target_col = schema["target"]

# 1. Drop ID columns
df_clean = df.drop(columns=schema["id_columns"], errors="ignore")

# 2. Fix numeric types
for col in schema["column_types"]["numeric"]:
    df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce")

# 3. Fix categorical types (EXCEPT target)
for col in schema["column_types"]["categorical"]:
    if col != target_col:
        df_clean[col] = df_clean[col].astype(str)

# 4. Impute missing values
for col in df_clean.columns:
    if df_clean[col].isnull().sum() > 0:
        if col in schema["column_types"]["numeric"]:
            df_clean[col] = df_clean[col].fillna(df_clean[col].median())
        else:
            df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])

# 5. Outlier handling (IQR clipping)
for col in schema["column_types"]["numeric"]:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)



In [21]:

print(" Missing Values:")
display(df_clean.isnull().sum())

print("\n Numeric Summary:")
display(df_clean.describe(include=[np.number]))

print("\n Categorical Summary:")
display(df_clean.describe(include=['object']))

print("\n Correlation Matrix (numeric):")
display(df_clean.corr(numeric_only=True))


 Missing Values:


SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


 Numeric Summary:


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5



 Categorical Summary:


Unnamed: 0,Species
count,150
unique,3
top,Iris-Setosa
freq,50



 Correlation Matrix (numeric):


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
SepalLengthCm,1.0,-0.109369,0.871754,0.817954
SepalWidthCm,-0.109369,1.0,-0.420516,-0.356544
PetalLengthCm,0.871754,-0.420516,1.0,0.962757
PetalWidthCm,0.817954,-0.356544,0.962757,1.0


In [22]:


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

#  XGB
try:
    from xgboost import XGBClassifier
    xgb_available = True
except:
    xgb_available = False
    print(" XGBoost not installed.")

target = schema["target"]
X = df_clean.drop(columns=[target])
y = df_clean[target]

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Encoder for XGBoost
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)


# Models

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=200),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(probability=True)
}

if xgb_available:
    models["XGBoost"] = XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=4,
        eval_metric="mlogloss"
    )


results = {}
best_model = None
best_score = -1
best_name = None

print(" Training models...\n")

for name, model in models.items():
    # XGBoost needs encoded labels
    if name == "XGBoost":
        model.fit(X_train, y_train_enc)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test_enc, preds)
    else:
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)

    results[name] = acc
    print(f"{name}: {acc:.4f}")

    if acc > best_score:
        best_score = acc
        best_model = model
        best_name = name

print("\n Best Model:", best_name, "Accuracy:", best_score)
results


 Training models...

LogisticRegression: 0.9667
RandomForest: 0.9333
KNN: 1.0000
SVC: 0.9667
XGBoost: 0.9333

 Best Model: KNN Accuracy: 1.0


{'LogisticRegression': 0.9666666666666667,
 'RandomForest': 0.9333333333333333,
 'KNN': 1.0,
 'SVC': 0.9666666666666667,
 'XGBoost': 0.9333333333333333}

In [23]:


import pickle
import json
import datetime

# Timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Save best model
model_path = f"../results/models/{best_name}_{timestamp}.pkl"
with open(model_path, "wb") as f:
    pickle.dump(best_model, f)

print(" Best model saved at:")
print(model_path)

# Save pipeline summary
pipeline_summary = {
    "dataset_shape": df_clean.shape,
    "target_column": target,
    "schema": schema,
    "model_performance": results,
    "best_model": best_name,
    "best_accuracy": float(best_score),
    "timestamp": timestamp
}

summary_path = f"../results/logs/pipeline_summary_{timestamp}.json"
with open(summary_path, "w") as f:
    json.dump(pipeline_summary, f, indent=4)

print("\n Pipeline summary saved at:")
print(summary_path)

print("\n Phase 1 pipeline completed successfully!")


 Best model saved at:
../results/models/KNN_20251202_225010.pkl

 Pipeline summary saved at:
../results/logs/pipeline_summary_20251202_225010.json

 Phase 1 pipeline completed successfully!
