<a href="https://colab.research.google.com/github/BilalKhaliqWillis/BILAL-Assignment2/blob/main/BILAL_Assignment9_AITMLOps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Advanced Infrastructure & Tooling for MLOps — Airflow DAGs (Iris)
# Install packages needed
!pip install -q scikit-learn pandas joblib fpdf

In [27]:
import zipfile
import pandas as pd
import numpy as np
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
import os

In [25]:
# Prompt for a file upload through google.colab. The result is kept in
# `uploaded`; the extraction cell reads the uploaded file name from its keys.
from google.colab import files
uploaded = files.upload()


Saving train.zip to train.zip


In [28]:
# Unpack the uploaded archive into /content/data.
# Fail with a clear message if the upload cell produced nothing (e.g. the
# dialog was cancelled) instead of an opaque IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell first.")

zip_filename = next(iter(uploaded))  # auto-detect: first uploaded file name
print("Extracting:", zip_filename)

with zipfile.ZipFile(zip_filename, 'r') as z:
    z.extractall("/content/data")

print("Extracted to /content/data")

Extracting: train.zip
Extracted to /content/data


In [29]:
# List every extracted file to confirm what the archive contained.
# The loop variable is `filenames`, not `files`, so it does not overwrite the
# `google.colab.files` module imported earlier in this notebook.
for root, _dirs, filenames in os.walk("/content/data"):
    for name in filenames:
        print(os.path.join(root, name))

/content/data/train.csv


In [31]:
# Read the extracted training table into memory and preview the first rows.
csv_path = "/content/data/train.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

print("Loaded rows:", df.shape[0])
df.head(n=5)


Loaded rows: 50000


Unnamed: 0,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,...,pixel_3063,pixel_3064,pixel_3065,pixel_3066,pixel_3067,pixel_3068,pixel_3069,pixel_3070,pixel_3071,label
0,59,43,50,68,98,119,139,145,149,149,...,58,65,59,46,57,104,140,84,72,6
1,154,126,105,102,125,155,172,180,142,111,...,42,67,101,122,133,136,139,142,144,9
2,255,253,253,253,253,253,253,253,253,253,...,83,80,69,66,72,79,83,83,84,9
3,28,37,38,42,44,40,40,24,32,43,...,39,59,42,44,48,38,28,37,46,4
4,170,168,177,183,181,177,181,184,189,189,...,88,85,82,83,79,78,82,78,80,1


In [32]:
# Discard every row that has at least one missing value.
# (Alternative when rows should be kept: impute with column means via
#  df.fillna(df.mean(numeric_only=True)).)
df = df.dropna(axis=0, how="any")

print("Rows after cleaning:", df.shape[0])


Rows after cleaning: 50000


In [37]:
# Separate the target column from the feature columns.
label_column = "label"

y = df[label_column]
X = df.drop(columns=[label_column])

print("Features shape:", X.shape)
print("Label shape:", y.shape)


Features shape: (50000, 3072)
Label shape: (50000,)


In [38]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))


Train size: 40000
Test size: 10000


In [None]:
# Fit a logistic-regression classifier on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(max_iter=500).fit(X_train, y_train)

print("Training complete.")

In [None]:
# Score the fitted model on the held-out split.
preds = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=preds)

print("Test set accuracy:", accuracy)

In [None]:
# Persist the fitted model to disk with pickle.
with open("trained_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)

print("Model saved as trained_model.pkl")

In [None]:
# Save Processed CSVs for Airflow DAG
# Create the outputs folder (no-op if it already exists).
os.makedirs("outputs", exist_ok=True)

# Re-attach each label column to its feature frame.
# X_* and y_* come from the same train_test_split call and therefore share the
# same (shuffled) index, so they are concatenated as-is. Resetting the index on
# only the labels would make pd.concat(axis=1) align on mismatched indexes,
# pairing rows with the wrong labels and padding the result with NaN rows.
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Guard against any row being gained or lost while re-attaching the labels.
assert len(train_df) == len(X_train), "train rows changed while attaching labels"
assert len(test_df) == len(X_test), "test rows changed while attaching labels"

train_csv_path = "outputs/iris_train.csv"
test_csv_path = "outputs/iris_test.csv"

# index=False: the shuffled row index is not needed by the downstream steps.
train_df.to_csv(train_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

print("Saved training CSV:", train_csv_path)
print("Saved testing CSV:", test_csv_path)

In [None]:
# Read the saved splits back from disk, as a downstream DAG task would.
train_df, test_df = (pd.read_csv(path) for path in (train_csv_path, test_csv_path))

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

In [None]:
# Model Training Function
def train_model_from_csv(
    csv_path,
    model_save_path="outputs/trained_model.pkl",
    label_column="label",
    max_iter=500,
):
    """Train a logistic-regression model from a CSV file and pickle it.

    Parameters
    ----------
    csv_path : str
        CSV file holding the feature columns plus the label column.
    model_save_path : str, optional
        Destination of the pickled model. Missing parent directories are
        created.
    label_column : str, optional
        Name of the target column; every other column is used as a feature.
    max_iter : int, optional
        Iteration cap passed to ``LogisticRegression``.

    Returns
    -------
    str
        ``model_save_path``, so the caller can hand it to the evaluation step.

    Raises
    ------
    KeyError
        If ``label_column`` is not a column of the CSV.
    """
    data = pd.read_csv(csv_path)
    if label_column not in data.columns:
        raise KeyError(f"Label column {label_column!r} not found in {csv_path}")

    features = data.drop(columns=[label_column])
    target = data[label_column]

    model = LogisticRegression(max_iter=max_iter)
    model.fit(features, target)

    # Make sure the destination folder exists so the function also works when
    # it is run on its own (e.g. as a standalone task) before any other cell.
    model_dir = os.path.dirname(model_save_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Save model
    with open(model_save_path, "wb") as f:
        pickle.dump(model, f)

    print("Model trained and saved at:", model_save_path)
    return model_save_path

In [None]:
# Model Evaluation Function
def evaluate_model_from_csv(
    csv_path,
    model_path="outputs/trained_model.pkl",
    label_column="label",
):
    """Load a pickled model and report its accuracy on a labelled CSV.

    Parameters
    ----------
    csv_path : str
        CSV file holding the feature columns plus the label column.
    model_path : str, optional
        Pickle file containing a fitted model that exposes ``predict``.
    label_column : str, optional
        Name of the target column; every other column is used as a feature.

    Returns
    -------
    float
        Value returned by ``accuracy_score`` for the model's predictions.

    Raises
    ------
    KeyError
        If ``label_column`` is not a column of the CSV.
    """
    data = pd.read_csv(csv_path)
    if label_column not in data.columns:
        raise KeyError(f"Label column {label_column!r} not found in {csv_path}")

    features = data.drop(columns=[label_column])
    target = data[label_column]

    # NOTE: unpickling can execute code embedded in the file, so only load
    # model files that were produced by a trusted source.
    with open(model_path, "rb") as f:
        model = pickle.load(f)

    preds = model.predict(features)
    acc = accuracy_score(target, preds)
    print("Evaluation accuracy:", acc)
    return acc

In [None]:
# Run Training + Evaluation
# Train the model; the function returns the path the model was pickled to.
model_path = train_model_from_csv(train_csv_path)

# Evaluate the model that was just saved. Passing model_path explicitly keeps
# the two steps in sync if the save location ever differs from the default.
accuracy = evaluate_model_from_csv(test_csv_path, model_path=model_path)

In [41]:
"""Work steps

Data Preprocessing DAG

Task 1: Loaded dataset from .zip

Task 2: Removed NaNs and cleaned data

Task 3: Splitting into train/test

Task 4: Saving processed CSVs

Model Training and Evaluation DAG

Task 5: Loading training CSV

Task 6: Training Logistic Regression

Task 7: Saving trained model

Task 8: Loading test CSV

Task 9: Evaluating model and printing accuracy

CSVs act as inputs/outputs between DAG tasks, simulating an Airflow workflow."""

'Work steps\n\nData Preprocessing DAG\n\nTask 1: Loaded dataset from .zip\n\nTask 2: Removed NaNs and clean data\n\nTask 3: Splitting into train/test\n\nTask 4: Saving processed CSVs\n\nModel Training and Evaluation DAG\n\nTask 5: Loading training CSV\n\nTask 6: Training Logistic Regression\n\nTask 7: Saving trained model\n\nTask 8: Loading test CSV\n\nTask 9: Evaluating model and print accuracy\n\nCSVs act as inputs/outputs between DAG tasks which simulating an Airflow workflow.'