In [31]:
import os

base_path = "/content/ml-assignment-2"

folders = [
    "data",
    "model",
    "model/saved_models"
]

for f in folders:
    os.makedirs(os.path.join(base_path, f), exist_ok=True)


print("✅ Project folders created")


✅ Project folders created


In [32]:
!pip install -q streamlit pandas numpy scikit-learn joblib xgboost matplotlib seaborn

In [33]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load dataset
data = load_breast_cancer()

X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name="label")

df = pd.concat([X, y], axis=1)

# Save dataset
dataset_path = "/content/ml-assignment-2/data/dataset.csv"
df.to_csv(dataset_path, index=False)

df.shape, df.head()

((569, 31),
    mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
 0        17.99         10.38          122.80     1001.0          0.11840   
 1        20.57         17.77          132.90     1326.0          0.08474   
 2        19.69         21.25          130.00     1203.0          0.10960   
 3        11.42         20.38           77.58      386.1          0.14250   
 4        20.29         14.34          135.10     1297.0          0.10030   
 
    mean compactness  mean concavity  mean concave points  mean symmetry  \
 0           0.27760          0.3001              0.14710         0.2419   
 1           0.07864          0.0869              0.07017         0.1812   
 2           0.15990          0.1974              0.12790         0.2069   
 3           0.28390          0.2414              0.10520         0.2597   
 4           0.13280          0.1980              0.10430         0.1809   
 
    mean fractal dimension  ...  worst texture  worst perimeter  w

In [34]:
import pandas as pd
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

BASE_PATH = "/content/ml-assignment-2"
DATA_PATH = f"{BASE_PATH}/data/dataset.csv"
MODEL_DIR = f"{BASE_PATH}/model/saved_models"

df = pd.read_csv(DATA_PATH)

X = df.drop(columns=["label"])
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

joblib.dump((X_train, X_test, y_train, y_test),
            f"{MODEL_DIR}/data_split.pkl")

models = {
    "logistic_regression": Pipeline([
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=2000))
    ]),
    "decision_tree": DecisionTreeClassifier(random_state=42),
    "knn": Pipeline([
        ("scaler", StandardScaler()),
        ("model", LogisticRegression())  # fixed below
    ]),
    "naive_bayes": GaussianNB(),
    "random_forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "xgboost": XGBClassifier(
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42
    )
}

# Fix KNN properly
models["knn"] = Pipeline([
    ("scaler", StandardScaler()),
    ("model", __import__("sklearn").neighbors.KNeighborsClassifier(n_neighbors=5))
])

for name, model in models.items():
    model.fit(X_train, y_train)
    joblib.dump(model, f"{MODEL_DIR}/{name}.pkl")
    print(f"✅ Saved model: {name}")

print("✅ All 6 models trained successfully")

✅ Saved model: logistic_regression
✅ Saved model: decision_tree
✅ Saved model: knn
✅ Saved model: naive_bayes
✅ Saved model: random_forest
✅ Saved model: xgboost
✅ All 6 models trained successfully


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [35]:
import pandas as pd
import joblib
from sklearn.metrics import (
    accuracy_score, roc_auc_score,
    precision_score, recall_score,
    f1_score, matthews_corrcoef
)

MODEL_DIR = "/content/ml-assignment-2/model/saved_models"

X_train, X_test, y_train, y_test = joblib.load(
    f"{MODEL_DIR}/data_split.pkl"
)

model_files = [
    "logistic_regression",
    "decision_tree",
    "knn",
    "naive_bayes",
    "random_forest",
    "xgboost"
]

results = []

for m in model_files:
    model = joblib.load(f"{MODEL_DIR}/{m}.pkl")
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    results.append({
        "ML Model Name": m,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    })

metrics_df = pd.DataFrame(results)
metrics_df

Unnamed: 0,ML Model Name,Accuracy,AUC,Precision,Recall,F1,MCC
0,logistic_regression,0.982456,0.99537,0.986111,0.986111,0.986111,0.962302
1,decision_tree,0.912281,0.915675,0.955882,0.902778,0.928571,0.817412
2,knn,0.95614,0.978836,0.958904,0.972222,0.965517,0.905447
3,naive_bayes,0.938596,0.987765,0.945205,0.958333,0.951724,0.867553
4,random_forest,0.95614,0.993056,0.958904,0.972222,0.965517,0.905447
5,xgboost,0.95614,0.990079,0.946667,0.986111,0.965986,0.905824


In [36]:
metrics_df.to_csv(
    "/content/ml-assignment-2/model/saved_models/metrics_summary.csv",
    index=False
)

print("✅ Metrics saved")

✅ Metrics saved


In [37]:
import os

os.chdir("/content/ml-assignment-2")
print("✅ Current working directory:", os.getcwd())
print("✅ Files:", os.listdir("."))

✅ Current working directory: /content/ml-assignment-2
✅ Files: ['data', 'app.py', 'model']


In [38]:
import joblib
import pandas as pd

X_train, X_test, y_train, y_test = joblib.load("model/saved_models/data_split.pkl")

test_df = X_test.copy()
test_df["label"] = y_test.values

test_path = "data/test_upload.csv"
test_df.to_csv(test_path, index=False)

print("✅ Created test upload file at:", test_path)
print("Shape:", test_df.shape)
test_df.head()

✅ Created test upload file at: data/test_upload.csv
Shape: (114, 31)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
256,19.55,28.77,133.6,1207.0,0.0926,0.2063,0.1784,0.1144,0.1893,0.06232,...,36.27,178.6,1926.0,0.1281,0.5329,0.4251,0.1941,0.2818,0.1005,0
428,11.13,16.62,70.47,381.1,0.08151,0.03834,0.01369,0.0137,0.1511,0.06148,...,20.29,74.35,421.1,0.103,0.06219,0.0458,0.04044,0.2383,0.07083,1
501,13.82,24.49,92.33,595.9,0.1162,0.1681,0.1357,0.06759,0.2275,0.07237,...,32.94,106.0,788.0,0.1794,0.3966,0.3381,0.1521,0.3651,0.1183,0
363,16.5,18.29,106.6,838.1,0.09686,0.08468,0.05862,0.04835,0.1495,0.05593,...,25.45,117.2,1009.0,0.1338,0.1679,0.1663,0.09123,0.2394,0.06469,1
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,0


In [39]:
%%writefile app.py
import os
import streamlit as st
import pandas as pd
import joblib

from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

st.set_page_config(page_title="ML Assignment 2", layout="wide")
st.title("ML Assignment 2 — Classification Models Demo")

MODEL_DIR = "model/saved_models"
TARGET_COL = "label"

# --- Sidebar ---
st.sidebar.header("Controls")
model_map = {
    "Logistic Regression": "logistic_regression.pkl",
    "Decision Tree": "decision_tree.pkl",
    "KNN": "knn.pkl",
    "Naive Bayes": "naive_bayes.pkl",
    "Random Forest": "random_forest.pkl",
    "XGBoost": "xgboost.pkl"
}
selected_model_name = st.sidebar.selectbox("Select Model", list(model_map.keys()))
selected_model_file = model_map[selected_model_name]

st.sidebar.markdown("---")
st.sidebar.caption("Tip: Upload a CSV that includes a 'label' column.")

# --- Show stored comparison metrics if available ---
metrics_path = os.path.join(MODEL_DIR, "metrics_summary.csv")
st.subheader("Model Comparison Table (from your evaluation script)")

if os.path.exists(metrics_path):
    metrics_df = pd.read_csv(metrics_path)
    st.dataframe(metrics_df, use_container_width=True)
else:
    st.warning("metrics_summary.csv not found. Run: python model/evaluate_models.py")

st.markdown("---")

# --- Upload ---
st.subheader("Upload Test Dataset (CSV)")
uploaded_file = st.file_uploader("Upload CSV with features + label column", type=["csv"])

def compute_metrics(y_true, y_pred, y_prob=None):
    out = {}
    out["Accuracy"] = accuracy_score(y_true, y_pred)
    out["Precision"] = precision_score(y_true, y_pred, zero_division=0)
    out["Recall"] = recall_score(y_true, y_pred, zero_division=0)
    out["F1"] = f1_score(y_true, y_pred, zero_division=0)
    out["MCC"] = matthews_corrcoef(y_true, y_pred)

    # AUC only if probabilities available
    if y_prob is not None:
        out["AUC"] = roc_auc_score(y_true, y_prob)
    else:
        out["AUC"] = None
    return out

if uploaded_file is None:
    st.info("Upload a CSV to run predictions.")
    st.stop()

# --- Read uploaded dataset ---
df = pd.read_csv(uploaded_file)

if TARGET_COL not in df.columns:
    st.error(f"Uploaded CSV must contain target column '{TARGET_COL}'.")
    st.stop()

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# --- Load model ---
model_path = os.path.join(MODEL_DIR, selected_model_file)
if not os.path.exists(model_path):
    st.error(f"Model file not found: {model_path}. Train models first.")
    st.stop()

model = joblib.load(model_path)

# --- Predict ---
y_pred = model.predict(X)

# Probability for AUC
y_prob = None
if hasattr(model, "predict_proba"):
    proba = model.predict_proba(X)
    if proba.shape[1] == 2:
        y_prob = proba[:, 1]

# --- Metrics ---
st.subheader(f"Metrics for: {selected_model_name}")
metrics = compute_metrics(y, y_pred, y_prob)

c1, c2, c3 = st.columns(3)
c1.metric("Accuracy", f"{metrics['Accuracy']:.4f}")
c2.metric("Precision", f"{metrics['Precision']:.4f}")
c3.metric("Recall", f"{metrics['Recall']:.4f}")

c4, c5, c6 = st.columns(3)
c4.metric("F1", f"{metrics['F1']:.4f}")
c5.metric("MCC", f"{metrics['MCC']:.4f}")
c6.metric("AUC", "N/A" if metrics["AUC"] is None else f"{metrics['AUC']:.4f}")

st.markdown("---")

# --- Report + Confusion Matrix ---
st.subheader("Classification Report")
st.text(classification_report(y, y_pred, zero_division=0))

st.subheader("Confusion Matrix")
cm = confusion_matrix(y, y_pred)
st.write(cm)

st.success("✅ Done! Try switching models from the sidebar.")


Overwriting app.py


In [56]:
!wget -O cloudflared https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
!chmod +x cloudflared
!sudo mv cloudflared /usr/local/bin/cloudflared
!cloudflared --version


--2026-02-15 13:27:42--  https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/cloudflare/cloudflared/releases/download/2026.2.0/cloudflared-linux-amd64 [following]
--2026-02-15 13:27:42--  https://github.com/cloudflare/cloudflared/releases/download/2026.2.0/cloudflared-linux-amd64
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://release-assets.githubusercontent.com/github-production-release-asset/106867604/f9298ca8-89c8-41fe-a51f-e24cb2059878?sp=r&sv=2018-11-09&sr=b&spr=https&se=2026-02-15T14%3A06%3A15Z&rscd=attachment%3B+filename%3Dcloudflared-linux-amd64&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2026-02-15T1

In [62]:
!streamlit run app.py --server.port 8501 --server.address 0.0.0.0 --server.headless true


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.188.253.67:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m


In [63]:
!cloudflared tunnel --url http://localhost:8501

[90m2026-02-15T13:48:23Z[0m [32mINF[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
[90m2026-02-15T13:48:23Z[0m [32mINF[0m Requesting new quick Tunnel on trycloudflare.com...
^C


In [61]:
!cloudflared --version

cloudflared version 2026.2.0 (built 2026-02-06-14:47 UTC)
