<a href="https://colab.research.google.com/github/Diksha427/mlops-assignment/blob/main/Mlops_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
# Download the dataset
!wget -q "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

In [18]:
import pandas as pd

def load_data(path):
    cols = [
        "age","sex","cp","trestbps","chol","fbs",
        "restecg","thalach","exang","oldpeak",
        "slope","ca","thal","target"
    ]
    df = pd.read_csv(path, names=cols)
    df.replace("?", pd.NA, inplace=True)
    df = df.apply(pd.to_numeric)
    return df

# 3. Load data
df = load_data("processed.cleveland.data")

# 4. Handle missing values
df = df.dropna()

# 5. Convert target to binary (0 = no disease, 1 = disease)
df["target"] = (df["target"] > 0).astype(int)

# 6. Quick check
print(df.shape)
df.head()


(297, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [19]:
df.to_csv("heart_disease.csv", index=False)
df = pd.read_csv("heart_disease.csv")

print(df.shape)
print(df.info())

(297, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       297 non-null    float64
 1   sex       297 non-null    float64
 2   cp        297 non-null    float64
 3   trestbps  297 non-null    float64
 4   chol      297 non-null    float64
 5   fbs       297 non-null    float64
 6   restecg   297 non-null    float64
 7   thalach   297 non-null    float64
 8   exang     297 non-null    float64
 9   oldpeak   297 non-null    float64
 10  slope     297 non-null    float64
 11  ca        297 non-null    float64
 12  thal      297 non-null    float64
 13  target    297 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 32.6 KB
None


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

def preprocess_data(df):
    X = df.drop("target", axis=1)
    y = df["target"]

    num_features = X.columns
    num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    preprocessor = ColumnTransformer([
        ("num", num_pipeline, num_features)
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    return preprocessor, X_train, X_test, y_train, y_test


In [21]:
!pip install mlflow



In [22]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

mlflow.set_experiment("Heart-Disease-Experiment")

df = load_data("processed.cleveland.data")
preprocessor, X_train, X_test, y_train, y_test = preprocess_data(df)

with mlflow.start_run():
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    pipe = Pipeline([
        ("prep", preprocessor),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)
    score = pipe.score(X_test, y_test)

    mlflow.log_param("n_estimators", 100)
    mlflow.log_metric("accuracy", score)
    mlflow.sklearn.log_model(pipe, "model")

    print("Accuracy:", score)




Accuracy: 0.5245901639344263


In [24]:
from sklearn.metrics import classification_report, roc_auc_score

def evaluate(model, X_test, y_test):
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))
    print("ROC-AUC:", roc_auc_score(y_test, preds))


In [25]:
import mlflow.sklearn
from fastapi import FastAPI

app = FastAPI()

#model = mlflow.sklearn.load_model("runs:/latest/model")
model = mlflow.sklearn.log_model(
    pipe,
    "model",
    registered_model_name="HeartDiseaseModel"
)

@app.post("/predict")
def predict(data: dict):
    df = pd.DataFrame([data])
    pred = model.predict(df)[0]
    prob = model.predict_proba(df)[0].max()

    return {
        "prediction": int(pred),
        "confidence": float(prob)
    }


Registered model 'HeartDiseaseModel' already exists. Creating a new version of this model...
Created version '2' of model 'HeartDiseaseModel'.


In [27]:
#from src.data_loader import load_data

def test_data_loading():
    df = load_data("data/heart.csv")
    assert df.shape[0] > 0


In [28]:
!git config --global user.name "Diksha"
!git config --global user.email "2024aa05518@wilp.bits-pilani.ac.in"

!git clone https://github.com/Diksha427/mlops-assignment

Cloning into 'mlops-assignment'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [30]:
!mv mmlops_assignment.ipynb mlops-assignment/

mv: cannot stat 'Mlops_assignment.ipynb': No such file or directory


In [31]:
!ls

cleveland.data	   mlops-assignment	     processed.cleveland.data.1
heart_disease.csv  mlruns		     processed.cleveland.data.2
mlflow.db	   processed.cleveland.data  sample_data
