In [2]:
# experiment_tracking.ipynb

import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Column labels applied while reading the CSV (the file carries no header row).
# The final entry is the label column; every entry before it is a feature.
column_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Read the raw table, labelling its columns with the names above.
df = pd.read_csv("data/diabetes.csv", names=column_names)

# Separate the feature matrix from the target by column label.
*feature_names, target_name = column_names
X = df[feature_names]
y = df[target_name]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Select the experiment that runs are filed under (MLflow creates it when it
# does not exist yet).
mlflow.set_experiment("Diabetes Classification")

# Single source of truth for the model's hyperparameters: the same dict builds
# the estimator and is logged to MLflow, so the recorded values cannot drift
# away from the ones actually used for training.
rf_params = {"n_estimators": 100, "random_state": 42}

# Everything logged inside this context is attached to one MLflow run.
with mlflow.start_run():
    # Train the classifier on the training split.
    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)

    # Score the model on the held-out split.
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Record the exact hyperparameters used (including the seed) and the metric.
    mlflow.log_params(rf_params)
    mlflow.log_metric("accuracy", acc)

    # Store the fitted model as a run artifact.
    mlflow.sklearn.log_model(model, "diabetes_model")

    print("Model trained and logged with accuracy:", acc)


2025/04/25 11:23:17 INFO mlflow.tracking.fluent: Experiment with name 'Diabetes Classification' does not exist. Creating a new experiment.


Model trained and logged with accuracy: 0.7207792207792207
