In [None]:
# Question 1: Data Preparation

In [14]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import mlflow


In [17]:
class IrisDataProcessor:
    """Load the Iris dataset, split it into train/test sets and standardise the features.

    Attributes:
        data: The object returned by ``load_iris()``.
        df: DataFrame of the feature columns plus a ``target`` column;
            ``None`` until it is first built.
        X_train, X_test, y_train, y_test: The split produced by
            ``prepare_data``; ``None`` until that method has been called.
        scaler: The ``StandardScaler`` fitted on the training features.
    """

    def __init__(self):
        self.data = load_iris()
        self.df = None
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.scaler = StandardScaler()

    def _build_frame(self):
        """Assemble the feature columns and the ``target`` column into ``self.df``."""
        self.df = pd.DataFrame(data=self.data.data, columns=self.data.feature_names)
        self.df['target'] = self.data.target
        return self.df

    def prepare_data(self, test_size=0.2, random_state=42):
        """Split the dataset and standardise the features.

        The split is stratified on the target. The scaler is fitted on the
        training rows only and then applied to the test rows, so no statistics
        from the held-out data leak into the preprocessing step.

        Args:
            test_size: Fraction (or absolute number) of rows held out for testing.
            random_state: Seed passed to ``train_test_split``.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``; the two feature sets
            are the scaled arrays returned by the scaler, the two targets are
            the split ``target`` column.
        """
        frame = self._build_frame()
        X = frame[self.data.feature_names]
        y = frame['target']

        # Split BEFORE scaling: fitting the scaler on all rows would let the
        # test set influence the mean/variance used to transform the train set.
        X_train_raw, X_test_raw, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
        self.X_train = self.scaler.fit_transform(X_train_raw)
        self.X_test = self.scaler.transform(X_test_raw)
        return self.X_train, self.X_test, self.y_train, self.y_test

    def get_feature_stats(self):
        """Return ``describe()`` statistics for the unscaled features and the target.

        The frame is built on demand, so this also works when called before
        ``prepare_data``.
        """
        if self.df is None:
            self._build_frame()
        return self.df.describe()

# Build the processor, then unpack the four partitions returned by prepare_data().
processor = IrisDataProcessor()
split_parts = processor.prepare_data()
X_train, X_test, y_train, y_test = split_parts
print("Data Preparation Complete")

# Summary statistics (describe()) of the feature columns and the target column.
feature_stats = processor.get_feature_stats()
print(feature_stats)

Data Preparation Complete
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)      target  
count        150.000000  150.000000  
mean           1.199333    1.000000  
std            0.762238    0.819232  
min            0.100000    0.000000  
25%            0.300000    0.000000  
50%            1.300000    1.000000  
75%            1.800000    2.000000  
max            2.500000    2.000000  


In [18]:
#Question 2: Experiment Tracking and Model Development

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score

class IrisExperiment:
    """Train several classifiers on the prepared Iris data and track them in MLflow.

    Attributes:
        data_processor: Object exposing ``X_train``, ``X_test``, ``y_train``,
            ``y_test`` and ``prepare_data()``.
        model_params: List of per-model result dicts from the most recent
            ``run_experiment`` call.
    """

    def __init__(self, data_processor, tracking_uri="http://127.0.0.1:5001/",
                 experiment_name="Iris_Classification"):
        """Configure MLflow and remember the data source.

        Args:
            data_processor: Provider of the train/test split.
            tracking_uri: Address of the MLflow tracking server.
            experiment_name: MLflow experiment the runs are recorded under.
        """
        self.data_processor = data_processor
        # The tracking URI has to be set first: set_experiment() resolves the
        # experiment against whichever tracking store is configured at the
        # moment it is called. Surrounding whitespace is stripped so the
        # logged URLs are clean.
        mlflow.set_tracking_uri(tracking_uri.strip())
        mlflow.set_experiment(experiment_name)
        self.model_params = []  # to store each model's result

    def run_experiment(self):
        """Cross-validate, fit and evaluate each model, logging one MLflow run per model.

        Returns:
            List of dicts with keys ``model_name``, ``cv_accuracy``,
            ``accuracy``, ``precision`` and ``recall`` (also stored in
            ``self.model_params``).
        """
        processor = self.data_processor
        # Allow a freshly constructed processor to be passed in directly.
        if processor.X_train is None:
            processor.prepare_data()

        models = {
            "Logistic Regression": LogisticRegression(max_iter=200),
            # Seeded so that repeated runs report the same metrics.
            "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
        }

        # Start from a clean list so re-running the experiment does not
        # accumulate duplicate entries from earlier calls.
        self.model_params = []

        for model_name, model in models.items():
            with mlflow.start_run(run_name=model_name):
                scores = cross_val_score(model, processor.X_train, processor.y_train, cv=5)
                cv_accuracy = np.mean(scores)

                model.fit(processor.X_train, processor.y_train)
                y_pred = model.predict(processor.X_test)

                accuracy = accuracy_score(processor.y_test, y_pred)
                precision = precision_score(processor.y_test, y_pred, average='macro')
                recall = recall_score(processor.y_test, y_pred, average='macro')

                mlflow.log_param("model_name", model_name)
                mlflow.log_metric("cross_val_accuracy", cv_accuracy)
                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric("precision", precision)
                mlflow.log_metric("recall", recall)

                self.model_params.append({
                    'model_name': model_name,
                    'cv_accuracy': cv_accuracy,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall
                })

        print("Experiment run complete.")
        return self.model_params

    def log_results(self):
        """Print the stored metrics of every model from the last experiment run."""
        for param in self.model_params:
            print(f"Model: {param['model_name']}")
            print(f"Cross-validation Accuracy: {param['cv_accuracy']}")
            print(f"Accuracy: {param['accuracy']}")
            print(f"Precision: {param['precision']}")
            print(f"Recall: {param['recall']}")
            print("Logged to MLflow")

# Rebuild the processor and its train/test split inside this cell.
processor = IrisDataProcessor()
processor.prepare_data()

# Run every model against that split, then print the collected metrics.
experiment = IrisExperiment(data_processor=processor)
experiment.run_experiment()
experiment.log_results()

2024/11/12 12:54:27 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at:  http://127.0.0.1:5001/#/experiments/508691177518362002/runs/e86da228d4bb4d63b072f1c1e7560ffc.
2024/11/12 12:54:27 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at:  http://127.0.0.1:5001/#/experiments/508691177518362002.
2024/11/12 12:54:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at:  http://127.0.0.1:5001/#/experiments/508691177518362002/runs/b4b0acbdd74b4508878f4f1d8bebbca3.
2024/11/12 12:54:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at:  http://127.0.0.1:5001/#/experiments/508691177518362002.


Experiment run complete.
Model: Logistic Regression
Cross-validation Accuracy: 0.9583333333333334
Accuracy: 0.9333333333333333
Precision: 0.9333333333333332
Recall: 0.9333333333333332
Logged to MLflow
Model: Random Forest
Cross-validation Accuracy: 0.9583333333333334
Accuracy: 0.9666666666666667
Precision: 0.9696969696969697
Recall: 0.9666666666666667
Logged to MLflow


In [32]:
# End whatever MLflow run is still active in this kernel. The recorded output
# below shows this closed an auto-named run; presumably one left open by an
# earlier interrupted cell (confirm before relying on this cell).
mlflow.end_run()

2024/11/12 12:44:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run illustrious-flea-711 at:  http://127.0.0.1:5001/#/experiments/508691177518362002/runs/ed961370e58a4512a3ba98f84284f041.
2024/11/12 12:44:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at:  http://127.0.0.1:5001/#/experiments/508691177518362002.


In [35]:
#Question 3