In this notebook, you should implement a first version of a working machine learning model to predict the age of an Abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code needed to run the experiments should be pushed.

In [None]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Read the raw abalone measurements from disk.
df = pd.read_csv('abalone.csv')

# Derive the regression target (Age = Rings + 1.5) and discard the raw ring
# count so it cannot be used as a feature.
df = df.assign(Age=df['Rings'] + 1.5).drop(columns=['Rings'])

# Replace the categorical 'Sex' labels with integer codes.
label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])

# Separate the target column from the predictors.
y = df['Age']
X = df.drop(columns=['Age'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training features only, then apply
# the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest regressor with library-default hyperparameters and a fixed seed.
model = RandomForestRegressor(random_state=42)

# Point the MLflow client at the tracking server on localhost:8080.
mlflow.set_tracking_uri("http://127.0.0.1:8080")

# Select the experiment under which this run is recorded.
mlflow.set_experiment("Abalone Age Prediction Experiment")

# Everything inside the `with` block is attached to a single MLflow run,
# which is closed automatically when the block exits.
with mlflow.start_run():
    # Record the hyperparameters the model was constructed with.
    mlflow.log_params({
        'n_estimators': model.n_estimators,
        'max_depth': model.max_depth,
        'random_state': model.random_state,
    })

    # Fit on the scaled training features.
    model.fit(X_train_scaled, y_train)

    # Score the held-out split.
    y_pred = model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2_score", r2)

    # Free-form tag describing the run.
    mlflow.set_tag("Training Info", "Random Forest model for Abalone Age Prediction")

    # Describe the model's input/output schema from the scaled training data.
    signature = infer_signature(X_train_scaled, model.predict(X_train_scaled))

    # Log only a handful of rows as the input example: it is meant to
    # illustrate a valid request, and passing the whole training matrix
    # would store the full training set inside the model artifact.
    input_example = X_train_scaled[:5]

    # NOTE(review): only `model` is logged; the fitted `scaler` is not part
    # of the artifact, so consumers must feed already-scaled features.
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="abalone_age_model",
        signature=signature,
        input_example=input_example,
        registered_model_name="AbaloneAgePredictionModel",
    )

    print(f"MAE: {mae}, R2 Score: {r2}")

    