In [20]:
import pandas as pd

# Remote CSV holding the pricing dataset; its first column is used as the row index.
PRICING_DATA_URL = "https://full-stack-assets.s3.eu-west-3.amazonaws.com/Deployment/get_around_pricing_project.csv"

pricing_df = pd.read_csv(PRICING_DATA_URL, index_col=0)

# Preview the first rows to sanity-check the columns that were loaded.
pricing_df.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [11]:
# Candidate regularisation strengths for the grid search:
# 99 evenly spaced values k/2000 for k = 1..99 (0.0005 up to 0.0495).
params_list = list(numerator / 2000 for numerator in range(1, 100))

# Show the first ten candidates as a quick check of the spacing.
params_list[:10]

[0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004, 0.0045, 0.005]

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score, make_scorer
from dotenv import load_dotenv
import pandas as pd
import mlflow
import joblib
import os

# Train a regularised linear model on the pricing data, tune `alpha` with a
# 5-fold grid search, and log parameters, metrics and the fitted pipeline to MLflow.
#
# Depends on two names created by earlier cells: `pricing_df` (the raw pricing
# DataFrame) and `params_list` (the candidate alpha values).

# Load variables from a local .env file so that TRACKING_SERVER_URL, read below,
# is available in os.environ.
load_dotenv()

# --- Features and target -----------------------------------------------------
# The boolean equipment flags are handled as categorical features and are
# one-hot encoded together with the string columns.
categorical_cols = ["model_key", "fuel", "paint_color", "car_type", "private_parking_available", "has_gps", "has_air_conditioning",
                    "automatic_car", "has_getaround_connect", "has_speed_regulator", "winter_tires"]
numerical_cols = ["mileage", "engine_power"]

X = pricing_df[categorical_cols + numerical_cols]
y = pricing_df["rental_price_per_day"]

# --- Train / test split ------------------------------------------------------
# Stratify on price quartiles so both splits have a similar price distribution;
# the fixed random_state keeps the split reproducible between runs.
bins = pd.qcut(y, q=4, duplicates="drop")
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.75,
    random_state=444719,
    stratify=bins
)

# --- Preprocessing -----------------------------------------------------------
# drop="if_binary" removes the redundant column for two-valued features only.
# With the previous drop="first", a category unseen during fit (encoded as all
# zeros because handle_unknown="ignore") was indistinguishable from the dropped
# first category, so e.g. an unseen car brand was silently scored as the
# baseline brand. Keeping every level of the multi-valued features gives unseen
# categories their own all-zeros encoding and avoids privileging one arbitrary
# level in a penalised model.
onehot_encoder = OneHotEncoder(drop="if_binary", handle_unknown="ignore")
standard_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", onehot_encoder, categorical_cols),
        ("numerical", standard_scaler, numerical_cols)
    ]
)

# --- Model and hyper-parameter search ----------------------------------------
model = Lasso()  # or Ridge()

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", model)
])

# Only the regularisation strength of the final estimator is tuned.
param_grid = {
    "regressor__alpha": params_list
}

scorer = make_scorer(r2_score)

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1
)

# --- MLflow tracking ---------------------------------------------------------
# os.environ[...] raises KeyError when TRACKING_SERVER_URL is not set, so a
# missing configuration fails here rather than logging to an unintended location.
EXPERIMENT_NAME = "getaround-price-prediction"
mlflow.set_tracking_uri(os.environ["TRACKING_SERVER_URL"])
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

with mlflow.start_run(experiment_id=experiment.experiment_id, run_name="Lasso_GridSearchCV_1_casting_a_wide_net"):
    grid_search.fit(X_train, y_train)

    # Record which regressor was actually used: the run name is fixed, but the
    # `model` line above may be switched between Lasso and Ridge.
    mlflow.log_param("regressor", type(model).__name__)
    mlflow.log_params(grid_search.best_params_)

    # Keep the cross-validation score with the run instead of only printing it,
    # so runs can be compared on the tracking server.
    mlflow.log_metric("cv_r2", grid_search.best_score_)

    # Evaluate the refitted best pipeline on the held-out test split.
    y_pred = grid_search.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("test_r2", r2)

    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation R²:", grid_search.best_score_)
    print("Test R²:", r2)

    # A single training row is stored with the model as its input example.
    input_example = X_train.iloc[:1]
    mlflow.sklearn.log_model(
        grid_search.best_estimator_,
        "model",
        input_example=input_example,
        registered_model_name="getaround-price-prediction-model"
    )

In [None]:
# This cell tests the model, with an input example that was part of the logs sent to the tracking server
import mlflow
from mlflow.models import Model
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the credentials needed to read the S3
# artifact below — confirm.
load_dotenv()

# Hard-coded artifact location of one specific logged model.
# NOTE(review): the path looks like <experiment_id>/<run_id>/artifacts/model;
# it must be updated by hand to test a different run — confirm.
model_uri = 's3://getaround-mlflow-artifactstore/2/b21d60925615456587e03b92d95843c4/artifacts/model'

# Load the model through the generic pyfunc flavour and reuse the input example
# exposed on the loaded model.
pyfunc_model = mlflow.pyfunc.load_model(model_uri)
input_data = pyfunc_model.input_example

# Run a prediction for that example via mlflow.models.predict.
# NOTE(review): env_manager="conda" presumably requires conda to be installed on
# this machine so MLflow can build the model's environment — confirm.
mlflow.models.predict(
    model_uri=model_uri,
    input_data=input_data,
    env_manager="conda",
)

In [17]:
# Print the scikit-learn version installed in the current kernel.
import sklearn
print(sklearn.__version__)

1.6.1
