In [3]:
import time
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split
from mlflow.models.signature import infer_signature
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    classification_report
)

In [5]:
# Read the pricing CSV and discard the 'Unnamed: 0' column that comes with it
# (presumably a row index written out when the file was saved).
dataset = (
    pd.read_csv('get_around_pricing_project.csv')
    .drop(columns='Unnamed: 0')
)
dataset

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,Toyota,39743,110,diesel,black,van,False,True,False,False,False,False,True,121
4839,Toyota,49832,100,diesel,grey,van,False,True,False,False,False,False,True,132
4840,Toyota,19633,110,diesel,grey,van,False,True,False,False,False,False,True,130
4841,Toyota,27920,110,diesel,brown,van,True,True,False,False,False,False,True,151


In [16]:
# Show the record with index label 2 as a plain Python list (features + target).
dataset.loc[2].tolist()

['Citroën',
 183297,
 120,
 'diesel',
 'white',
 'convertible',
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 101]

In [6]:
# Train an XGBoost price regressor inside a scikit-learn pipeline and log the
# run (autologged params/metrics + the fitted pipeline) to the MLflow server.

# --- MLflow tracking setup ---
mlflow.set_tracking_uri("https://mlflow-s3-5c46c0d9d46b.herokuapp.com/")
# Alternative tracking backends used during development:
#mlflow.set_tracking_uri("http://0.0.0.0:443/")
#mlflow.set_tracking_uri("../mlruns")

EXPERIMENT_NAME = "getaround_predict"
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    # Fail fast: the run below needs experiment.experiment_id, so continuing
    # would only surface later as an opaque AttributeError.
    raise RuntimeError(f"Experiment '{EXPERIMENT_NAME}' does not exist.")
print("Experiment ID:", experiment.experiment_id)
print("Artifact Location:", experiment.artifact_location)

# start experiment time tracking (covers data loading, fitting and model logging)
start_time = time.time()
# Autolog is enabled with log_models=False; the fitted pipeline is logged
# explicitly inside the run below.
mlflow.sklearn.autolog(log_models=False)

# --- Data ---
# load dataset for training
dataset = pd.read_csv('get_around_pricing_project.csv').drop(columns='Unnamed: 0')
Y = dataset['rental_price_per_day']
X = dataset.drop(columns='rental_price_per_day')
# Fixed random_state keeps the train/test split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

# --- Feature typing ---
# Integer/float columns are scaled; everything else (strings and booleans)
# is one-hot encoded. select_dtypes keeps the original column order.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()
# list check
print("Found numeric features ", numeric_features)
print("Found categorical features ", categorical_features)

# --- Model: preprocessing + regressor in a single pipeline ---
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
    ]
)
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor())
])

# --- Train and log ---
# The context manager ends the run on exit, so no explicit mlflow.end_run()
# is needed afterwards.
with mlflow.start_run(experiment_id=experiment.experiment_id):
    model.fit(X_train, Y_train)

    # Logged under the artifact path "xgboost" (loadable via runs:/<run_id>/xgboost).
    # To also register the model with an input/output signature, pass
    # registered_model_name="xgboost" and
    # signature=infer_signature(X_train, model.predict(X_train)).
    mlflow.sklearn.log_model(model, "xgboost")

print("...Done!")
print(f"---Total training time: {time.time()-start_time}")
print('train_score', model.score(X_train, Y_train))
print('test_score', model.score(X_test, Y_test))

Experiment ID: 37
Artifact Location: mlflow-artifacts:/37
Found numeric features  ['mileage', 'engine_power']
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']




...Done!
---Total training time: 19.29865026473999
train_score 0.9417796329507986




test_score 0.7119470671202109


In [11]:
# List the dtype of every column, in column order.
dataset.dtypes.tolist()


[dtype('O'),
 dtype('int64'),
 dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('bool'),
 dtype('bool'),
 dtype('bool'),
 dtype('bool'),
 dtype('bool'),
 dtype('bool'),
 dtype('bool'),
 dtype('int64')]

In [10]:
# Load the logged pipeline back from MLflow and predict the price for a single
# record, mimicking the payload -> DataFrame -> prediction flow.
mlflow.set_tracking_uri("https://mlflow-s3-5c46c0d9d46b.herokuapp.com/")

# Read data: one known record stands in for an incoming request.
# Named `sample_row` (not `input`) so the builtin input() is not shadowed
# for the rest of the kernel session.
sample_row = dataset.loc[2].to_list()
print(sample_row)

# Column order must match the order of the values in `sample_row`.
columns = ['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color',
       'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires', 'rental_price_per_day']
features = pd.DataFrame([sample_row], columns=columns)
# The record still carries the target; remove it before predicting.
features = features.drop(columns='rental_price_per_day')
display('features', features)

# Model URI: artifact path "xgboost" of a specific, hard-coded training run.
logged_model = 'runs:/034b629cb7de4c89acdc44b50dfb3c28/xgboost'
#logged_model = 'logged_model/getaround_price_prediction'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

prediction = loaded_model.predict(features)
print('prediction', prediction)

# Format response: a single scalar taken from the first (only) prediction.
response = {"prediction": prediction.tolist()[0]}
response

['Citroën', 183297, 120, 'diesel', 'white', 'convertible', False, False, False, False, True, False, True, 101]


'features'

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
0,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True


Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]



prediction [110.788]


{'prediction': 110.78800201416016}

In [32]:
# Log a small text artifact in a fresh MLflow run, then download the
# "getaround_price_prediction" artifact folder of a fixed run to local disk.
import os
from mlflow.tracking import MlflowClient

# NOTE(review): this tracking server differs from the one used by the
# training and inference cells — confirm that is intentional.
mlflow.set_tracking_uri("https://mlflow-server-09febdc5a39a.herokuapp.com")

# Initialize MLflow client
client = MlflowClient()

# Local directory where downloaded artifacts are stored
local_dir = "logged_model"

# Create the directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(local_dir, exist_ok=True)

# Write the dataset's column names to "getaround_price_prediction.txt" so
# there is a file to log as an artifact.
getaround_price_prediction = '''model_key, mileage, engine_power, fuel, paint_color,
       car_type, private_parking_available, has_gps,
       has_air_conditioning, automatic_car, has_getaround_connect,
       has_speed_regulator, winter_tires, rental_price_per_day'''
with open("getaround_price_prediction.txt", 'w') as f:
    f.write(getaround_price_prediction)

# Create a sample MLflow run and log "getaround_price_prediction.txt" under
# the artifact path "getaround_price_prediction".
with mlflow.start_run() as run:
    mlflow.log_artifact("getaround_price_prediction.txt", artifact_path="getaround_price_prediction")

# Download the artifact folder to local storage.
# NOTE(review): this targets a hard-coded run id, not the run created just
# above (`run`) — confirm this is the run whose artifacts are wanted.
local_path = client.download_artifacts('c956f13b3d334bfb9202f52244a224e9', "getaround_price_prediction", local_dir)
print(f"Artifacts downloaded in: {local_dir}")

Downloading artifacts: 100%|██████████| 9/9 [00:00<00:00, 20.07it/s]   

Artifacts downloaded in: logged_model



