## Imports & Setup

In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score

from xgboost import XGBRegressor
import joblib

pd.set_option('display.max_columns', None)

## Load Dataset

In [13]:
url = "https://full-stack-assets.s3.eu-west-3.amazonaws.com/Deployment/get_around_pricing_project.csv"

df = pd.read_csv(url, encoding="latin")
df.head()

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,CitroÃ«n,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,CitroÃ«n,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,CitroÃ«n,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,CitroÃ«n,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,CitroÃ«n,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


## Inspect Dataset

In [14]:
df.info()
df.describe(include="all")
df.isna().mean()
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4843 entries, 0 to 4842
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Unnamed: 0                 4843 non-null   int64 
 1   model_key                  4843 non-null   object
 2   mileage                    4843 non-null   int64 
 3   engine_power               4843 non-null   int64 
 4   fuel                       4843 non-null   object
 5   paint_color                4843 non-null   object
 6   car_type                   4843 non-null   object
 7   private_parking_available  4843 non-null   bool  
 8   has_gps                    4843 non-null   bool  
 9   has_air_conditioning       4843 non-null   bool  
 10  automatic_car              4843 non-null   bool  
 11  has_getaround_connect      4843 non-null   bool  
 12  has_speed_regulator        4843 non-null   bool  
 13  winter_tires               4843 non-null   bool  
 14  rental_p

Index(['Unnamed: 0', 'model_key', 'mileage', 'engine_power', 'fuel',
       'paint_color', 'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires', 'rental_price_per_day'],
      dtype='object')

In [15]:
target = "rental_price_per_day"
assert target in df.columns

## Define X, y

In [16]:
X = df.drop(columns=[target])
y = df[target]

num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(exclude="number").columns.tolist()

print("Numeric features:", num_cols)
print("Categorical features:", cat_cols)


Numeric features: ['Unnamed: 0', 'mileage', 'engine_power']
Categorical features: ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


## Build Preprocessing Pipeline

In [17]:
numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)


## Define XGBoost Regressor

In [18]:
model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

## Build Final Pipeline

In [19]:
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", model)
    ]
)

## Split Data & Train

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipeline.fit(X_train, Y_train)
print("Training complete.")

Training complete.


## Evaluate

In [21]:
y_pred = pipeline.predict(X_test)
print("R2 Score:", r2_score(Y_test, y_pred))

R2 Score: 0.7365987300872803


## Save the Model

In [22]:
joblib.dump(pipeline, "model.joblib")
print("Model saved as model.joblib")

Model saved as model.joblib


## Quick test on sample input

In [23]:
sample = X.iloc[[0]]  # take first row as sample
pipeline.predict(sample)

array([112.85594], dtype=float32)

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        random_state=42
    ),
    "XGBoost": XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=42
    )
}

results = {}

for name, model in models.items():
    pipe = Pipeline(steps=[
        ("preprocessing", preprocessor),
        ("model", model)
    ])

    pipe.fit(X_train, Y_train)
    pred = pipe.predict(X_test)
    score = r2_score(Y_test, pred)
    results[name] = score

results

{'LinearRegression': 0.6936992728577747,
 'RandomForest': 0.7264185143493568,
 'GradientBoosting': 0.7255557454924207,
 'XGBoost': 0.7365987300872803}

In [29]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "model__n_estimators": [200, 300],
    "model__max_depth": [4, 6],
    "model__learning_rate": [0.05, 0.1],
    "model__subsample": [0.8],
    "model__colsample_bytree": [0.8]
}

pipe = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", XGBRegressor(
        objective="reg:squarederror",
        random_state=42
    ))
])

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="r2",
    cv=3,
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, Y_train)

print("Best parameters:", grid.best_params_)
print("Best R2 score:", grid.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=4, model__n_estimators=300, model__subsample=0.8; total time=   0.5s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200, model__subsample=0.8; total time=   0.4s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=4, model__n_estimators=300, model__subsample=0.8; total time=   0.5s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=4, model__n_estimators=200, model__subsample=0.8; total time=   0.4s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=6, model__n_estimators=200, model__subsample=0.8; total time=   0.6s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=6, model__n_estimators=200, model__subsample=0.8; total time=   0.6s
[CV] END model__colsample_bytree=0

In [30]:
best_model = grid.best_estimator_
joblib.dump(best_model, "model.joblib")
print("Best tuned model saved → model.joblib")


Best tuned model saved → model.joblib


In [31]:
pd.DataFrame.from_dict(results, orient="index", columns=["R2 Score"]).sort_values(by="R2 Score", ascending=False)

Unnamed: 0,R2 Score
XGBoost,0.736599
RandomForest,0.726419
GradientBoosting,0.725556
LinearRegression,0.693699


In [32]:
best_model = grid.best_estimator_
joblib.dump(best_model, "model.joblib")
print("Tuned model saved → model.joblib")

Tuned model saved → model.joblib


## Model conclusion:
After comparing four models (Linear Regression, Random Forest, Gradient Boosting, XGBoost), the XGBoost model achieved the best R².
It combines:
- stable performance,
- good generalisation,
- robustness to non-normalised variables,
- simple integration into the pipeline

→ This is the model selected for production (FastAPI).

In [28]:
best_model = models["XGBoost"]

pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", best_model)
])

pipeline.fit(X_train, Y_train)

joblib.dump(pipeline, "model.joblib")
print("Best model saved → model.joblib")

Best model saved → model.joblib
