In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.linear_model import LinearRegression
import os
from supabase import create_client, Client
from dotenv import load_dotenv

In [2]:
# Load variables from .env into the environment
load_dotenv()

# Read variables
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")

In [3]:
# Initialize client
supabase: Client = create_client(supabase_url, supabase_key)

## Load Data

In [90]:
car_adverts_table = 'autoscout_car_adverts'
response = supabase.table(car_adverts_table).select("car_id, make, model, mileage, listing_price, months_since_2000, power_pk").execute()
df_full = pd.DataFrame(response.data)

In [91]:
df_full

Unnamed: 0,car_id,make,model,mileage,listing_price,months_since_2000,power_pk
0,093b2d48-ff65-480e-9be0-3d4fa3737925,audi,a3,15,47490,301.0,
1,ca26948e-ad6d-4df2-8e30-74ba423e8147,opel,movano,10,47856,300.0,
2,7706f159-2b71-46a8-9611-ef1a62f05f5a,volkswagen,golf,10,47350,301.0,
3,7e8eee11-6b84-4838-85fb-7fc2a5964e9d,mercedes-benz,citan,10,47184,301.0,
4,1d585d70-062b-4f97-9b6b-7f3699ad49da,volkswagen,crafter,10,47130,300.0,
...,...,...,...,...,...,...,...
357029,09ee99be-81e3-4458-8464-66de42f83d1b,renault,master,20,46677,300.0,
357030,a93a581f-571a-4fbf-bb26-929eb1a55dd7,kia,ev3,5,46209,306.0,
357031,3ba87ddd-ceab-40ce-8552-7344f8705041,cupra,terramar,10,46990,301.0,
357032,4ab72afe-c022-4f99-8664-e5c311b41ae1,bmw,320,50549,22950,212.0,


## Prepare Data for Training

In [92]:
df_clean = df_full.where(df_full['mileage']>0, None) # Filter out rows with mileage < 0, which are missing values
df_clean = df_clean.dropna(axis=0, subset=['mileage', 'listing_price', 'months_since_2000']) # Filter out rows with mileage < 0, which are missing values

In [100]:
df_clean = df_clean.fillna(0)

In [102]:
df_clean.head()

Unnamed: 0,car_id,make,model,mileage,listing_price,months_since_2000,power_pk
0,093b2d48-ff65-480e-9be0-3d4fa3737925,audi,a3,15.0,47490.0,301.0,0.0
1,ca26948e-ad6d-4df2-8e30-74ba423e8147,opel,movano,10.0,47856.0,300.0,0.0
2,7706f159-2b71-46a8-9611-ef1a62f05f5a,volkswagen,golf,10.0,47350.0,301.0,0.0
3,7e8eee11-6b84-4838-85fb-7fc2a5964e9d,mercedes-benz,citan,10.0,47184.0,301.0,0.0
4,1d585d70-062b-4f97-9b6b-7f3699ad49da,volkswagen,crafter,10.0,47130.0,300.0,0.0


In [103]:
x = df_clean[['model','make','mileage', 'months_since_2000', 'power_pk']]

In [104]:
y = df_clean[['listing_price']]

In [105]:
categorical_low = ['make']
categorical_high = ['model']
numerical = ['mileage', 'months_since_2000', 'power_pk']

preprocessor = ColumnTransformer(
    transformers=[
        ('make', TargetEncoder(), categorical_low),
        ('model', TargetEncoder(), categorical_high),
        ('num', 'passthrough', numerical)
    ]
)

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])


In [106]:
model

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('make', ...), ('model', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [107]:
model.fit(x, y)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('make', ...), ('model', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [123]:
new_car = pd.DataFrame([{
    'make': 'toyota',
    'model': 'corolla',
    'mileage': 100,
    'months_since_2000': 0,
    'power_pk': 100
}])

In [124]:
y_pred = model.predict(new_car)

In [125]:
y_pred

array([[29178.09002746]])

In [130]:
scores = cross_val_score(model, x, y, cv=5, scoring='neg_mean_squared_error')

In [131]:
print("CV MAE:", scores.mean())

CV MAE: -1321052097.5045776
