In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv(r"scraped\final\RELEVANT_car_data_autovit_final_20250928_142949.csv")

In [3]:
df.dropna

<bound method DataFrame.dropna of                brand      model  \
0               Ford     Mondeo   
1               Ford       Kuga   
2                BMW    Seria 3   
3             Nissan     Pulsar   
4            Citroën  C4 Cactus   
...              ...        ...   
31922         Nissan    Qashqai   
31923  Mercedes-Benz        GLS   
31924         Nissan       Juke   
31925            BMW    Seria 3   
31926     Land Rover  Discovery   

                                          full_title  engine_displacement_cm3  \
0           Ford Mondeo 2.0 TDCi Powershift Titanium                   1997.0   
1         Ford Kuga 2.0 TDCi 4WD Powershift Titanium                      NaN   
2                                        BMW Seria 3                      NaN   
3                                      Nissan Pulsar                      NaN   
4      Citroën C4 Cactus 1.2 PureTech S&S BVM6 Shine                   1199.0   
...                                              ...       

In [4]:
df

Unnamed: 0,brand,model,full_title,engine_displacement_cm3,power_hp,ad_description,price_eur,mileage_km,fuel_type,production_year
0,Ford,Mondeo,Ford Mondeo 2.0 TDCi Powershift Titanium,1997.0,180.0,ford mondeo ver-2-0-tdci-powershift-titanium,13150.00,187014,Diesel,2017
1,Ford,Kuga,Ford Kuga 2.0 TDCi 4WD Powershift Titanium,,,,8999.00,175000,Diesel,2014
2,BMW,Seria 3,BMW Seria 3,,,,9999.00,194000,Diesel,2017
3,Nissan,Pulsar,Nissan Pulsar,,,,5999.00,116000,Benzina,2016
4,Citroën,C4 Cactus,Citroën C4 Cactus 1.2 PureTech S&S BVM6 Shine,1199.0,130.0,"Citron C4 Cactus 1.2i 130CP EURO6 ''Facelift""",8200.00,143000,Benzina,2016
...,...,...,...,...,...,...,...,...,...,...
31922,Nissan,Qashqai,Nissan Qashqai 1.3 DIG-T MHEV Xtronic Acenta,,,,20240.00,97000,Benzina,2023
31923,Mercedes-Benz,GLS,Mercedes-Benz GLS 580 MHEV 4MATIC Aut,3982.0,489.0,Mercedes-Benz Gls,92900.00,102816,Benzina,2022
31924,Nissan,Juke,Nissan Juke,,,,21990.00,10000,Hibrid,2024
31925,BMW,Seria 3,BMW Seria 3 320d xDrive AT MHEV,,,,29477.22,129000,Diesel,2021


In [5]:
df_clean = df.copy()
def remove_outliers_iqr(dataframe, column_name):
    Q1 = dataframe[column_name].quantile(0.25)
    Q3 = dataframe[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return dataframe[(dataframe[column_name] >= lower) & (dataframe[column_name] <= upper)]
df_clean = remove_outliers_iqr(df_clean, 'price_eur')
df_clean = remove_outliers_iqr(df_clean, 'mileage_km')
df_clean = remove_outliers_iqr(df_clean, 'engine_displacement_cm3')
df_clean = remove_outliers_iqr(df_clean, 'power_hp')

In [6]:
df_model = df_clean.copy()

df_model = pd.get_dummies(df_model, columns=['brand', 'model', 'fuel_type', 'full_title'], drop_first=True)

X = df_model.drop(['price_eur', 'ad_description'], axis=1)
y = df_model['price_eur']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [8]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [9]:
print("Intercept:", lr.intercept_)
print("Number of coefficients:", len(lr.coef_))

Intercept: -1645093.0344839129
Number of coefficients: 9459


In [10]:
y_pred = lr.predict(X_test)

In [11]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("R2 score:", r2)

MSE: 15974145.592366945
R2 score: 0.8490079810289543


In [12]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Remove outliers, narrow the range of data
def remove_extreme_outliers(df, column, factor=3):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - factor * IQR
    upper = Q3 + factor * IQR
    return df[(df[column] >= lower) & (df[column] <= upper)]

def clean_data(df):
    df = df.copy()
    
    #Filter ranges before removing outliers
    df = df[(df['price_eur'] >= 500) & (df['price_eur'] <= 200000)]
    df = df[(df['mileage_km'] >= 0) & (df['mileage_km'] <= 500000)]
    df = df[(df['engine_displacement_cm3'].isna()) | 
            ((df['engine_displacement_cm3'] >= 500) & (df['engine_displacement_cm3'] <= 6500))]
    df = df[(df['power_hp'].isna()) | ((df['power_hp'] >= 30) & (df['power_hp'] <= 800))]
    
    #Remove extreme outliers using 3x IQR
    for col in ['price_eur', 'mileage_km', 'engine_displacement_cm3', 'power_hp']:
        if col in df.columns:
            df = remove_extreme_outliers(df, col, factor=3)
    
    #Strip whitespace from column names
    df.columns = df.columns.str.strip()
    
    return df

#Reshape fields, like turning production year into a more valuable value in the form of the age of the car
def engineer_features(df):
    df = df.copy()
    current_year = pd.Timestamp.now().year
    
    #Car age
    df['age'] = current_year - df['production_year']
    
    #Mileage per year
    df['km_per_year'] = df['mileage_km'] / df['age'].replace(0,1)
    
    #Power per liter
    df['power_per_liter'] = df['power_hp'] / df['engine_displacement_cm3'].replace(0,1)
    
    return df


#Encode top-N categorical variables
def encode_top_categories(df, col_name, top_n=10, target='price_eur'):
    df = df.copy()
    top_categories = df[col_name].value_counts().head(top_n).index
    df[col_name + '_encoded'] = df[col_name].where(df[col_name].isin(top_categories), 'Other')
    
    #Target encoding
    mean_price = df.groupby(col_name + '_encoded')[target].mean()
    df[col_name + '_encoded'] = df[col_name + '_encoded'].map(mean_price)
    
    return df

def encode_fuel_type(df):
    df = df.copy()
    if 'fuel_type' in df.columns:
        df = pd.get_dummies(df, columns=['fuel_type'], drop_first=True)
    return df

#Prepare X/y
def prepare_xy(df):
    df = df.copy()
    
    #Drop text columns
    drop_cols = ['ad_description', 'full_title', 'brand', 'model']
    for c in drop_cols:
        if c in df.columns:
            df.drop(c, axis=1, inplace=True)
    
    #Fill NaNs with median
    for col in df.select_dtypes(include=['float', 'int']).columns:
        df[col] = df[col].fillna(df[col].median())
    
    #Separate target
    y = np.log(df['price_eur'])  
    X = df.drop('price_eur', axis=1)
    
    return X, y

#Apply all steps
def process_and_split(df, top_brand_n=10, top_model_n=10):
    df = clean_data(df)
    df = engineer_features(df)
    
    #Encode top brands/models
    if 'brand' in df.columns:
        df = encode_top_categories(df, 'brand', top_n=top_brand_n)
    if 'model' in df.columns:
        df = encode_top_categories(df, 'model', top_n=top_model_n)
    
    df = encode_fuel_type(df)

    X, y = prepare_xy(df)
    
    #Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test

#Train Linear Regression
def train_and_evaluate(df):
    X_train, X_test, y_train, y_test = process_and_split(df)
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(np.exp(y_test), np.exp(y_pred))
    r2 = r2_score(np.exp(y_test), np.exp(y_pred))
    
    print("MSE:", mse)
    print("RMSE:", np.sqrt(mse))
    print("R2 score:", r2)
    
    return model, X_train.columns

model, feature_columns = train_and_evaluate(df)

MSE: 29642069.5404838
RMSE: 5444.453098382224
R2 score: 0.8508642150621978


In [13]:
scores_10fold = cross_val_score(model, X, y, cv=10, scoring='r2')

In [14]:
print("10-Fold Cross validation R2 scores:", scores_10fold)

10-Fold Cross validation R2 scores: [0.85221789 0.87347791 0.88335139 0.86445533 0.86232355 0.81332627
 0.86241514 0.86728136 0.85297921 0.86557982]


In [None]:
from sklearn.ensemble import RandomForestRegressor

def train_random_forest(df, n_estimators=200, max_depth=None):
    #Preprocess dataset, like for Linear Regression
    X_train, X_test, y_train, y_test = process_and_split(df)
    
    #Random Forest
    rf = RandomForestRegressor(n_estimators=n_estimators, 
                               max_depth=max_depth, 
                               random_state=42,
                               n_jobs=-1)
    
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    
    #Evaluate RF
    mse = mean_squared_error(np.exp(y_test), np.exp(y_pred))
    r2 = r2_score(np.exp(y_test), np.exp(y_pred))
    
    print("Random Forest Results:")
    print("MSE:", mse)
    print("RMSE:", np.sqrt(mse))
    print("R2 score:", r2)
    
    return rf, X_train.columns

rf_model, features = train_random_forest(df)

Random Forest Results:
MSE: 17125596.03182926
RMSE: 4138.308353884381
R2 score: 0.9138373519012775


In [16]:
scores_10fold_rf = cross_val_score(rf_model, X, y, cv=10, scoring='r2')
print("10-Fold Cross validation R2 scores for Random Forest:", scores_10fold_rf)

10-Fold Cross validation R2 scores for Random Forest: [0.91172681 0.92239175 0.92806372 0.92765798 0.92999592 0.90504358
 0.91553399 0.92119686 0.92438536 0.92399506]


In [17]:
from sklearn.ensemble import GradientBoostingRegressor
def train_gradient_boosting(df, n_estimators=200, learning_rate=0.1, max_depth=3):
    #Preprocess dataset like for Linear Regression again
    X_train, X_test, y_train, y_test = process_and_split(df)
    
    #Gradient Boosting
    gb = GradientBoostingRegressor(n_estimators=n_estimators, 
                                   learning_rate=learning_rate,
                                   max_depth=max_depth,
                                   random_state=42)
    
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    
    #Evaluate GB
    mse = mean_squared_error(np.exp(y_test), np.exp(y_pred))
    r2 = r2_score(np.exp(y_test), np.exp(y_pred))
    
    print("Gradient Boosting Results:")
    print("MSE:", mse)
    print("RMSE:", np.sqrt(mse))
    print("R2 score:", r2)
    
    return gb, X_train.columns


In [18]:
gb_model, features = train_gradient_boosting(df)

Gradient Boosting Results:
MSE: 18854472.50095325
RMSE: 4342.173706906859
R2 score: 0.9051389933426364


In [19]:
features

Index(['engine_displacement_cm3', 'power_hp', 'mileage_km', 'production_year',
       'age', 'km_per_year', 'power_per_liter', 'brand_encoded',
       'model_encoded', 'fuel_type_Benzina + CNG', 'fuel_type_Benzina + GPL',
       'fuel_type_Diesel', 'fuel_type_Hibrid', 'fuel_type_Hibrid Plug-In'],
      dtype='object')

In [20]:
scores_10fold_gb = cross_val_score(gb_model, X, y, cv=10, scoring='r2')
print("Gradient Boosting 10-Fold Cross validation R2 scores:", scores_10fold_gb)

Gradient Boosting 10-Fold Cross validation R2 scores: [0.88378771 0.90260643 0.91520595 0.90960144 0.90951142 0.8937109
 0.90468172 0.91057098 0.90028439 0.90562796]
