In [80]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import matplotlib.patheffects as path_effects

import ipywidgets as widgets
from IPython.display import display

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
# Regressor models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

In [81]:
data = pd.read_csv('../../data/avito_clean.csv')
df = data.copy()

#df = df.dropna()
df = df[df['type_annonce'] == 'à vendre']
df = df.drop_duplicates(subset=['lien'])

print(f"{len(df)} annonces.")


1234 annonces.


## ---------------------- filtering outliers --------------------------------

In [None]:
# Removing the listings within 1% margin (means 1% within the hight or lowest price)
lower = df['prix'].quantile(0.01)
upper = df['prix'].quantile(0.99)

df = df[(df['prix'] >= lower) & (df['prix'] <= upper)]


print(f"Removed {len(df) - len(df)} outlier listings")


Removed 564 outlier listings


## ------------------------- ML analysis -----------------------------------

 ---------------------------------------------------- Regression (Prediction) ----------------------------------------------------------------------

In [82]:
### Linear regression prediction model
df['car_age'] = 2026 - df['annee']
features = ['marque', 'modele', 'car_age', 'carburant']
target = 'prix'

pr_df = df.dropna(subset=features + [target])

x = pr_df[features]
y = pr_df[target]

categorical = ['marque', 'modele', 'carburant']
numerical = ['car_age']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', 'passthrough',numerical)
        ])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


linear_regression_model = Pipeline(steps=[
    ('preprocess' , preprocessor),
    ('regressor', LinearRegression())
    ])


linear_regression_model.fit(x_train, y_train)

y_pred = linear_regression_model.predict(x_test)

print(f'Score train: {linear_regression_model.score(x_train, y_train)}\n')
print(f'Score test: {linear_regression_model.score(x_test, y_test)}\n')

print('Moyen Erreur absolut:', mean_absolute_error(y_test, y_pred))
print('Linear regression model score: ',r2_score(y_test, y_pred) * 100 ,'%')
linear_regression_model_score = r2_score(y_test, y_pred) * 100



Score train: 0.6618206254188688

Score test: 0.4655503511523522

Moyen Erreur absolut: 73207.69808747704
Linear regression model score:  46.55503511523522 %


In [83]:
### Decesion tree prediction model

decesion_tree_model = Pipeline(steps=[
    ('preprocess' , preprocessor),
    ('regressor', DecisionTreeRegressor())
    ])


decesion_tree_model.fit(x_train, y_train)

y_pred = decesion_tree_model.predict(x_test)

print(f'Score train: {decesion_tree_model.score(x_train, y_train)}\n')
print(f'Score test: {decesion_tree_model.score(x_test, y_test)}\n')

print('Moyen Erreur absolut:', mean_absolute_error(y_test, y_pred))
print('Decession tree model score: ',r2_score(y_test, y_pred) * 100 ,'%')
decesion_tree_model_score = r2_score(y_test, y_pred) * 100

Score train: 0.9445178929530184

Score test: 0.1921337643222455

Moyen Erreur absolut: 72372.87104622871
Decession tree model score:  19.21337643222455 %


In [84]:
### random forest prediction model

random_forest_model = Pipeline(steps=[
    ('preprocess' , preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=200,
        max_depth=15,
        min_samples_leaf=5,
        random_state=42))
    ])

random_forest_model.fit(x_train, y_train)

y_pred = random_forest_model.predict(x_test)

print(f'Score train: {random_forest_model.score(x_train, y_train)}\n')
print(f'Score test: {random_forest_model.score(x_test, y_test)}\n')

print('Moyen Erreur absolut:', mean_absolute_error(y_test, y_pred))
print('Decession tree model score: ',r2_score(y_test, y_pred) * 100 ,'%')
random_forest_model_score = r2_score(y_test, y_pred) * 100

Score train: 0.49401621084843783

Score test: 0.47915995815260803

Moyen Erreur absolut: 75494.17498522463
Decession tree model score:  47.9159958152608 %


In [85]:
### K-neighbours prediction model

K_neighbours_model = Pipeline(steps=[
    ('preprocess' , preprocessor),
    ('regressor', KNeighborsRegressor())
    ])

K_neighbours_model.fit(x_train, y_train)

y_pred = K_neighbours_model.predict(x_test)

print(f'Score train: {K_neighbours_model.score(x_train, y_train)}\n')
print(f'Score test: {K_neighbours_model.score(x_test, y_test)}\n')

print('Moyen Erreur absolut:', mean_absolute_error(y_test, y_pred))
print('Decession tree model score: ',r2_score(y_test, y_pred) * 100 ,'%')
K_neighbours_model_score = r2_score(y_test, y_pred) * 100

Score train: 0.56392429111826

Score test: 0.40067502838475455

Moyen Erreur absolut: 73821.31678832116
Decession tree model score:  40.067502838475455 %


In [86]:
df.head()

Unnamed: 0,lien,titre_annonce,prix,proprietere,images,ville,quartier,date,category,type_annonce,annee,transmission,carburant,kilometrage,marque,modele,equipements,car_age
0,https://www.avito.ma/fr/hay_el_fath/voitures_d...,RANGE ROVER SPORT HSE,,Amine,['https://content.avito.ma/classifieds/images/...,rabat,hay el fath,1 heure,voitures d'occasion,à vendre,2016.0,automatique,diesel,135000.0,land rover,range rover sport,"['abs', 'airbags', 'cd/mp3/bluetooth', 'caméra...",10.0
1,https://www.avito.ma/fr/anfa/voitures_d_occasi...,Alfa Romeo Stelvio Quadrifoglio V6 Bi-trubo 510CH,640000.0,Stellantis You Casablanca,['https://content.avito.ma/classifieds/images/...,casablanca,anfa,23 heures,voitures d'occasion,à vendre,2022.0,automatique,essence,60000.0,alfa romeo,stelvio,[],4.0
2,https://www.avito.ma/fr/sidi_maarouf/voitures_...,dacia Duster,,auto fyi word services,['https://content.avito.ma/classifieds/images/...,casablanca,sidi maarouf,3 heures,voitures d'occasion,à vendre,2020.0,manuelle,diesel,130.0,dacia,duster,"['abs', 'airbags', 'cd/mp3/bluetooth', 'caméra...",6.0
3,https://www.avito.ma/fr/anfa/voitures_d_occasi...,Renault Express Diesel Manuelle 2023 à Casablanca,158000.0,Stellantis You Casablanca,['https://content.avito.ma/classifieds/images/...,casablanca,anfa,7 minutes,voitures d'occasion,à vendre,2023.0,manuelle,diesel,25000.0,renault,express,[],3.0
4,https://www.avito.ma/fr/bourgogne/voitures_d_o...,BMW Série 5 pack M edition 2025 à Casablanca,,Promorent Auto,['https://content.avito.ma/classifieds/images/...,casablanca,bourgogne,7 minutes,voitures d'occasion,à vendre,2025.0,automatique,diesel,10000.0,bmw,série 5,"['abs', 'airbags', 'cd/mp3/bluetooth', 'caméra...",1.0


In [90]:
def predict (marque, modele, annee, carburant):
    car = pd.DataFrame([{
        'marque': marque,
        'modele': modele,
        'car_age': 2026 - annee,
        'carburant': carburant
    }])
    line_prediction = linear_regression_model.predict(car)[0]
    rf_prediction = random_forest_model.predict(car)[0]
    kn_prediction = K_neighbours_model.predict(car)[0]
    dt_prediction = decesion_tree_model.predict(car)[0]

    return (
        f"Regression lineaire : {round(line_prediction):,} Dh {round(linear_regression_model_score)} %\n"
        f"Random Forest       : {round(rf_prediction):,} Dh {round(random_forest_model_score)} %\n"
        f"K-Nearest Neigh     : {round(kn_prediction):,} Dh {round(K_neighbours_model_score)} %\n"
        f"Decision Tree         : {round(dt_prediction):,} Dh {round(decesion_tree_model_score)} %"
    )

In [None]:
model_price_year = df.dropna(subset=['marque', 'modele', 'carburant', 'car_age'])

brands = sorted(df['marque'].dropna().unique())

def get_models(brand):
    return sorted(df[df['marque'] == brand]['modele'].dropna().unique())

def show(brand, modele, year, fuel):
    plt.figure(figsize=(8, 4))
    plt.axis('off')
    plt.text(0.5, 0.5, predict(brand, modele, year, fuel), ha='center', va='center', fontsize=14)
    plt.show()

brand_dropdown = widgets.Dropdown(
    options=brands,
    description='Marque:'
)
modele_dropdown = widgets.Dropdown(
    description='Modèle:'
)
year_dropdown = widgets.Dropdown(
    options= (df['annee'].sort_values(ascending=False).dropna().unique()),
    description='Année:'
)
fuel_dropdown = widgets.Dropdown(
    options= df['carburant'].dropna().unique(), 
    description='Carburant:'
)

def update_models(*args):
    modele_dropdown.options = get_models(brand_dropdown.value)

brand_dropdown.observe(update_models, names='value')
update_models()

widgets.interact(show ,brand = brand_dropdown, modele=modele_dropdown, year=year_dropdown, fuel=fuel_dropdown)
print(predict('dacia', 'sandero', 12, 'diesel'))

interactive(children=(Dropdown(description='Marque:', options=('abarth', 'alfa romeo', 'audi', 'bentley', 'bmw…

Regression lineaire : -34,001,405 Dh 47 %
Random Forest       : 36,829 Dh 48 %
K-Nearest Neigh     : 34,800 Dh 40 %
Decision Tree         : 17,000 Dh 19 %


In [92]:
results = pd.DataFrame({
    'Model': ['Linear Regression', 'Decision Tree', 'Random Forest', 'KNN'],
    'R2 Score (%)': [
        linear_regression_model_score,
        decesion_tree_model_score,
        random_forest_model_score,
        K_neighbours_model_score
    ]
})

results.sort_values('R2 Score (%)', ascending=False)

Unnamed: 0,Model,R2 Score (%)
2,Random Forest,47.915996
0,Linear Regression,46.555035
3,KNN,40.067503
1,Decision Tree,19.213376


In [93]:
model = random_forest_model.named_steps['regressor']
feature_names = random_forest_model.named_steps['preprocess'].get_feature_names_out()

importances = pd.Series(model.feature_importances_, index=feature_names)
importances.sort_values(ascending=False).head(15)

num__car_age                 0.380647
cat__marque_porsche          0.134236
cat__marque_mercedes-benz    0.128426
cat__marque_audi             0.088598
cat__modele_megane           0.044666
cat__modele_touareg          0.025692
cat__marque_bmw              0.025683
cat__marque_land rover       0.023549
cat__modele_cayenne          0.021396
cat__marque_dacia            0.021359
cat__marque_volkswagen       0.015490
cat__carburant_hybride       0.012817
cat__modele_a4               0.009984
cat__marque_fiat             0.008688
cat__marque_renault          0.008630
dtype: float64