In [204]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import matplotlib.patheffects as path_effects

import ipywidgets as widgets
from IPython.display import display

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
# Regressor models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import LabelEncoder

In [205]:
data = pd.read_csv('../../data/avito_clean.csv')
df = data.copy()

#df = df.dropna()
df = df[df['type_annonce'] == 'à vendre']
df = df.drop_duplicates(subset=['lien'])

print(f"{len(df)} annonces.")


1234 annonces.


In [206]:
df['transmission'] = df['transmission'].apply(lambda x : 1 if x == 'automatique' else 0)

df['abs'] = df['equipements'].apply(lambda x : 1 if len(x) > 0 else 0)
df['airbags'] = df['equipements'].apply(lambda x : 1 if len(x) > 1 else 0)
df['mp3'] = df['equipements'].apply(lambda x : 1 if len(x) > 2 else 0)
df['climatisation'] = df['equipements'].apply(lambda x : 1 if len(x) > 3 else 0)
df['camera'] = df['equipements'].apply(lambda x : 1 if len(x) > 4 else 0)
df['esp'] = df['equipements'].apply(lambda x : 1 if len(x) > 5 else 0)

df['transmission'].head()

0    1
1    1
2    0
3    0
4    1
Name: transmission, dtype: int64

In [207]:
### Linear regression prediction model

features = ['marque', 'modele', 'annee', 'carburant',
             'kilometrage', 'transmission', 'abs', 'airbags',
             'mp3', 'camera', 'climatisation', 'esp']
target = 'prix'

pr_df = df.dropna(subset=features + [target])

x = pr_df[features]
y = pr_df[target]

categorical = ['marque', 'modele', 'carburant']
numerical = ['annee', 'kilometrage']
boolean = ['transmission', 'abs', 'airbags',
             'mp3', 'camera', 'climatisation', 'esp']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', 'passthrough',numerical),
        ('bool', 'passthrough', boolean)
        ])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


linear_regression_model = Pipeline(steps=[
    ('preprocess' , preprocessor),
    ('regressor', LinearRegression())
    ])


linear_regression_model.fit(x_train, y_train)

y_pred = linear_regression_model.predict(x_test)

print(f'Score train: {linear_regression_model.score(x_train, y_train)}\n')
print(f'Score test: {linear_regression_model.score(x_test, y_test)}\n')

print('Moyen Erreur absolut:', mean_absolute_error(y_test, y_pred))
print('Linear regression model score: ',r2_score(y_test, y_pred) * 100 ,'%')
linear_regression_model_score = r2_score(y_test, y_pred) * 100



Score train: 0.31557749398792445

Score test: 0.3801557294592579

Moyen Erreur absolut: 79867.47231046515
Linear regression model score:  38.01557294592579 %


In [208]:
### Decesion tree prediction model

decesion_tree_model = Pipeline(steps=[
    ('preprocess' , preprocessor),
    ('regressor', DecisionTreeRegressor())
    ])


decesion_tree_model.fit(x_train, y_train)

y_pred = decesion_tree_model.predict(x_test)

print(f'Score train: {decesion_tree_model.score(x_train, y_train)}\n')
print(f'Score test: {decesion_tree_model.score(x_test, y_test)}\n')

print('Moyen Erreur absolut:', mean_absolute_error(y_test, y_pred))
print('Decession tree model score: ',r2_score(y_test, y_pred) * 100 ,'%')
decesion_tree_model_score = r2_score(y_test, y_pred) * 100

Score train: 0.9998040713232028

Score test: -1.5106682490063288

Moyen Erreur absolut: 91786.14598540145
Decession tree model score:  -151.06682490063287 %


In [209]:
### random forest prediction model

random_forest_model = Pipeline(steps=[
    ('preprocess' , preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=200,
        max_depth=15,
        min_samples_leaf=5,
        random_state=42))
    ])

random_forest_model.fit(x_train, y_train)

y_pred = random_forest_model.predict(x_test)

print(f'Score train: {random_forest_model.score(x_train, y_train)}\n')
print(f'Score test: {random_forest_model.score(x_test, y_test)}\n')

print('Moyen Erreur absolut:', mean_absolute_error(y_test, y_pred))
print('Decession tree model score: ',r2_score(y_test, y_pred) * 100 ,'%')
random_forest_model_score = r2_score(y_test, y_pred) * 100

Score train: 0.5534288003351204

Score test: 0.47643815196092143

Moyen Erreur absolut: 70933.02791002362
Decession tree model score:  47.64381519609214 %


In [210]:
### K-neighbours prediction model

K_neighbours_model = Pipeline(steps=[
    ('preprocess' , preprocessor),
    ('regressor', KNeighborsRegressor())
    ])

K_neighbours_model.fit(x_train, y_train)

y_pred = K_neighbours_model.predict(x_test)

print(f'Score train: {K_neighbours_model.score(x_train, y_train)}\n')
print(f'Score test: {K_neighbours_model.score(x_test, y_test)}\n')

print('Moyen Erreur absolut:', mean_absolute_error(y_test, y_pred))
print('Decession tree model score: ',r2_score(y_test, y_pred) * 100 ,'%')
K_neighbours_model_score = r2_score(y_test, y_pred) * 100

Score train: 0.3503011895713215

Score test: -0.06701047795449888

Moyen Erreur absolut: 109842.1109489051
Decession tree model score:  -6.701047795449888 %


In [211]:
df.head()

Unnamed: 0,lien,titre_annonce,prix,proprietere,images,ville,quartier,date,category,type_annonce,...,kilometrage,marque,modele,equipements,abs,airbags,mp3,climatisation,camera,esp
0,https://www.avito.ma/fr/hay_el_fath/voitures_d...,RANGE ROVER SPORT HSE,,Amine,['https://content.avito.ma/classifieds/images/...,rabat,hay el fath,1 heure,voitures d'occasion,à vendre,...,135000.0,land rover,range rover sport,"['abs', 'airbags', 'cd/mp3/bluetooth', 'caméra...",1,1,1,1,1,1
1,https://www.avito.ma/fr/anfa/voitures_d_occasi...,Alfa Romeo Stelvio Quadrifoglio V6 Bi-trubo 510CH,640000.0,Stellantis You Casablanca,['https://content.avito.ma/classifieds/images/...,casablanca,anfa,23 heures,voitures d'occasion,à vendre,...,60000.0,alfa romeo,stelvio,[],1,1,0,0,0,0
2,https://www.avito.ma/fr/sidi_maarouf/voitures_...,dacia Duster,,auto fyi word services,['https://content.avito.ma/classifieds/images/...,casablanca,sidi maarouf,3 heures,voitures d'occasion,à vendre,...,130.0,dacia,duster,"['abs', 'airbags', 'cd/mp3/bluetooth', 'caméra...",1,1,1,1,1,1
3,https://www.avito.ma/fr/anfa/voitures_d_occasi...,Renault Express Diesel Manuelle 2023 à Casablanca,158000.0,Stellantis You Casablanca,['https://content.avito.ma/classifieds/images/...,casablanca,anfa,7 minutes,voitures d'occasion,à vendre,...,25000.0,renault,express,[],1,1,0,0,0,0
4,https://www.avito.ma/fr/bourgogne/voitures_d_o...,BMW Série 5 pack M edition 2025 à Casablanca,,Promorent Auto,['https://content.avito.ma/classifieds/images/...,casablanca,bourgogne,7 minutes,voitures d'occasion,à vendre,...,10000.0,bmw,série 5,"['abs', 'airbags', 'cd/mp3/bluetooth', 'caméra...",1,1,1,1,1,1


In [212]:
['marque', 'modele', 'annee', 'carburant',
             'kilometrage', 'transmission', 'abs', 'airbags',
             'mp3', 'camera', 'climatisation', 'esp']
def predict (marque, modele, annee, carburant, mileage, transmission, abs, airbags,mp3, camera, climatisation, esp):

    car = pd.DataFrame([{
        'marque': marque,
        'modele': modele,
        'annee': annee,
        'carburant': carburant,
        'kilometrage': mileage,
        'transmission': transmission,
        'abs': abs,
        'airbags': airbags,
        'mp3': mp3,
        'camera': camera,
        'climatisation': climatisation,
        'esp': esp

   }])
    line_prediction = linear_regression_model.predict(car)[0]
    rf_prediction = random_forest_model.predict(car)[0]
    kn_prediction = K_neighbours_model.predict(car)[0]
    dt_prediction = decesion_tree_model.predict(car)[0]

    return (
        f"Regression lineaire : {round(line_prediction):,} Dh {round(linear_regression_model_score)} %\n"
        f"Random Forest       : {round(rf_prediction):,} Dh {round(random_forest_model_score)} %\n"
        f"K-Nearest Neigh     : {round(kn_prediction):,} Dh {round(K_neighbours_model_score)} %\n"
        f"Decision Tree         : {round(dt_prediction):,} Dh {round(decesion_tree_model_score)} %"
    )

In [218]:
model_price_year = df.dropna(subset=['marque', 'modele', 'carburant', 'annee', 'kilometrage'])

brands = sorted(df['marque'].dropna().unique())

def get_models(brand):
    return sorted(df[df['marque'] == brand]['modele'].dropna().unique())

def show(brand, modele, year, fuel, mileage):
    plt.figure(figsize=(8, 4))
    plt.axis('off')
    plt.text(0.5, 0.5, predict(brand, modele, year, fuel, mileage), ha='center', va='center', fontsize=14)
    plt.show()

brand_dropdown = widgets.Dropdown(
    options=brands,
    description='Marque:'
)
modele_dropdown = widgets.Dropdown(
    description='Modèle:'
)
year_dropdown = widgets.Dropdown(
    options= (df['annee'].sort_values(ascending=False).dropna().unique()),
    description='Année:'
)
fuel_dropdown = widgets.Dropdown(
    options= df['carburant'].dropna().unique(), 
    description='Carburant:'
)
mileage_dropdown = widgets.Dropdown(
    description= 'kilometrage~:',
    options= [i * 50000 for i in range(0, 9)]
)

def update_models(*args):
    modele_dropdown.options = get_models(brand_dropdown.value)

brand_dropdown.observe(update_models, names='value')
update_models()

#widgets.interact(show ,brand = brand_dropdown, modele=modele_dropdown, year=year_dropdown, fuel=fuel_dropdown, mileage=mileage_dropdown)
print(predict('dacia', 'sandero', 2021, 'diesel', 200000, transmission=0, abs=0, airbags=0 ,mp3=0 ,camera=0 , climatisation=True, esp=True))

Regression lineaire : 119,327 Dh 38 %
Random Forest       : 134,007 Dh 48 %
K-Nearest Neigh     : 249,600 Dh -7 %
Decision Tree         : 137,000 Dh -151 %


In [214]:
results = pd.DataFrame({
    'Model': ['Linear Regression', 'Decision Tree', 'Random Forest', 'KNN'],
    'R2 Score (%)': [
        linear_regression_model_score,
        decesion_tree_model_score,
        random_forest_model_score,
        K_neighbours_model_score
    ]
})

results.sort_values('R2 Score (%)', ascending=False)

Unnamed: 0,Model,R2 Score (%)
2,Random Forest,47.643815
0,Linear Regression,38.015573
3,KNN,-6.701048
1,Decision Tree,-151.066825


In [215]:
model = random_forest_model.named_steps['regressor']
feature_names = random_forest_model.named_steps['preprocess'].get_feature_names_out()

importances = pd.Series(model.feature_importances_, index=feature_names)
importances.sort_values(ascending=False).head(15)

bool__transmission           0.263302
num__annee                   0.220469
num__kilometrage             0.124085
cat__marque_porsche          0.114858
cat__marque_mercedes-benz    0.070554
cat__marque_audi             0.045289
cat__modele_megane           0.025784
cat__modele_cayenne          0.020407
cat__modele_touareg          0.015235
cat__marque_renault          0.013529
cat__modele_logan            0.011363
cat__marque_bmw              0.010462
cat__carburant_essence       0.010007
cat__marque_land rover       0.008435
cat__modele_a4               0.005812
dtype: float64