In [147]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import matplotlib.patheffects as path_effects

import ipywidgets as widgets
from IPython.display import display

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
# Regressor models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

import joblib

In [134]:
data = pd.read_csv('../../data/avito_clean.csv')
df = data.copy()

#df = df.dropna()
df = df[df['type_annonce'] == 'à vendre']
df = df.drop_duplicates(subset=['lien'])

print(f"{len(df)} annonces.")


1234 annonces.


## ---------------------- filtering outliers --------------------------------

In [135]:
# Removing the listings within 1% margin (means 1% within the highestt or lowest price)
lower = df['prix'].quantile(0.01)
upper = df['prix'].quantile(0.99)

df_no_outliers = df[(df['prix'] >= lower) & (df['prix'] <= upper)]
print(f"Removed {len(df) - len(df_no_outliers)} outlier listings")

df = df_no_outliers


Removed 564 outlier listings


In [136]:
brand_counts = df['marque'].value_counts()
common_brands = brand_counts[brand_counts > 5].index  # threshold = 50 listings
df_no_outliers = df[df['marque'].isin(common_brands)]
print(f"Removed {len(df) - len(df_no_outliers)} outlier listings")
df = df_no_outliers

print(f"\nNumber of listings left: {len(df)}")


Removed 32 outlier listings

Number of listings left: 638


## ------------------------- ML analysis -----------------------------------

 ---------------------------------------------------- Regression (Prediction) ----------------------------------------------------------------------

In [137]:
df['car_age'] = 2026 - df['annee']
features = ['marque', 'modele', 'car_age', 'carburant']
target = 'prix'

pr_df = df.dropna(subset=features + [target])

x = pr_df[features]
y = pr_df[target]

categorical = ['marque', 'modele', 'carburant']
numerical = ['car_age']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', 'passthrough',numerical)
        ])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [138]:
### Linear regression prediction model
linear_regression_model = Pipeline(steps=[
    ('preprocess' , preprocessor),
    ('regressor', LinearRegression())
    ])


linear_regression_model.fit(x_train, y_train)

y_pred = linear_regression_model.predict(x_test)

print(f'Score train: {linear_regression_model.score(x_train, y_train)}\n')
print(f'Score test: {linear_regression_model.score(x_test, y_test)}\n')

print('Moyen Erreur absolut:', mean_absolute_error(y_test, y_pred))
print('Linear regression model score: ',r2_score(y_test, y_pred) * 100 ,'%')
linear_regression_model_score = r2_score(y_test, y_pred) * 100



Score train: 0.8754536680765109

Score test: 0.5685526799728586

Moyen Erreur absolut: 47518.23858899766
Linear regression model score:  56.855267997285864 %


In [139]:
### Decesion tree prediction model

decesion_tree_model = Pipeline(steps=[
    ('preprocess' , preprocessor),
    ('regressor', DecisionTreeRegressor(max_depth=14, min_samples_leaf=2))
    ])


decesion_tree_model.fit(x_train, y_train)

y_pred = decesion_tree_model.predict(x_test)

print(f'Score train: {decesion_tree_model.score(x_train, y_train)}\n')
print(f'Score test: {decesion_tree_model.score(x_test, y_test)}\n')

print('Moyen Erreur absolut:', mean_absolute_error(y_test, y_pred))
print('Decession tree model score: ',r2_score(y_test, y_pred) * 100 ,'%')
decesion_tree_model_score = r2_score(y_test, y_pred) * 100

Score train: 0.8657194392680349

Score test: 0.6057949992531662

Moyen Erreur absolut: 48260.94390616105
Decession tree model score:  60.57949992531662 %


In [140]:
### random forest prediction model

random_forest_model = Pipeline(steps=[
    ('preprocess' , preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=100,
        max_depth=15,
        min_samples_leaf=1,
        random_state=39))
    ])

random_forest_model.fit(x_train, y_train)

y_pred = random_forest_model.predict(x_test)

print(f'Score train: {random_forest_model.score(x_train, y_train)}\n')
print(f'Score test: {random_forest_model.score(x_test, y_test)}\n')

print('Moyen Erreur absolut:', mean_absolute_error(y_test, y_pred))
print('Decession tree model score: ',r2_score(y_test, y_pred) * 100 ,'%')
random_forest_model_score = r2_score(y_test, y_pred) * 100

Score train: 0.9019592255266207

Score test: 0.6504965098880751

Moyen Erreur absolut: 42857.53852742018
Decession tree model score:  65.0496509888075 %


In [141]:
### K-neighbours prediction model

K_neighbours_model = Pipeline(steps=[
    ('preprocess' , preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=2))
    ])

K_neighbours_model.fit(x_train, y_train)

y_pred = K_neighbours_model.predict(x_test)

print(f'Score train: {K_neighbours_model.score(x_train, y_train)}\n')
print(f'Score test: {K_neighbours_model.score(x_test, y_test)}\n')

print('Moyen Erreur absolut:', mean_absolute_error(y_test, y_pred))
print('Decession tree model score: ',r2_score(y_test, y_pred) * 100 ,'%')
K_neighbours_model_score = r2_score(y_test, y_pred) * 100

Score train: 0.8483003423914315

Score test: 0.6288978143177405

Moyen Erreur absolut: 42232.421875
Decession tree model score:  62.88978143177405 %


In [142]:
def predict (marque, modele, annee, carburant):
    car = pd.DataFrame([{
        'marque': marque,
        'modele': modele,
        'car_age': 2026 - annee,
        'carburant': carburant
    }])
    line_prediction = linear_regression_model.predict(car)[0]
    rf_prediction = random_forest_model.predict(car)[0]
    kn_prediction = K_neighbours_model.predict(car)[0]
    dt_prediction = decesion_tree_model.predict(car)[0]

    return (
        f"Regression lineaire : {round(line_prediction):,} Dh {round(linear_regression_model_score)} %\n"
        f"Random Forest       : {round(rf_prediction):,} Dh {round(random_forest_model_score)} %\n"
        f"K-Nearest Neigh     : {round(kn_prediction):,} Dh {round(K_neighbours_model_score)} %\n"
        f"Decision Tree         : {round(dt_prediction):,} Dh {round(decesion_tree_model_score)} %"
    )

In [143]:
model_price_year = df.dropna(subset=['marque', 'modele', 'carburant', 'car_age'])

brands = sorted(df['marque'].dropna().unique())

def get_models(brand):
    return sorted(df[df['marque'] == brand]['modele'].dropna().unique())

def show(brand, modele, year, fuel):
    plt.figure(figsize=(8, 4))
    plt.axis('off')
    plt.text(0.5, 0.5, predict(brand, modele, year, fuel), ha='center', va='center', fontsize=14)
    plt.show()

brand_dropdown = widgets.Dropdown(
    options=brands,
    description='Marque:'
)
modele_dropdown = widgets.Dropdown(
    description='Modèle:'
)
year_dropdown = widgets.Dropdown(
    options= (df['annee'].sort_values(ascending=False).dropna().unique()),
    description='Année:'
)
fuel_dropdown = widgets.Dropdown(
    options= df['carburant'].dropna().unique(), 
    description='Carburant:'
)

def update_models(*args):
    modele_dropdown.options = get_models(brand_dropdown.value)

brand_dropdown.observe(update_models, names='value')
update_models()

widgets.interact(show ,brand = brand_dropdown, modele=modele_dropdown, year=year_dropdown, fuel=fuel_dropdown)
print(predict('dacia', 'sandero', 12, 'diesel'))

interactive(children=(Dropdown(description='Marque:', options=('audi', 'bmw', 'citroën', 'dacia', 'fiat', 'for…

Regression lineaire : -28,479,348 Dh 57 %
Random Forest       : 45,504 Dh 65 %
K-Nearest Neigh     : 45,000 Dh 63 %
Decision Tree         : 45,000 Dh 61 %


In [144]:
results = pd.DataFrame({
    'Model': ['Linear Regression', 'Decision Tree', 'Random Forest', 'KNN'],
    'R2 Score (%)': [
        linear_regression_model_score,
        decesion_tree_model_score,
        random_forest_model_score,
        K_neighbours_model_score
    ]
})

results.sort_values('R2 Score (%)', ascending=False)

Unnamed: 0,Model,R2 Score (%)
2,Random Forest,65.049651
3,KNN,62.889781
1,Decision Tree,60.5795
0,Linear Regression,56.855268


In [145]:
model = random_forest_model.named_steps['regressor']
feature_names = random_forest_model.named_steps['preprocess'].get_feature_names_out()

importances = pd.Series(model.feature_importances_, index=feature_names)
importances.sort_values(ascending=False).head(15)

num__car_age                 0.291382
cat__marque_mercedes-benz    0.140128
cat__marque_porsche          0.078724
cat__marque_audi             0.046338
cat__modele_touareg          0.032865
cat__marque_land rover       0.032665
cat__modele_classe gle       0.029972
cat__modele_q8               0.029636
cat__modele_série 8          0.026269
cat__modele_a3               0.024235
cat__modele_wrangler         0.024046
cat__modele_cayenne          0.022057
cat__marque_bmw              0.019032
cat__marque_volkswagen       0.015933
cat__modele_classe gls       0.013618
dtype: float64

In [None]:
joblib.dump(linear_regression_model, "../../app/models/linear_model.pkl")
joblib.dump(random_forest_model, "../../app/models/random_forest_model.pkl")
joblib.dump(decesion_tree_model, "../../app/models/decision_tree_model.pkl")
joblib.dump(K_neighbours_model, "../../app/models/knn_model.pkl")


['../../app/models/knn_model.pkl']