In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import matplotlib.patheffects as path_effects

import ipywidgets as widgets
from IPython.display import display

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
# Regressor models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

import joblib

In [None]:
# Load the cleaned Avito listings export; `data` keeps the untouched frame.
data = pd.read_csv('../../data/avito_clean.csv')

# Keep for-sale listings only, then collapse repeats of the same listing URL
# (`lien`), keeping the first occurrence. Missing values are not dropped here.
# NOTE(review): the literal below looks like mojibake for "à vendre" — confirm
# it matches how the value is actually stored in the CSV before changing it.
for_sale = data['type_annonce'] == 'Ã  vendre'
df = data[for_sale].drop_duplicates(subset=['lien'])

print(f"{len(df)} annonces.")


1234 annonces.


## ---------------------- filtering outliers --------------------------------

In [None]:
# Trim extreme prices: keep listings whose price lies between the 1st and the
# 99th percentile of `prix`, bounds included. Rows with a missing price fail the
# range test as well, so they are dropped here and counted in the message.
prices = df['prix']
lower = prices.quantile(0.01)
upper = prices.quantile(0.99)

df_no_outliers = df[prices.between(lower, upper)]
removed_count = len(df) - len(df_no_outliers)
print(f"Removed {removed_count} outlier listings")

# Rebinds `df`, so re-running this cell trims the already-trimmed data again.
df = df_no_outliers


Removed 564 outlier listings


In [None]:
# Drop listings from rarely seen brands: a brand is kept only when it has
# strictly more than MIN_LISTINGS_PER_BRAND listings in the current frame.
# Rows with a missing brand never match a kept brand, so they are removed too.
MIN_LISTINGS_PER_BRAND = 5

brand_counts = df['marque'].value_counts()
common_brands = brand_counts[brand_counts > MIN_LISTINGS_PER_BRAND].index
df_no_outliers = df[df['marque'].isin(common_brands)]
print(f"Removed {len(df) - len(df_no_outliers)} outlier listings")
df = df_no_outliers

print(f"\nNumber of listings left: {len(df)}")


Removed 32 outlier listings

Number of listings left: 638


## ------------------------- ML analysis -----------------------------------

 -------------------------------------------------------- Regression (price prediction) ----------------------------------------------------------------------

In [None]:
# Prepare the inputs for price prediction: derive the car age, pick the feature
# and target columns, declare the preprocessing, and split train/test.
REFERENCE_YEAR = 2026  # year used to turn `annee` into an age in years

# `df` comes out of boolean filtering, so adding a column in place can raise
# SettingWithCopyWarning; assign() returns a new frame with the extra column.
df = df.assign(car_age=REFERENCE_YEAR - df['annee'])
features = ['marque', 'modele', 'car_age', 'carburant']
target = 'prix'

# Rows missing any feature or the target are excluded from modelling.
pr_df = df.dropna(subset=features + [target])

x = pr_df[features]
y = pr_df[target]

categorical = ['marque', 'modele', 'carburant']
numerical = ['car_age']

# One-hot encode the categorical columns (categories unseen at fit time are
# ignored instead of raising) and pass the numeric age through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', 'passthrough', numerical),
    ]
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)