In [None]:
import json
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import re
import unicodedata

In [None]:
data = []

with open("vinted_data.jsonl", "r", encoding="utf-8") as f: #indicate the right path to the raw data file 
    for line in f:
        line = line.strip()
        if line:  # skip empty lines
            data.append(json.loads(line))

df = pd.DataFrame(data)

In [4]:
def normalize_text(s):
    return unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")

In [None]:
#this function classifies brands into categories : luxury, premium, sport, mass market, discount, unknown 
#the dictionnaries have been obtained from gemini 

def classify_brand(brand_raw):
    if not isinstance(brand_raw, str):
        return "Unknown"
    
    brand = normalize_text(brand_raw.lower().strip())
    
    brand_categories = {
        # --- LUXE ---
        "chanel": "Luxe", "hermes": "Luxe", "louis vuitton": "Luxe", "vuitton": "Luxe",
        "dior": "Luxe", "christian dior": "Luxe", "gucci": "Luxe", "prada": "Luxe",
        "yves saint laurent": "Luxe", "ysl": "Luxe", "saint laurent": "Luxe",
        "balenciaga": "Luxe", "fendi": "Luxe", "versace": "Luxe", "burberry": "Luxe",
        "cartier": "Luxe", "rolex": "Luxe", "omega": "Luxe", "moncler": "Luxe",
        "kenzo": "Luxe", "givenchy": "Luxe", "valentino": "Luxe",
        "loewe": "Luxe", "celine": "Luxe", "jacquemus": "Luxe",
        "chloe": "Luxe", "dolce and gabbana": "Luxe", "jean paul gaultier": "Luxe",
        "off-white": "Luxe", "balmain": "Luxe",

        # --- PREMIUM ---
        "maje": "Premium", "sandro": "Premium", "the kooples": "Premium",
        "ralph lauren": "Premium", "tommy hilfiger": "Premium",
        "lacoste": "Premium", "calvin klein": "Premium",
        "hugo boss": "Premium", "boss": "Premium",
        "sezane": "Premium", "patagonia": "Premium",
        "the north face": "Premium", "carhartt": "Premium",
        "doc martens": "Premium", "ugg": "Premium",
        "diesel": "Premium", "guess": "Premium",
        "supreme": "Premium",

        # --- SPORT ---
        "nike": "Sport", "adidas": "Sport", "puma": "Sport",
        "new balance": "Sport", "asics": "Sport",
        "under armour": "Sport", "converse": "Sport",
        "vans": "Sport", "salomon": "Sport",
        "decathlon": "Sport", "quechua": "Sport",

        # --- MASS MARKET ---
        "zara": "Mass Market", "hm": "Mass Market", "mango": "Mass Market",
        "uniqlo": "Mass Market", "levis": "Mass Market",
        "bershka": "Mass Market", "pull and bear": "Mass Market",
        "stradivarius": "Mass Market", "kiabi": "Mass Market",
        "celio": "Mass Market", "brandy melville": "Mass Market",

        # --- DISCOUNT ---
        "shein": "Discount", "primark": "Discount",
        "boohoo": "Discount", "action": "Discount",
        "lidl": "Discount", "aldi": "Discount"
    }
    
    # Exact match
    if brand in brand_categories:
        return brand_categories[brand]
    
    # Substring / regex match
    for key, category in brand_categories.items():
        if re.search(r"\b" + re.escape(key) + r"\b", brand):
            return category
            
    return "Unknown"


In [6]:
df["brand_segment"] = df["marque"].apply(classify_brand)

In [None]:
df["price"] = df["prix"].apply(
    lambda x: float(x["amount"]) if isinstance(x, dict) else np.nan
)

df["total_price"] = df["prix_total"].apply(
    lambda x: float(x["amount"]) if isinstance(x, dict) else np.nan
)

# we keep France only (simplier for currency and language)
df = df[df["country"] == "fr"]

# drop missing prices 
df = df.dropna(subset=["price"])

In [8]:
CATEGORY_GENDER = {
    # HOMME — vêtements
    "32": "homme",
    "1206": "homme",
    "34": "homme",
    "85": "homme",
    "84": "homme",
    "92": "homme",
    "257": "homme",
    "76": "homme",
    "79": "homme",
    "80": "homme",
    "2910": "homme",
    "30": "homme",

    # HOMME — chaussures
    "1233": "homme",
    "2657": "homme",
    "1238": "homme",
    "2659": "homme",
    "1242": "homme",
    "2656": "homme",
    "2970": "homme",
    "2969": "homme",
    "2968": "homme",
    "1452": "homme",

    # FEMME — vêtements
    "13": "femme",
    "10": "femme",
    "12": "femme",
    "9": "femme",
    "1035": "femme",
    "29": "femme",
    "73": "femme",
    "1037": "femme",
    "8": "femme",
    "11": "femme",
    "183": "femme",
    "15": "femme",
    "28": "femme",
    "1176": "femme",
    "1782": "femme",

    # FEMME — chaussures
    "2954": "femme",
    "2623": "femme",
    "2955": "femme",
    "1049": "femme",
    "2953": "femme",
    "543": "femme",
    "2950": "femme",
    "215": "femme",
    "2632": "femme",
    "2952": "femme",
    "2951": "femme",
    "2949": "femme",
    "2630": "femme",
}


In [41]:
df["gender"] = df["query"].astype(str).map(CATEGORY_GENDER)

In [42]:
df["female"] = (df["gender"] == "femme").astype(int)

In [None]:
df["condition"] = df["status"]

In [44]:
df["condition"] = df["condition"].astype("category")

In [None]:
df["ln_price"] = np.log(df["price"]) #we take the log of the price

In [51]:
categorical_vars = ["brand_segment", "condition", "query"]

for col in categorical_vars:
    df[col] = df[col].astype("category")

In [None]:
#formula to compare the influence of female compared to other variables in the model
formula_all = """
ln_price ~ female
        + C(brand_segment)
        + C(condition)
"""

In [None]:
model = smf.ols(
    formula=formula_all, 
    data=df
).fit(cov_type="HC3")  


In [64]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               ln_price   R-squared:                       0.212
Model:                            OLS   Adj. R-squared:                  0.211
Method:                 Least Squares   F-statistic:                     317.8
Date:                Wed, 28 Jan 2026   Prob (F-statistic):               0.00
Time:                        00:51:39   Log-Likelihood:                -13448.
No. Observations:                9345   AIC:                         2.692e+04
Df Residuals:                    9334   BIC:                         2.700e+04
Df Model:                          10                                         
Covariance Type:                  HC3                                         
                                          coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
In

In [None]:
beta = model.params["female"]
se = model.bse["female"]
tval = model.tvalues["female"]
pval = model.pvalues["female"]
conf_int = model.conf_int().loc["female"]

print("Coefficient for female:")
print(f"Estimate: {beta:.4f}")
print(f"Std. Error: {se:.4f}")
print(f"t-value: {tval:.2f}")
print(f"p-value: {pval:.4f}")
print(f"95% CI: [{conf_int[0]:.4f}, {conf_int[1]:.4f}]")
print(f"Interpretation: Female products are {(np.exp(beta)-1)*100:.2f}% higher on average, controlling for brand, category, and condition.")

Coefficient for female:
Estimate: -0.2223
Std. Error: 0.0223
t-value: -9.97
p-value: 0.0000
95% CI: [-0.2659, -0.1786]
Interpretation: Female products are -19.93% higher on average, controlling for brand, category, and condition.
