### Comparaison des datasets

In [43]:
import pandas as pd

# Read the three raw scraped catalogues, one DataFrame per site, in a single pass.
df_beautystore, df_cosmetiquetn, df_cherrybeauty = map(
    pd.read_csv,
    (
        "../data/raw/beautystore_data.csv",
        "../data/raw/cosmetiquetn_data.csv",
        "../data/raw/cherrybeauty_data.csv",
    ),
)

#### Dimensions

In [44]:
# Report the (rows, columns) shape of each raw DataFrame.
print("Dimensions des datasets:")
for label, frame in (
    ("df_beautystore", df_beautystore),
    ("df_cosmetiquetn", df_cosmetiquetn),
    ("df_cherrybeauty", df_cherrybeauty),
):
    print(f"{label}: {frame.shape}")

Dimensions des datasets:
df_beautystore: (1045, 13)
df_cosmetiquetn: (2766, 15)
df_cherrybeauty: (5099, 14)


In [45]:
# Map each site name to its DataFrame so later cells can loop over all three.
datasets = dict(
    beautystore=df_beautystore,
    cosmetiquetn=df_cosmetiquetn,
    cherrybeauty=df_cherrybeauty,
)

#### Colonnes

In [46]:
# For every site: numbered list of its columns, then a two-row preview.
for site_name, frame in datasets.items():
    column_names = frame.columns
    print(f"\n* {site_name.upper()} - Colonnes ({len(column_names)}):")
    for position, column in enumerate(column_names, start=1):
        print(f"   {position:2d}. {column}")

    # Data preview
    print("\n   Aperçu des premières lignes:")
    display(frame.head(2))
    print("-" * 80)


* BEAUTYSTORE - Colonnes (13):
    1. id
    2. title
    3. price_raw
    4. price
    5. price_original_raw
    6. price_original
    7. discount_percent
    8. url
    9. image_small
   10. image_large
   11. description
   12. promo_type
   13. scrape_date

   Aperçu des premières lignes:


Unnamed: 0,id,title,price_raw,price,price_original_raw,price_original,discount_percent,url,image_small,image_large,description,promo_type,scrape_date
0,6235,AVENEHyaluron Activ B3 Serum Concentre Repulpa...,"145,000TND",145.0,"181,000TND",181.0,20.0,https://beautystore.tn/promos/6235-hyaluron-ac...,https://beautystore.tn/27600-home_default/hyal...,https://beautystore.tn/27600-large_default/hya...,Le sérum concentré en acide hyaluronique pur (...,Standard,2025-11-24
1,6234,"AVENEConcentré Anti-Imperfections Cleanance ""C...","61,900TND",61.9,"77,400TND",77.4,20.0,https://beautystore.tn/promos/6234-concentré-a...,https://beautystore.tn/27594-home_default/conc...,https://beautystore.tn/27594-large_default/con...,Le concentré anti-imperfections Cleanance Come...,Standard,2025-11-24


--------------------------------------------------------------------------------

* COSMETIQUETN - Colonnes (15):
    1. id
    2. title
    3. brand
    4. reference
    5. price_raw
    6. price
    7. price_original_raw
    8. price_original
    9. discount_percent
   10. url
   11. image_small
   12. image_large
   13. description
   14. promo_type
   15. scrape_date

   Aperçu des premières lignes:


Unnamed: 0,id,title,brand,reference,price_raw,price,price_original_raw,price_original,discount_percent,url,image_small,image_large,description,promo_type,scrape_date
0,,12.12 Rose Eau de Parfum pour Femme 50 ml - La...,Lacoste,3614228836067,"200,000 TND",200.0,"229,000 TND",229.0,29.0,https://cosmetique.tn/eaux-de-parfum-femme-pre...,"data:image/svg+xml,%3Csvg%20xmlns='http://www....",https://cosmetique.tn/16886-thickbox_default/1...,"Notes de tête : Essence de Mandarine verte, Es...",Standard,2025-11-24
1,,Amber original parfum corps 250ml -lolita bonita,Lolita Bonita,710535093773,"45,000 TND",45.0,"62,000 TND",62.0,17.0,https://cosmetique.tn/brume-de-corps/3169-loli...,"data:image/svg+xml,%3Csvg%20xmlns='http://www....",https://cosmetique.tn/7497-thickbox_default/lo...,Mystérieux\nExtravagant\nUnique\nOffrez-vous A...,Standard,2025-11-24


--------------------------------------------------------------------------------

* CHERRYBEAUTY - Colonnes (14):
    1. id
    2. title
    3. category
    4. price_raw
    5. price
    6. price_original_raw
    7. price_original
    8. discount_percent
    9. url
   10. image_small
   11. image_large
   12. description
   13. promo_type
   14. scrape_date

   Aperçu des premières lignes:


Unnamed: 0,id,title,category,price_raw,price,price_original_raw,price_original,discount_percent,url,image_small,image_large,description,promo_type,scrape_date
0,5584,Ahwak Eau de Parfum 100 ml Sahari,Eau de Parfum Femme,"58,812 TND",58.812,"67,600 TND",67.6,13.0,https://cherrybeauty.tn/5584-ahwak-eau-de-parf...,"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQA...",https://cherrybeauty.tn/9807-home_default/ahwa...,Ahwak Eau de Parfum 100 ml Sahari,Promotion,2025-11-24
1,5451,Asdaaf Amerat El Arab Privé Rose – Eau de Parf...,Eau de Parfum Femme,"60,813 TND",60.813,"69,900 TND",69.9,13.0,https://cherrybeauty.tn/5451-asdaaf-amerat-el-...,"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQA...",https://cherrybeauty.tn/9552-home_default/asda...,Asdaaf Amerat El Arab Privé Rose – Eau de Parf...,Promotion,2025-11-24


--------------------------------------------------------------------------------


##### Remarques faites:
* pour les datasets beautystore et cherrybeauty, on voit que le nom de la brand est dans le titre => Extraction du nom de la brand
* pour le dataset de cosmetique.tn, on a id = NaN et c'est reference qui nous informe sur l'ID

1. Extraction des brand names

In [47]:
def extract_brand_title(title, brands_list):
    """Split a product title into its leading brand name and the remaining title.

    The comparison is case-insensitive and purely prefix-based: no separator is
    required between the brand and the rest of the title, so a title such as
    "AVENEHyaluron ..." is matched by the brand "AVENE".

    Candidates are tried from the longest to the shortest name. This way a short
    brand that happens to be a prefix of a longer one (e.g. "SUN" and "Sunsilk")
    can never shadow the longer, more specific brand. Brands of equal length keep
    their relative order from ``brands_list``.

    Parameters
    ----------
    title : str or NaN/None
        Raw product title.
    brands_list : iterable of str
        Known brand names to look for at the start of ``title``.

    Returns
    -------
    pandas.Series
        Two positional values ``[brand, title]``:
        ``[None, None]`` when ``title`` is missing;
        ``[None, title]`` when no brand matches;
        otherwise the brand with each whitespace-separated word capitalized
        (``str.capitalize`` lowercases the rest of the word, so
        "LA ROCHE-POSAY" becomes "La Roche-posay") and the title with the
        brand prefix removed and surrounding whitespace stripped.
    """
    if pd.isna(title):
        return pd.Series([None, None])

    title_upper = title.upper()
    # Longest names first: with a plain first-match scan, "SUN" would win over
    # "Sunsilk" and leave "silk ..." as the title.
    for brand in sorted(brands_list, key=len, reverse=True):
        if title_upper.startswith(brand.upper()):
            brand_cap = " ".join(w.capitalize() for w in brand.split())
            return pd.Series([brand_cap, title[len(brand):].strip()])
    return pd.Series([None, title])

Sur les sites, on retrouve la liste des marques

In [48]:
# BeautyStore brand list.
# Passed to extract_brand_title as the candidate prefixes for BeautyStore titles;
# matching there is case-insensitive, so the mixed casing below is harmless.
beautystore_brands_list = [
    "ACM","ADERMA","ALANIA","APOTHICA","ARTDECO","AVENE","BABYLISS","BEURER",
    "BIODERMA","BIO-OIL","Biolane","CERAVE","CETAPHIL","COLAB","ENEOMEY",
    "Eye Care Cosmetics","ESSENCE","Eylure","HEI POA","ISDIN","KÉRASTASE",
    "Lana Brasiles","LA ROCHE-POSAY","LES ESSENTIELLES LAB","L'ORÉAL PROFESSIONNEL",
    "Materna","Mustela","NUXE","PHYTEAL","REAL TECHNIQUES","REVOLUTION",
    "Rose Baie Paris","Rude Cosmetics","Sensilis skin lab","SVR","TITANIA",
    "VICHY","ZYNIA"
]

# CherryBeauty brand list.
# Passed to extract_brand_title as the candidate prefixes for CherryBeauty titles.
# NOTE(review): the last two entries ("Ahwak", "Asdaaf Amerat El Arab") break the
# alphabetical order and look hand-added from titles seen in the data - confirm.
cherrybeauty_brands_list = [
    "1943 LE BON PARFUMEUR", "Adidas", "AGIVA", "Alfaparf Semi di Lino", "Aquafresh",
    "ARTDECO", "AS Kozmetik", "ASTUS", "Axe", "Azal", "Beverly Hills Polo Club",
    "BH COSMETICS", "BLUEQUE", "Brazilian Glow", "C'Cool", "C'Fresh", "Cadiveu",
    "Champs Fleuris", "CHUPA CHUPS", "COLAB Dry Shampoo", "Colour Me",
    "COSMO COSMETICS", "COTTONPLUS", "Dabur", "Dazzle", "DermoMed", "DOVE",
    "Dr Rashel", "Echosline Seliar", "Emper Perfumes", "Energie Fruit", "Essence Cosmetics",
    "ESSENTIAL Beauty", "EuroStil", "Eveline Cosmetics", "Evoluderm", "F6", "F8",
    "FAMILY", "Fanal", "FLAWLESS", "Framesi - Professional Hair Fashion",
    "Franck Provost", "Gabrini Cosmetics", "Galaxy Plus Concept", "Garnier", "Glam of Sweden",
    "Glossco", "Golden Rose", "GUANJING", "H.ZONE", "Haokali", "Head & Shoulders",
    "Herbal Essences", "Holiday Depilatori", "imPRESS", "Inebrya", "Italwax", "Johnson's",
    "K-REINE", "KAVIGEN", "KeraGold Pro", "Keratin Power", "Keune", "KG Barber by KeraGold",
    "KIEPE", "Kolsi", "L'ANGELICA", "La Rive", "LaFera Cosmetics", "Le Professionnel",
    "Lella", "Lilas", "Lollis Beauty MakeUp", "LORENTI Tokyo & Seoul", "Magic Chrome Pigment Pen",
    "MAGK", "Mahassen", "Malizia", "Manino", "Milestone Perfumes", "Milmil", "Mirada", "Mixa",
    "Nana", "Nihel", "Nivea", "Nook", "NUXE", "Otto", "Palmolive", "Pantene",
    "Parfums SAPHIR", "Parisienne Italia", "Pasta Del Capitano", "Pétrole Hahn",
    "Pierre Cardin", "Prestige Professional", "PRO WAX", "PURALIA", "QBD", "QUEEN LIFE",
    "RAKO", "REAL TECHNIQUES", "RENEE BLANCHE", "REVOLUTION", "Rexona", "RICA", "Roial",
    "Rojanet", "ROSALIND", "RoseBaie", "Ruby Face", "Schwarzkopf Gliss", "Schwarzkopf Palette",
    "Sence Beauty", "Sensea", "Septona", "Signal", "SOLYSS", "Souplesse", "SUN", "Sunsilk",
    "Syoss", "T6", "TechnoCare", "TOPFACE", "Tricol Biosky X-Perience", "URBAN CARE",
    "USHAS", "V-Benz", "VIGOS", "Zaragoza", "ZOEVA", "Ahwak", "Asdaaf Amerat El Arab"
]

In [49]:
# Split the leading brand out of each title for the two sites that embed it there.
#
# The extraction overwrites 'title' with the brand-less remainder, so it must run
# only once per frame: a second pass over already-stripped titles would find no
# brand prefix and reset every 'brand' to None. The presence of the 'brand' column
# is used as the "already done" marker; reload the raw CSVs to redo the extraction.
for _frame, _brands in (
    (df_beautystore, beautystore_brands_list),
    (df_cherrybeauty, cherrybeauty_brands_list),
):
    if 'brand' not in _frame.columns:
        _frame[['brand', 'title']] = _frame['title'].apply(
            lambda x, brands=_brands: extract_brand_title(x, brands)
        )

2. confusion id / reference

In [50]:
# The preview of cosmetique.tn rows shows 'id' as NaN, so the site's 'reference'
# value is copied into 'id' to serve as the product identifier.
df_cosmetiquetn["id"] = df_cosmetiquetn["reference"]

In [51]:
# Compare the column sets of the three sites: shared columns vs. site-specific ones.
colonnes_par_site = {site: set(frame.columns) for site, frame in datasets.items()}

ensembles = list(colonnes_par_site.values())
colonnes_communes = set.intersection(*ensembles)
colonnes_uniques = set.union(*ensembles) - colonnes_communes

print(f"\nColonnes communes ({len(colonnes_communes)}):")
for colonne in sorted(colonnes_communes):
    print(f"   * {colonne}")

print(f"\nColonnes uniques par site ({len(colonnes_uniques)}):")
for site in datasets:
    propres_au_site = colonnes_par_site[site] - colonnes_communes
    # Sites with no extra column are not listed at all.
    if not propres_au_site:
        continue
    print(f"   {site.upper()}: {list(propres_au_site)}")


Colonnes communes (14):
   * brand
   * description
   * discount_percent
   * id
   * image_large
   * image_small
   * price
   * price_original
   * price_original_raw
   * price_raw
   * promo_type
   * scrape_date
   * title
   * url

Colonnes uniques par site (2):
   COSMETIQUETN: ['reference']
   CHERRYBEAUTY: ['category']


+ On va garder la colonne "category" et essayer d'enrichir les lignes vides, car elle sera intéressante pour nous.
+ La colonne "reference" n'est pas significative en elle-même : c'est plutôt un identifiant interne au site cosmetique.tn. Comme elle a déjà été recopiée dans la colonne "id", on peut la supprimer.

In [52]:
import numpy as np

# Align the schemas before stacking: remove the cosmetiquetn-only 'reference'
# column and give every frame a 'category' column (NaN where the site has none).
cosmetiquetn_frame = datasets['cosmetiquetn']
if 'reference' in cosmetiquetn_frame.columns:
    datasets['cosmetiquetn'] = cosmetiquetn_frame.drop(columns='reference')

for frame in datasets.values():
    if 'category' not in frame.columns:
        frame['category'] = np.nan

In [53]:
# Tag every row with the site it came from, working on a copy of each frame.
for site_name in list(datasets):
    tagged = datasets[site_name].copy()
    tagged['source_site'] = site_name
    datasets[site_name] = tagged

In [54]:
# Show the final column list of each frame to check the schemas now line up.
for site_name, frame in datasets.items():
    column_list = frame.columns.tolist()
    print(f"   {site_name:15}: {column_list}")

   beautystore    : ['id', 'title', 'price_raw', 'price', 'price_original_raw', 'price_original', 'discount_percent', 'url', 'image_small', 'image_large', 'description', 'promo_type', 'scrape_date', 'brand', 'category', 'source_site']
   cosmetiquetn   : ['id', 'title', 'brand', 'price_raw', 'price', 'price_original_raw', 'price_original', 'discount_percent', 'url', 'image_small', 'image_large', 'description', 'promo_type', 'scrape_date', 'category', 'source_site']
   cherrybeauty   : ['id', 'title', 'category', 'price_raw', 'price', 'price_original_raw', 'price_original', 'discount_percent', 'url', 'image_small', 'image_large', 'description', 'promo_type', 'scrape_date', 'brand', 'source_site']


On fait le merge

In [55]:
# Stack the three frames row-wise; columns are aligned by name and the index is rebuilt.
frames_to_stack = [
    datasets[site] for site in ('beautystore', 'cosmetiquetn', 'cherrybeauty')
]
df_merged = pd.concat(frames_to_stack, ignore_index=True, sort=False)

n_rows, n_cols = df_merged.shape
print("MERGE TERMINÉ!")
print(f"Dataset fusionné: {n_rows:,} lignes × {n_cols} colonnes")

MERGE TERMINÉ!
Dataset fusionné: 8,910 lignes × 16 colonnes


In [56]:
# Row count and percentage share of each source site in the merged dataset.
print("\nRÉPARTITION PAR SITE:")
repartition = df_merged['source_site'].value_counts()
total_rows = len(df_merged)
for site, count in repartition.items():
    share = count / total_rows * 100
    print(f"   {site:15}: {count:6} lignes ({share:5.1f}%)")


RÉPARTITION PAR SITE:
   cherrybeauty   :   5099 lignes ( 57.2%)
   cosmetiquetn   :   2766 lignes ( 31.0%)
   beautystore    :   1045 lignes ( 11.7%)


In [58]:
# Persist the merged dataset. to_csv does not create missing folders, so make sure
# the target directory exists first (e.g. on a fresh checkout).
import os

os.makedirs("../data/clean", exist_ok=True)
df_merged.to_csv("../data/clean/merged_data.csv", index=False, encoding='utf-8')