In [7]:
import ast
import csv
import json
import re
import sys
from dataclasses import dataclass, field, asdict
from typing import List

In [8]:
@dataclass
class Review:
    """One perfume review record.

    Instances are created by ``process_csv`` and serialized to JSON through
    ``dataclasses.asdict``, so the field names below become the JSON keys.
    """
    # Sequential identifier; process_csv assigns it from a counter starting at 1.
    id: int
    # Review text as returned by clean_review (lowercased, gender keywords masked).
    content: str
    # Label returned by extract_gender for the title of the reviewed perfume.
    gender: str

@dataclass
class FragranceData:
    """Container that accumulates ``Review`` objects."""
    # Reviews in insertion order; default_factory gives every instance its own list.
    reviews: List[Review] = field(default_factory=list)

    def add_review(self, review: Review) -> None:
        """Append ``review`` to the end of ``self.reviews``."""
        self.reviews.append(review)

In [9]:
def extract_gender(title: str) -> str:
    """Infer the gender label of a perfume from the suffix of its title.

    The title is lowercased and stripped of surrounding whitespace before the
    suffix is inspected.

    Args:
        title: Perfume title, e.g. ``"Sauvage for men"``.

    Returns:
        ``"unisex"`` if the title ends with "for women and men",
        ``"female"`` if it ends with "for women",
        ``"male"`` if it ends with "for men",
        ``"unknown"`` otherwise.
    """
    normalized = title.lower().strip()
    # The unisex suffix must be tested first: "for women and men" also ends
    # with "for men", so checking "for men" first would label it "male".
    if normalized.endswith("for women and men"):
        return "unisex"
    if normalized.endswith("for women"):
        return "female"
    if normalized.endswith("for men"):
        return "male"
    return "unknown"

def clean_review(text: str) -> str:
    """Lowercase a review and hide its gender keywords behind ``[MASK]``.

    Each keyword is matched as a whole word (``\\b`` on both sides), so words
    that merely contain a keyword (e.g. "manly") are left untouched.

    Args:
        text: Raw review text.

    Returns:
        The lowercased text with every keyword occurrence replaced by
        ``[MASK]``.
    """
    keywords = (
        "for men", "for women", "unisex",
        "pour homme", "pour femme",
        "male", "female",
        "man", "woman",
        "boy", "girl",
    )
    masked = text.lower()
    # Keywords are substituted one after another, in the order listed above.
    for keyword in keywords:
        masked = re.compile(rf"\b{keyword}\b").sub("[MASK]", masked)
    return masked

def process_csv(file_path: str, output_json: str) -> None:
    """Convert the perfume CSV into a JSON list of masked, labelled reviews.

    For every CSV row the ``title`` column is turned into a gender label with
    ``extract_gender`` and the ``reviews`` column is parsed as a Python
    literal (a list of review strings is expected). Each non-empty review is
    cleaned with ``clean_review`` and stored with a sequential id starting
    at 1. The result is written to ``output_json`` as a list of
    ``{"id", "content", "gender"}`` objects.

    Args:
        file_path: Path of the UTF-8 encoded input CSV.
        output_json: Path of the JSON file to write (overwritten if present).

    Raises:
        OSError: If either file cannot be opened.
        ValueError, SyntaxError: If a ``reviews`` cell is not a valid Python
            literal.
    """
    fragrance_data = FragranceData()

    # newline="" is required by the csv module so that line breaks embedded in
    # quoted fields are interpreted correctly.
    with open(file_path, mode="r", encoding="utf-8", newline="") as csv_file:
        reader = csv.DictReader(csv_file)
        review_id = 1

        for row in reader:
            # DictReader fills missing trailing columns with None, so fall
            # back to an empty title instead of passing None downstream.
            title = row.get("title") or ""
            raw_reviews = row.get("reviews")
            # SECURITY: the cell content comes from an external file. It used
            # to be passed to eval(), which executes arbitrary code;
            # literal_eval only accepts Python literals.
            reviews = ast.literal_eval(raw_reviews) if raw_reviews else []

            # Derive the label from the perfume title.
            gender = extract_gender(title)

            for review_text in reviews:
                if review_text:
                    # Lowercase the review and mask its gender keywords.
                    cleaned_text = clean_review(review_text)
                    review = Review(id=review_id, content=cleaned_text, gender=gender)
                    fragrance_data.add_review(review)
                    review_id += 1  # ids are only consumed by non-empty reviews

    # Write the collected reviews as JSON.
    with open(output_json, mode="w", encoding="utf-8") as json_file:
        json.dump([asdict(review) for review in fragrance_data.reviews], json_file, indent=2, ensure_ascii=False)


In [11]:
# Raise the csv module's per-field size limit to the largest value the
# platform accepts. sys.maxsize does not fit in a C long on some platforms
# (e.g. 64-bit Windows) and raises OverflowError there, so halve the value
# until csv.field_size_limit accepts it.
field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(field_size_limit)
        break
    except OverflowError:
        field_size_limit //= 2

# Process the raw CSV to produce the cleaned JSON file.
csv_file_path = "../data/raw/perfumes_table.csv"
output_json_path = "../data/clean/cleaned_data.json"
process_csv(csv_file_path, output_json_path)

In [None]:
# plot the distribution of the reviews