In [1]:
import spacy
import json

# Load the spaCy pipeline once at module level so every later step shares it.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

# Load the data
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the file is parsed as one JSON document.
    Blank or whitespace-only lines (for example a doubled trailing
    newline) are skipped instead of aborting the whole load.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # json.loads("") raises, so blank separator lines must be
            # filtered out before parsing.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

# Load the reviews and build one text per review: its title, a space, then its body.
reviews = load_jsonl("reviews.jsonl")
documents = []
for review in reviews:
    documents.append(" ".join((review["title"], review["text"])))

# Preprocessing: turn every review into a list of lemmas, dropping stop words,
# punctuation, digit-only tokens, whitespace tokens and tokens of length 1.
# The "ner" and "parser" components are disabled for this pass because only
# lemmas and lexical flags are read from the tokens below.
processed_reviews = []
for doc in nlp.pipe(documents, disable=["ner", "parser"]):
    tokens = [
        token.lemma_
        for token in doc
        if not (
            token.is_stop
            or token.is_punct
            or token.is_digit
            # Runs of whitespace such as "\n\n" are tokens of their own and
            # are longer than one character, so the length check alone would
            # let them through as bogus "lemmas".
            or token.is_space
        )
        and len(token) > 1
    ]
    processed_reviews.append(tokens)

# Persist the cleaned token lists as pretty-printed UTF-8 JSON, then report it.
serialized_reviews = json.dumps(processed_reviews, ensure_ascii=False, indent=2)
with open("processed_reviews.json", "w", encoding="utf-8") as out_file:
    out_file.write(serialized_reviews)

print("Données préparées et sauvegardées dans 'processed_reviews.json'")


Données préparées et sauvegardées dans 'processed_reviews.json'
