### Vérification connexion système

In [1]:
import sys
print(sys.executable)


c:\Users\moris\Desktop\large project\venv\Scripts\python.exe


In [2]:
from huggingface_hub import whoami
print(whoami())


{'type': 'user', 'id': '690527e96c3f98b54c3cd69a', 'name': 'AlbinMorisseau', 'fullname': 'Albin Morisseau', 'email': 'morisseau.albin@gmail.com', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '/avatars/713fac6cdb1ae0c6ffcaffc4994e6536.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'test', 'role': 'read', 'createdAt': '2025-10-31T21:24:24.229Z'}}}


In [3]:
import requests
r = requests.get("https://huggingface.co/facebook/m2m100_418M/resolve/main/config.json")
print(r.status_code)


200


### Test modèle d'id de langue

In [4]:
from langid.langid import LanguageIdentifier, model
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
lang,prob = identifier.classify("Location, cleanliness, breakfast")
print(lang)
print(prob)

de
0.16767080679447224


In [5]:
import fasttext
import numpy as np

# Charger le modèle local
model = fasttext.load_model("../models/lid.176.bin")

# Texte à détecter
text = "Location, cleanliness, breakfast"

# Détecter la langue
labels, probs = model.predict(text)

# Convertir correctement les probabilités en array
probs = np.asarray(probs)

# Récupérer la langue et la probabilité
lang = labels[0].replace("__label__", "")
prob = probs[0]

print(lang, prob)


ValueError: ../models/lid.176.bin cannot be opened for loading!

### Test traduction

In [11]:
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# -------- CONFIGURATION --------
model_dir = "../m2m100_418M"
device = "cuda" if torch.cuda.is_available() else "cpu"

# -------- CHARGEMENT DU MODELE --------
tokenizer = M2M100Tokenizer.from_pretrained(model_dir)
model = M2M100ForConditionalGeneration.from_pretrained(model_dir).to(device)
model.eval()


M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
       

In [13]:
# -------- TEXTE À TRADUIRE --------
text_to_translate = "Location, cleanliness, breakfast"
src_lang = "de"  # langue source
tgt_lang = "en"  # langue cible

# -------- TRADUCTION --------
tokenizer.src_lang = src_lang
encoded = tokenizer(text_to_translate, return_tensors="pt").to(device)

# M2M100 a besoin de forced_bos_token_id pour la langue cible
forced_bos_token_id = tokenizer.get_lang_id(tgt_lang)

with torch.no_grad():
    generated_tokens = model.generate(
        **encoded,
        max_length=128,
        forced_bos_token_id=forced_bos_token_id
    )

translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

print("Original:", text_to_translate)
print("Traduit:", translated_text)

Original: Location, cleanliness, breakfast
Traduit: Location, cleanliness, breakfast


In [7]:
import pandas as pd
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from langid.langid import LanguageIdentifier, model
from tqdm.notebook import tqdm
import gc

# -------- CONFIGURATION --------
INPUT_FILE = "../data/original/Booking/val.csv"  # ou .parquet
OUTPUT_FILE = "test_processed_reviews.csv"
SRC_COL = "review_positive"
BATCH_SIZE = 64          # Ajustable selon la VRAM GPU
CHUNK_SIZE = 50000       # Lecture par chunks
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
THRESHOLD_VALUE =0.75

# -------- INITIALISATION DU MODELE M2M100 --------
MODEL_NAME = "facebook/m2m100_418M"
model_dir = "../m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_dir)
model_trans = M2M100ForConditionalGeneration.from_pretrained(model_dir).to(DEVICE)
model_trans.eval()

# -------- FONCTIONS --------
def detect_lang(text,threshold):
    """Détecte la langue avec langid"""
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    lang,prob = identifier.classify("Location, cleanliness, breakfast")
    if lang != "en" and prob > threshold:
        return lang
    else:
        return "en"

def translate_batch(texts, src_lang):
    """Traduit un batch de textes depuis src_lang vers l'anglais"""
    tgt_lang = "en"  # langue cible fixée à l'anglais
    tokenizer.src_lang = src_lang
    
    # Encodage du batch
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(DEVICE)
    
    # On récupère l'ID du token de début pour la langue cible
    forced_bos_token_id = tokenizer.get_lang_id(tgt_lang)
    
    # Génération des traductions
    with torch.no_grad():
        translated_tokens = model_trans.generate(
            **encoded,
            max_length=512,
            forced_bos_token_id=forced_bos_token_id
        )
    
    # Décodage
    translated_texts = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    
    # Libération mémoire GPU et RAM
    del encoded, translated_tokens
    torch.cuda.empty_cache()
    gc.collect()
    
    return translated_texts

def process_chunk(df_chunk):
    # Détection de langue
    df_chunk['lang'] = df_chunk[SRC_COL].apply(lambda x: detect_lang(x, threshold=THRESHOLD_VALUE))
    df_chunk['review_en'] = None

    # Traduction par langue
    non_english = df_chunk[df_chunk['lang'] != "en"]
    for lang in non_english['lang'].unique():
        idxs = non_english[non_english['lang'] == lang].index
        texts = df_chunk.loc[idxs, SRC_COL].tolist()

        for i in tqdm(range(0, len(texts), BATCH_SIZE), desc=f"Translating {lang}->en"):
            batch_texts = texts[i:i+BATCH_SIZE]
            try:
                translated_texts = translate_batch(batch_texts, lang)
            except Exception as e:
                translated_texts = batch_texts  # fallback
            df_chunk.loc[idxs[i:i+BATCH_SIZE], 'review_en'] = translated_texts

    # Texte déjà en anglais
    df_chunk.loc[df_chunk['lang'] == "en", 'review_en'] = df_chunk.loc[df_chunk['lang'] == "en", SRC_COL]

    return df_chunk

# -------- LECTURE PAR CHUNKS ET TRAITEMENT --------
first_chunk = True
if INPUT_FILE.endswith(".csv"):
    reader = pd.read_csv(INPUT_FILE, chunksize=CHUNK_SIZE)
elif INPUT_FILE.endswith(".parquet"):
    reader = pd.read_parquet(INPUT_FILE, chunksize=CHUNK_SIZE)
else:
    raise ValueError("Format de fichier non supporté")

for chunk_idx, df_chunk in enumerate(reader, start=1):
    print(f"\n--- Processing chunk {chunk_idx} ---")
    processed_chunk = process_chunk(df_chunk)
    processed_chunk.to_csv(OUTPUT_FILE, mode='w' if first_chunk else 'a', index=False, header=first_chunk)
    first_chunk = False
    del processed_chunk
    gc.collect()
    torch.cuda.empty_cache()

print("\n Traitement terminé. Résultat sauvegardé dans", OUTPUT_FILE)


KeyboardInterrupt: 