# Nuage de mots

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
import nltk
import polars as pl

# Fetch the NLTK stop-word lists used below.
nltk.download('stopwords')

#df = pl.read_csv("../data/processed/data_pet_booking.csv")
df = pl.read_csv("../data/processed/data_pet_yelp.csv")

# --- 1. Review corpus ---
# Null reviews are dropped first: " ".join() raises TypeError on None entries.
reviews = df.select("text").to_series().drop_nulls().to_list()

# --- 2. Merge all reviews into a single string ---
text = " ".join(reviews)

# --- 3. Words to ignore (stop words) ---
stop_words = set(stopwords.words('english'))

# Extra filler verbs/adverbs that are excluded on top of the NLTK list
custom_stopwords = {"get", "could", "take", "know", "make", "go", "give", "would", "also", "even", "say", "try"}
stop_words.update(custom_stopwords)

# --- 4. Build the word cloud (30 most frequent remaining words) ---
wordcloud = WordCloud(width=800, height=400,
                      background_color='black',
                      stopwords=stop_words,
                      max_words=30).generate(text)

# --- 5. Display the word cloud ---
plt.figure(figsize=(10,5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Reviews about pets")
plt.show()


# Modèle BART (classification zero-shot)

In [None]:
import torch
# Print the installed PyTorch version as a quick environment sanity check.
print(torch.__version__)

In [None]:
from transformers import pipeline

# Zero-shot classification pipeline backed by the "facebook/bart-large-mnli" checkpoint.
pipe = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

## Classification zero-shot avec BART (facebook/bart-large-mnli)

In [None]:
import torch
from transformers import pipeline
import polars as pl

# Load the zero-shot classifier. The checkpoint loaded here is BART-large-MNLI.
pipe = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli"
)

# Candidate labels the classifier chooses from
labels = ["pet", "child", "handicap", "other"]

# Sample reviews: first five negative reviews of the file
df = pl.read_csv("../data/processed/data_handicap_booking.csv")
sample_reviews = df.select("review_negative").head(5).to_series().to_list()

# Null or blank cells are skipped: the pipeline cannot classify them and
# would raise instead of returning a prediction.
reviews = [r for r in sample_reviews if isinstance(r, str) and r.strip()]

# Classification: print the best label and its score for every review
for r in reviews:
    result = pipe(r, candidate_labels=labels, multi_label=False)
    print(f"\nTexte : {r}")
    print(f"→ Catégorie prédite : {result['labels'][0]} (score={result['scores'][0]:.3f})")


## Translation

In [None]:
from deep_translator import GoogleTranslator

# One-off check of the translator: auto-detect the source language, translate to English.
translator_en = GoogleTranslator(source='auto', target='en')
translated = translator_en.translate("albin cat faire √† manger et il sera avec son chien")
print(translated)

In [None]:
# Parallel translation

import polars as pl
from deep_translator import GoogleTranslator
from concurrent.futures import ThreadPoolExecutor, as_completed

df = pl.read_csv("../data/original/Booking/val.csv")

# First 50 positive reviews
texts = df["review_positive"].head(50).to_list()


def translate_one(t):
    """Translate one review to English (best effort).

    Returns None for a null cell, the translated text on success, and a
    "[ERROR: ...]" marker string when the translation call fails, so that one
    failing row does not abort the whole batch.
    """
    if not isinstance(t, str):
        # Null cells stay null instead of becoming "[ERROR: ...]" rows.
        return None
    try:
        # A fresh translator per call instead of one instance shared by all threads.
        # NOTE(review): GoogleTranslator.translate() appears to store the text on the
        # instance before issuing the request, so a shared instance could send another
        # thread's text — confirm against the deep_translator source.
        return GoogleTranslator(source='auto', target='en').translate(t)
    except Exception as e:
        return f"[ERROR: {e}]"


# Parallel run (8 threads); executor.map keeps the results in input order
with ThreadPoolExecutor(max_workers=8) as executor:
    results = list(executor.map(translate_one, texts))

df_result = pl.DataFrame({
    "translated_text": results
})

print(df_result.head())


In [None]:
# 1. Number of occurrences of each review text within each source dataset
df_counts = df.group_by(["original_dataset", "review"]).agg(
    pl.len().alias("count")
)

# 2. Keep only the (dataset, review) pairs that occur more than once
df_duplicates = df_counts.filter(pl.col("count").gt(1))

# 3. Per dataset: sum of the occurrence counts of its duplicated reviews
per_dataset_totals = df_duplicates.group_by("original_dataset").agg(
    pl.col("count").sum().alias("total_duplicates")
)
df_summary = per_dataset_totals.sort("original_dataset")

print(df_summary)


In [None]:
import polars as pl

# 1. Drop empty reviews (rows whose stripped text equals "" are filtered out)
df_non_empty = df.filter(pl.col("review").str.strip_chars() != "")

# 2. Count the occurrences of each review per dataset
df_counts = (
    df_non_empty.group_by(["review", "original_dataset"])
                .agg(pl.len().alias("count"))
)

# 3. For each review, find the dataset in which it appears most often.
#    The ordering is done inside the aggregation rather than by pre-sorting the
#    frame, so the result does not depend on row order surviving group_by.
#    Ties on the count are broken by dataset name, which makes reruns deterministic.
df_max_dataset = (
    df_counts.group_by("review")
             .agg([
                 pl.col("original_dataset")
                   .sort_by(["count", "original_dataset"], descending=[True, False])
                   .first()
                   .alias("most_common_dataset"),
                 pl.max("count").alias("max_count"),
             ])
)

print(df_max_dataset)


In [None]:
# Highest per-dataset occurrence count reached by any review
max_value = df_max_dataset["max_count"].max()
# Rows that reach this maximum; shown as the cell output
df_max_dataset.filter(pl.col("max_count").eq(max_value))


In [None]:
# Most repeated reviews first.
# NOTE(review): the variable is named top5 but 15 rows are kept — confirm which is intended.
ranked_by_count = df_max_dataset.sort("max_count", descending=True)
top5 = ranked_by_count.head(15)

print(top5)

## Language identification

In [None]:
from transformers import pipeline

# Two sample sentences (English and Italian) to try the detector on
text = [
    "Brevity is the soul of wit.",
    "Amor, ch'a nullo amato amar perdona."
]

model_ckpt = "papluca/xlm-roberta-base-language-detection"
pipe = pipeline("text-classification", model=model_ckpt)
# Classify the sample sentences defined above. The pipeline takes a string or
# a list of strings, so the list is passed here rather than the DataFrame `df`.
pipe(text, top_k=1, truncation=True)

In [None]:
import polars as pl
# Load all_reviews.csv; the following cells read its "review" column.
df = pl.read_csv("../data/processed/all_reviews.csv")

In [None]:
# First ten reviews as a plain Python list
texts = df["review"].head(10).to_list()
# Best language guess per review; inputs are truncated (truncation=True)
pipe(texts, top_k=1, truncation=True)

In [None]:
from transformers import pipeline
import polars as pl

# Load the language-detection model
model_ckpt = "papluca/xlm-roberta-base-language-detection"
pipe = pipeline("text-classification", model=model_ckpt)

# Work on the first 1000 rows only
df = df.head(1000)

# 1. Review column as a Python list
texts = df["review"].to_list()

# 2. Run the pipeline, keeping only the best prediction per text
results = pipe(texts, top_k=1, truncation=True)

# 3. Collect the label of the best prediction for every text
langs = []
for prediction in results:
    langs.append(prediction[0]["label"])

# 4. Attach the detected language to the DataFrame
df = df.with_columns(pl.Series("detected_lang", langs))

# 5. Number of texts per detected language
counts = df.group_by("detected_lang").agg(pl.len().alias("nb_texts"))

print(counts)


In [None]:
from transformers import pipeline
import polars as pl
from tqdm import tqdm  # barre de progression

# Load the model
model_ckpt = "papluca/xlm-roberta-base-language-detection"
pipe = pipeline("text-classification", model=model_ckpt)

df = df.head(1000)

# Number of reviews sent to the pipeline per call
batch_size = 100

all_langs = []
num_rows = df.height

# 1. Process the reviews batch by batch, with a progress bar
for start in tqdm(range(0, num_rows, batch_size), desc="D√©tection de langue"):
    stop = start + batch_size
    batch_texts = df[start:stop, "review"].to_list()
    batch_results = pipe(batch_texts, top_k=1, truncation=True)
    # Keep only the label of the best prediction for each text
    all_langs += [best[0]["label"] for best in batch_results]

# 2. Attach the detected language to the DataFrame
df = df.with_columns(pl.Series("detected_lang", all_langs))

# 3. Number of texts per language, most frequent first
counts = (
    df.group_by("detected_lang")
      .agg(pl.len().alias("nb_texts"))
      .sort("nb_texts", descending=True)
)

print(counts)


In [None]:
# Multi threading test
from transformers import pipeline
import polars as pl
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load the model
model_ckpt = "papluca/xlm-roberta-base-language-detection"
pipe = pipeline("text-classification", model=model_ckpt)

# Limit the test to 1000 rows
df = df.head(1000)

# Parameters
batch_size = 100
num_threads = 4  # number of worker threads


def process_batch(batch_texts):
    """Run the language-detection pipeline on one batch and return the top label of each text."""
    results = pipe(batch_texts, top_k=1, truncation=True)
    return [r[0]["label"] for r in results]


# 1. Split the review column into consecutive batches
batches = [df[i:i+batch_size, "review"].to_list() for i in range(0, df.height, batch_size)]

all_langs = []

# 2. Run the batches on a thread pool.
#    executor.map yields results in submission order. as_completed() yields them
#    in completion order, which would attach labels to the wrong rows as soon
#    as one batch finishes before an earlier one.
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    ordered_results = executor.map(process_batch, batches)
    for batch_langs in tqdm(ordered_results, total=len(batches), desc="Détection de langue"):
        all_langs.extend(batch_langs)

# 3. Attach the detected language to the DataFrame (row order matches `df`)
df = df.with_columns(pl.Series("detected_lang", all_langs))

# 4. Number of texts per language, most frequent first
counts = df.group_by("detected_lang").agg(
    pl.len().alias("nb_texts")
).sort("nb_texts", descending=True)

print(counts)


In [None]:
# test avec langid pour accelerer le processus
import polars as pl
import langid
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Identify the language of a single review
def detect_lang(text):
    """Return the langid language code for `text`, or None when it is not a non-blank string."""
    is_usable = isinstance(text, str) and text.strip() != ""
    if is_usable:
        # langid.classify returns a (language, score) pair; only the language is kept
        return langid.classify(text)[0]
    return None

# Parameters
num_threads = 4  # number of worker threads
texts = df["review"].to_list()

# 1. Detect the language of every review on a thread pool.
#    executor.map keeps input order; tqdm wraps it to show progress.
all_langs = []
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    lang_stream = executor.map(detect_lang, texts)
    all_langs.extend(tqdm(lang_stream, total=len(texts), desc="D√©tection de langue"))

# 2. Attach the detected language to the Polars DataFrame
df = df.with_columns(pl.Series("detected_lang", all_langs))

# 3. Number of texts per language, most frequent first
counts = (
    df.group_by("detected_lang")
      .agg(pl.len().alias("nb_texts"))
      .sort("nb_texts", descending=True)
)

print(counts)


## Méthode pour accélérer encore plus les processus (à regarder)

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import polars as pl
import langid
from tqdm import tqdm

def _detect_lang_batch(texts):
    """Return the langid language code of each text (None for non-string or blank entries).

    Defined at module level because ProcessPoolExecutor pickles the callable it
    sends to worker processes, and a function nested in another function cannot be pickled.
    """
    return [langid.classify(t)[0] if isinstance(t, str) and t.strip() else None for t in texts]


def detect_language_parallel_optimized(df: pl.DataFrame, column_name: str, num_processes: int = None, batch_size: int = 50000) -> pl.DataFrame:
    """
    Detect the language of a text column in a Polars DataFrame using langid, in parallel batches.

    The column is split into batches of `batch_size` rows; all batches are handed
    to a single ProcessPoolExecutor so they run concurrently, and the results
    are collected in input order.

    Args:
        df (pl.DataFrame): Input DataFrame.
        column_name (str): Name of the text column to process.
        num_processes (int): Number of worker processes (default: all available cores).
        batch_size (int): Number of rows per batch (default=50_000).

    Returns:
        pl.DataFrame: New DataFrame with an added column 'detected_lang' containing
        language codes (None for non-string or blank texts).

    Raises:
        ValueError: If `batch_size` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    if num_processes is None:
        import multiprocessing
        num_processes = multiprocessing.cpu_count()

    batches = [
        df[start:start + batch_size, column_name].to_list()
        for start in range(0, df.height, batch_size)
    ]

    all_langs = []
    if batches:
        # One pool for the whole run. A pool per batch that waits on a single
        # task would process the batches one after another.
        # NOTE(review): with the "spawn" start method, workers may be unable to import
        # a function defined in a notebook cell — confirm on the target platform or
        # move the worker into an importable module.
        with ProcessPoolExecutor(max_workers=num_processes) as executor:
            # executor.map preserves input order, so labels stay aligned with rows.
            ordered_results = executor.map(_detect_lang_batch, batches)
            for batch_langs in tqdm(ordered_results, total=len(batches), desc="Language detection (batched)"):
                all_langs.extend(batch_langs)

    # Return new DataFrame with added column
    return df.with_columns(pl.Series("detected_lang", all_langs))


## LDA

LDA est un modèle bayésien qui décompose un corpus en topics latents, chacun représenté par un ensemble de mots, et attribue à chaque document une proportion de ces topics. C’est l’outil classique pour le topic modeling non supervisé.


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis.gensim_models
import pyLDAvis
import polars as pl

# --- 0. NLTK resources ---
nltk.download('punkt')
# Recent NLTK releases load the tokenizer data from 'punkt_tab' rather than 'punkt';
# requesting both keeps word_tokenize usable on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# --- 1. Sample corpus ---
df = pl.read_csv("../data/processed/key_word_test_a_supprimer.csv")
df = df.head(50)
# Null reviews are dropped: preprocess() calls .lower() on every document.
documents = df["review"].drop_nulls().to_list()


# --- 2. Cleaning and tokenisation ---
def preprocess(text):
    """Lower-case and tokenise `text`, keeping alphabetic tokens that are not stop words."""
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    return tokens

texts = [preprocess(doc) for doc in documents]

# --- 3. Build the dictionary and the bag-of-words corpus for LDA ---
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# --- 4. Train the LDA model ---
num_topics = 3  # number of latent topics assumed in this corpus
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42, passes=15)

# --- 5. Print the topics ---
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

# --- 6. Interactive visualisation ---
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
# Opens an interactive web page.
# NOTE(review): show() presumably serves the page from a local web server and
# blocks this cell until interrupted — confirm before using "Run All".
pyLDAvis.show(vis)


In [None]:
import pyLDAvis
# Render the prepared LDA visualisation (`vis`) as this cell's output.
pyLDAvis.display(vis)

Résultat stylé, mais inutile.

# Synonym search

Il est possible de trouver des synonymes à l'aide de la bibliothèque nltk. Cela fonctionne généralement sur des mots simples, mais cela reste une bonne première étape.

In [None]:
import nltk
from nltk.corpus import wordnet

# WordNet corpus queried through nltk.corpus.wordnet
nltk.download("wordnet")
nltk.download("omw-1.4")  # Open Multilingual WordNet data. NOTE(review): English synonyms presumably come from "wordnet" itself — confirm this download is needed

# Seed keywords for each traveller category; they are expanded with WordNet synonyms below.
categories = {
    "handicap": [
        "handicap", "wheelchair", "accessible", "braille", "ramp", "lift", "elevator",
        "disabled", "barrier-free", "accessible toilet", "toilet accessible",
        "mobility aid", "adapted", "hearing aid", "visual impairment", "accessible entrance"
    ],
    "pet": [
        "dog", "cat", "pet", "animal", "rabbit", "hamster", "ferret", "bird",
        "pet-friendly", "animals allowed", "dog-friendly", "cat-friendly",
        "pet welcome", "pup", "dog bowl"
    ],
    "child": [
        "child", "baby", "kid", "stroller", "son", "daughter", "toddler",
        "infant", "playground", "high chair", "changing table", "family-friendly",
        "childcare", "kids menu", "baby seat", "family","baby bed", "cot", "crib"
    ]
}

def get_synonyms(word):
    """Return the WordNet synonyms of `word`, lower-cased, with spaces instead of underscores.

    Args:
        word (str): A single word or a multi-word phrase (e.g. "high chair").

    Returns:
        list[str]: Sorted, de-duplicated lemma names from every synset of `word`,
        excluding `word` itself. Empty when WordNet has no entry for it.
    """
    target = word.lower()
    # WordNet stores multi-word lemmas with underscores ("high_chair"), so a
    # phrase written with spaces is converted before the lookup. Otherwise
    # multi-word entries never match.
    lookup_key = target.replace(" ", "_")

    synonyms = set()
    for syn in wordnet.synsets(lookup_key):
        for lemma in syn.lemmas():
            # Normalise lemma names back to plain lower-case text
            clean_word = lemma.name().replace("_", " ").lower()
            if clean_word != target:
                synonyms.add(clean_word)
    # Sorted so the result is stable from run to run
    return sorted(synonyms)

# Expand every category with the WordNet synonyms of each of its seed words
categories_with_synonyms = {}
for cat, seed_words in categories.items():
    pool = set(seed_words)  # the seed words themselves are kept
    for seed in seed_words:
        pool |= set(get_synonyms(seed))
    categories_with_synonyms[cat] = list(pool)

# Print the expanded vocabulary of each category
for cat in categories_with_synonyms:
    print(f"{cat}: {categories_with_synonyms[cat]}")


In [None]:
def print_synonyms(categories_with_synonyms, max_per_line=8):
    """Print each category's words alphabetically, `max_per_line` comma-separated words per line.

    Every category starts with a blank line followed by an "=== NAME ===" header.
    """
    for cat, words in categories_with_synonyms.items():
        print(f"\n=== {cat.upper()} ===")
        pending = []
        for position, word in enumerate(sorted(words), start=1):
            pending.append(word)
            if position % max_per_line == 0:
                # Line is full: emit it without a trailing comma
                print(", ".join(pending))
                pending = []
        if pending:
            # Remaining words that did not fill a complete line
            print(", ".join(pending))

# Example usage: print the expanded categories
print_synonyms(categories_with_synonyms)


In [1]:
def build_vllm_prompt(review_text: str) -> str:
    """Build the few-shot yes/no prompt asking whether a review involves a handicap-related need.

    The prompt has three sections (system instruction, examples, classification
    task) separated by blank lines. The assembled text is stripped, so it ends
    with "->" (the space after the arrow is removed).
    """
    # 1. Instructions: sentences joined by single spaces
    instruction_sentences = [
        "You are a strict business-travel-review classifier. Your task is to analyze a review and determine",
        "whether the traveler(s) have any type of handicap or if the reviews contains a specific needs",
        "associated with a disability (transporations, amenities, etc.).",
        "Respond strictly with 'yes' if the review indicates a handicaped traveler or a special need related to handicap travelling, or 'no' if not.",
        "ONE word only, no explanations or extra text.",
    ]
    system_instruction = " ".join(instruction_sentences)

    # 2. Few-shot examples as (review, expected answer) pairs
    labelled_examples = [
        ("Plant to go to London in September Need information about Accessible Van in London airport", "yes"),
        ("The room was great, big enough to move around in my power chair in both the bedroom and bathroom", "yes"),
        ("I would like to sell my wheelchair.please contact me", "no"),
        ("It's new digital travel magazine targeted exclusively for travelers with disabilities.", "no"),
        ("Nice roll-in shower with a pull-down bench, but the amenities were again too high", "yes"),
    ]
    example_lines = ["Here are some examples:"]
    for sample, verdict in labelled_examples:
        example_lines.append(f'Review: "{sample}" -> {verdict}')
    examples = "\n".join(example_lines)

    # 3. The review to classify, followed by the arrow that cues the answer
    task = f'Now classify this review:\n"{review_text}" -> '

    # 4. Assemble the sections, separated by blank lines
    sections = [
        "### SYSTEM INSTRUCTION ###\n" + system_instruction,
        "### EXAMPLES ###\n" + examples,
        "### CLASSIFICATION TASK ###\n" + task,
    ]
    return "\n\n".join(sections).strip()

# Example review to classify
review_text_to_test = """I chose to stay here on the basis of a previous W Hotel experience which was fantastic. I have to admit that I was left slightly disappointed. The location itself could not be better for the Washington sights and metro, but the service leaves something to be desired. If you wish to take advantage of the complimentary drop off service (within 5 miles), book well in advance to avoid long waiting times which can mess up your day. (We ended up taking a taxi).
There are two entrances to the hotel and I instinctively opted for the one without a ramp (!) and had to drag my luggage up the stairs. There was no-one to help me or even hold the door open. Had I chosen the other door, there was a ramp but on another occasion, despite the doorman, I still received no help at all.
The foyer had a display of pop art, to which I have no objection, but it rendered it feeling extremely busy and over cluttered. It also hid the check-in desk behind, which didn't help whilst dragging my luggage around! I also noted an interesting complimentary drink available near the foyer bar area but it was always empty throughout my stay so I sadly never had the opportunity to sample it.
We waited for quite some time to check in, which can happen and I, again, have no objection to that. However, what did leave me feeling irritated was the complete lack of acknowledgement despite three members of staff directly in front of us. Eventually, a lovely gentleman who I assume was the manager or supervisor, saw us waiting and gave us a quick apology and assured us that we would be seen to asap. He saved his staff on several other occasions during our stay. 
We also had problems during check-out, and again when we returned to collect our luggage. Due to the lack of space, there was a limited waiting area and we were repeatedly queue jumped by other more aggressive hotel clients. The W staff were oblivious to the growing anger of their more polite customers around them. Not a good start - or finish - to our stay.
We stayed in a 'wonderful' room - the most basic, with an internal view. Since we spent much of our time out and about, this didn't bother us. For peace and quiet in DC, request a room as high up as possible to avoid noise. I was annoyed that, despite requesting this on booking, I had to re-request on check-in, although it was changed with no qualms. Our room was on the 7th floor and we had no problems. As always with W hotels, the room was cleverly designed and luxurious. The photos don't really do it justice. No faults there at all.. although it would have been nice for such a large hotel chain to acknowledge our wedding anniversary while we were there. Other, often smaller hotels have either upgraded us, given us a little something extra in the room, or even just wished us a happy anniversary.
With a keen interest in bars and cocktails, we were intrigued to visit the highly rated 'POV' rooftop bar. The views over Washington are second to none but to call their house specialty cocktail a disappointment barely begins to cover it. I was horrified to watch the bar tender serve me a premix topped up with mixer. How can this possibly be a renowned cocktail bar in Washington?! Simple cocktail mistakes were also made behind the bar for other customers which were obvious to amateurs like us. If you are really interested in a true cocktail experience, I suggest you try 'The Passenger' (for which I have also written a review).
Do try the J&G steakhouse for dinner. The food was delicious and service good.
There is a 'Bliss' spa in the basement and a reasonably equipped gym, neither of which I had time to experience. One thing that was missing, however, was a jacuzzi/sauna for hotel client use. There is a steam room available within the spa but it is not advertised.
Overall, the location and room were up to scratch, but as for the rest... what a shame."""

# Build the prompt for the sample review and print it between two separator lines
final_prompt = build_vllm_prompt(review_text_to_test)

for printed_line in ("--- Prompt vLLM G√©n√©r√© ---", final_prompt, "--------------------------"):
    print(printed_line)

--- Prompt vLLM G√©n√©r√© ---
### SYSTEM INSTRUCTION ###
You are a strict business-travel-review classifier. Your task is to analyze a review and determine whether the traveler(s) have any type of handicap or if the reviews contains a specific needs associated with a disability (transporations, amenities, etc.). Respond strictly with 'yes' if the review indicates a handicaped traveler or a special need related to handicap travelling, or 'no' if not. ONE word only, no explanations or extra text.

### EXAMPLES ###
Here are some examples:
Review: "Plant to go to London in September Need information about Accessible Van in London airport" -> yes
Review: "The room was great, big enough to move around in my power chair in both the bedroom and bathroom" -> yes
Review: "I would like to sell my wheelchair.please contact me" -> no
Review: "It's new digital travel magazine targeted exclusively for travelers with disabilities." -> no
Review: "Nice roll-in shower with a pull-down bench, but the ame

In [None]:
import requests
import json
from typing import Dict, Any

# ==============================================================================
# 1. Prompt construction
# ==============================================================================
def build_vllm_prompt(review_text: str) -> str:
    """Build the few-shot yes/no prompt asking whether a review involves a handicap-related need.

    Full implementation rather than `(...)` placeholders: with the placeholders the
    instruction and the examples were rendered as the literal text "Ellipsis".

    Args:
        review_text: The review to classify.

    Returns:
        The prompt: system instruction, examples and classification task separated
        by blank lines, stripped of surrounding whitespace (so it ends with "->").
    """
    system_instruction = (
        "You are a strict business-travel-review classifier. Your task is to analyze a review and determine "
        "whether the traveler(s) have any type of handicap or if the reviews contains a specific needs "
        "associated with a disability (transporations, amenities, etc.). "
        "Respond strictly with 'yes' if the review indicates a handicaped traveler or a special need related to handicap travelling, or 'no' if not. "
        "ONE word only, no explanations or extra text."
    )
    examples = (
        "Here are some examples:\n"
        "Review: \"Plant to go to London in September Need information about Accessible Van in London airport\" -> yes\n"
        "Review: \"The room was great, big enough to move around in my power chair in both the bedroom and bathroom\" -> yes\n"
        "Review: \"I would like to sell my wheelchair.please contact me\" -> no\n"
        "Review: \"It's new digital travel magazine targeted exclusively for travelers with disabilities.\" -> no\n"
        "Review: \"Nice roll-in shower with a pull-down bench, but the amenities were again too high\" -> yes"
    )
    question = f"Now classify this review:\n\"{review_text}\""
    full_prompt = (
        f"### SYSTEM INSTRUCTION ###\n{system_instruction}\n\n"
        f"### EXAMPLES ###\n{examples}\n\n"
        f"### CLASSIFICATION TASK ###\n{question} -> "
    )
    return full_prompt.strip()

# ==============================================================================
# 2. Request parameters
# ==============================================================================
# Endpoint the classification request is POSTed to
API_URL = "http://localhost:8000/generate"
# The payload is sent as a JSON string
HEADERS = {"Content-Type": "application/json"}

review_text_to_test = """I chose to stay here on the basis of a previous W Hotel experience which was fantastic. I have to admit that I was left slightly disappointed. The location itself could not be better for the Washington sights and metro, but the service leaves something to be desired. If you wish to take advantage of the complimentary drop off service (within 5 miles), book well in advance to avoid long waiting times which can mess up your day. (We ended up taking a taxi).
There are two entrances to the hotel and I instinctively opted for the one without a ramp (!) and had to drag my luggage up the stairs. There was no-one to help me or even hold the door open. Had I chosen the other door, there was a ramp but on another occasion, despite the doorman, I still received no help at all.
The foyer had a display of pop art, to which I have no objection, but it rendered it feeling extremely busy and over cluttered. It also hid the check-in desk behind, which didn't help whilst dragging my luggage around! I also noted an interesting complimentary drink available near the foyer bar area but it was always empty throughout my stay so I sadly never had the opportunity to sample it.
We waited for quite some time to check in, which can happen and I, again, have no objection to that. However, what did leave me feeling irritated was the complete lack of acknowledgement despite three members of staff directly in front of us. Eventually, a lovely gentleman who I assume was the manager or supervisor, saw us waiting and gave us a quick apology and assured us that we would be seen to asap. He saved his staff on several other occasions during our stay. 
We also had problems during check-out, and again when we returned to collect our luggage. Due to the lack of space, there was a limited waiting area and we were repeatedly queue jumped by other more aggressive hotel clients. The W staff were oblivious to the growing anger of their more polite customers around them. Not a good start - or finish - to our stay.
We stayed in a 'wonderful' room - the most basic, with an internal view. Since we spent much of our time out and about, this didn't bother us. For peace and quiet in DC, request a room as high up as possible to avoid noise. I was annoyed that, despite requesting this on booking, I had to re-request on check-in, although it was changed with no qualms. Our room was on the 7th floor and we had no problems. As always with W hotels, the room was cleverly designed and luxurious. The photos don't really do it justice. No faults there at all.. although it would have been nice for such a large hotel chain to acknowledge our wedding anniversary while we were there. Other, often smaller hotels have either upgraded us, given us a little something extra in the room, or even just wished us a happy anniversary.
With a keen interest in bars and cocktails, we were intrigued to visit the highly rated 'POV' rooftop bar. The views over Washington are second to none but to call their house specialty cocktail a disappointment barely begins to cover it. I was horrified to watch the bar tender serve me a premix topped up with mixer. How can this possibly be a renowned cocktail bar in Washington?! Simple cocktail mistakes were also made behind the bar for other customers which were obvious to amateurs like us. If you are really interested in a true cocktail experience, I suggest you try 'The Passenger' (for which I have also written a review).
Do try the J&G steakhouse for dinner. The food was delicious and service good.
There is a 'Bliss' spa in the basement and a reasonably equipped gym, neither of which I had time to experience. One thing that was missing, however, was a jacuzzi/sauna for hotel client use. There is a steam room available within the spa but it is not advertised.
Overall, the location and room were up to scratch, but as for the rest... what a shame."""

final_prompt = build_vllm_prompt(review_text_to_test)

# ==============================================================================
# 3. Payload construction (no 'model' key)
# ==============================================================================
payload: Dict[str, Any] = {
    "prompt": final_prompt,
    "max_tokens": 1,           # very low, to force a one-word answer ('yes' or 'no')
    "temperature": 0.0,        # zero temperature for a deterministic classification
    "stop": ["\n", "\r"],      # stop if the model emits a new line after 'yes' or 'no'
}

print(f"Envoi de la requête avec prompt de longueur: {len(final_prompt)} caractères.")

# ==============================================================================
# 4. Send the request and process the answer
# ==============================================================================
try:
    # Explicit timeout: without one, requests waits indefinitely on a stalled server.
    response = requests.post(API_URL, headers=HEADERS, data=json.dumps(payload), timeout=60)
    response.raise_for_status()

    data = response.json()

    # First word of the completion, or None when nothing usable came back
    classification_result = None
    if 'text' in data and isinstance(data['text'], list) and data['text']:
        raw_completion = data['text'][0]

        # The returned text may repeat the prompt before the answer, so the
        # prompt is removed and the remainder is split into words.
        completion_words = raw_completion.replace(final_prompt, "").strip().split()
        if completion_words:
            # Guarded: indexing [0] on an empty completion would raise IndexError.
            classification_result = completion_words[0].lower()

    if classification_result is not None:
        print(f"\n✅ Classification Réussie :")
        print(f"   Texte de la Revue : \"{review_text_to_test}\"")
        print(f"   Résultat : **{classification_result}**")
    else:
        print("\nERREUR: La réponse du modèle est vide ou a un format inattendu.")
        print(json.dumps(data, indent=4))

except requests.exceptions.RequestException as e:
    print(f"\n🛑 ERREUR DE CONNEXION: Assurez-vous que le serveur vLLM est démarré.")
    print(f"Détails : {e}")

In [None]:
import pandas as pd
import grequests
import json
import time
from concurrent.futures import ProcessPoolExecutor
from typing import List, Dict, Any

# ==============================================================================
# Processing configuration
# ==============================================================================
# Input file, the column holding the review text, and the output file
INPUT_CSV_PATH = "../../data/processed/data_validated/validated_data_accessiblego.csv"
REVIEW_COLUMN_NAME = "review"
OUTPUT_CSV_PATH = "reviews_classified_results.csv"

# Generation endpoint and request headers
API_URL = "http://localhost:8000/generate"
HEADERS = {"Content-Type": "application/json"}

# MAX_WORKERS is passed as `size` to grequests.map; BATCH_SIZE is presumably the
# number of reviews per batch (its use is not visible here — confirm).
MAX_WORKERS, BATCH_SIZE = 32, 250

# Generation parameters merged into every request payload
MODEL_PARAMS = dict(
    max_tokens=1,
    temperature=0.0,
    stop=["\n", "\r"],
)

# ==============================================================================
# Prompt construction
# ==============================================================================
def build_vllm_prompt(review_text: str) -> str:
    """Build the few-shot yes/no prompt asking whether a review involves a handicap-related need.

    Three sections (system instruction, examples, classification task) are joined
    by blank lines; the result is stripped, so it ends with "->".
    """
    # Instructions: sentences joined by single spaces
    instruction_sentences = [
        "You are a strict review classifier. Your task is to analyze a review and determine",
        "whether the traveler(s) have any type of handicap or if the reviews contains a specific needs",
        "associated with a disability (transporations, amenities, etc.).",
        "Respond strictly with 'yes' if the review indicates a handicaped traveler or a special need related to handicap travelling, or 'no' if not.",
        "ONE word only, no explanations or extra text.",
    ]
    system_instruction = " ".join(instruction_sentences)

    # Few-shot examples as (review, expected answer) pairs
    labelled_examples = [
        ("Plant to go to London in September Need information about Accessible Van in London airport", "yes"),
        ("The room was great, big enough to move around in my power chair in both the bedroom and bathroom", "yes"),
        ("I would like to sell my wheelchair.please contact me", "no"),
        ("It's new digital travel magazine targeted exclusively for travelers with disabilities.", "no"),
        ("Nice roll-in shower with a pull-down bench, but the amenities were again too high", "yes"),
    ]
    example_lines = ["Here are some examples:"]
    for sample, verdict in labelled_examples:
        example_lines.append(f'Review: "{sample}" -> {verdict}')
    examples = "\n".join(example_lines)

    # The review to classify, followed by the arrow that cues the answer
    task = f'Now classify this review:\n"{review_text}" -> '

    sections = [
        "### SYSTEM INSTRUCTION ###\n" + system_instruction,
        "### EXAMPLES ###\n" + examples,
        "### CLASSIFICATION TASK ###\n" + task,
    ]
    return "\n\n".join(sections).strip()

# ==============================================================================
# Asynchronous error handling
# ==============================================================================
def handle_exception(request, exception):
    """Exception handler given to grequests.map: a failed request yields None.

    Both arguments are ignored; the None result is what the response loop
    checks to detect a network error or timeout.
    """
    failed_response = None
    return failed_response

def _label_from_response(response, prompt_sent: str) -> str:
    """Convert one API response (or ``None``) into a label or an ``ERROR_*`` marker."""
    # A None response is what the grequests exception handler leaves behind.
    if response is None:
        return "ERROR_NETWORK_TIMEOUT"

    try:
        response.raise_for_status()
        data = response.json()

        if 'text' not in data or not data['text']:
            return "ERROR_FORMAT_NO_TEXT"

        # Remove the echoed prompt, then keep the first whitespace-separated token.
        tokens = data['text'][0].replace(prompt_sent, "").split()
        if not tokens:
            # Previously this raised IndexError and was reported as "ERROR_HTTP_IndexError".
            return "ERROR_EMPTY_COMPLETION"
        return tokens[0].lower()

    except Exception as e:
        return f"ERROR_HTTP_{e.__class__.__name__}"


def process_batch(batch_data: pd.DataFrame) -> pd.DataFrame:
    """Send every review of a batch to the vLLM API concurrently and label the rows.

    One POST request is prepared per row (prompt built from the
    REVIEW_COLUMN_NAME column), then all requests are dispatched together with
    ``grequests.map`` using at most MAX_WORKERS concurrent requests.

    Args:
        batch_data: Rows to classify; must contain the REVIEW_COLUMN_NAME column.

    Returns:
        A DataFrame holding the original row values plus a
        ``classification_result`` column. That column contains the lower-cased
        first token of the completion, or an ``ERROR_*`` marker
        (``ERROR_NETWORK_TIMEOUT``, ``ERROR_FORMAT_NO_TEXT``,
        ``ERROR_EMPTY_COMPLETION``, ``ERROR_HTTP_<ExceptionName>``).
    """
    # Rows, prompts and requests are kept in parallel lists; responses are
    # matched back to rows by position.
    row_dicts: List[Dict[str, Any]] = []
    prompts: List[str] = []
    requests_to_send = []

    for _, row in batch_data.iterrows():
        row_dict = row.to_dict()
        prompt = build_vllm_prompt(str(row_dict[REVIEW_COLUMN_NAME]))

        requests_to_send.append(
            grequests.post(
                API_URL,
                headers=HEADERS,
                data=json.dumps({"prompt": prompt, **MODEL_PARAMS}),
                timeout=60,
            )
        )
        row_dicts.append(row_dict)
        prompts.append(prompt)

    # Dispatch the whole batch at once; failed requests come back as None.
    responses = grequests.map(requests_to_send, exception_handler=handle_exception, size=MAX_WORKERS)

    results_list: List[Dict[str, Any]] = []
    for row_dict, prompt_sent, response in zip(row_dicts, prompts, responses):
        row_dict['classification_result'] = _label_from_response(response, prompt_sent)
        results_list.append(row_dict)

    return pd.DataFrame(results_list)

# ==============================================================================
# 🏃 Execution loop
# ==============================================================================
def run_inference_pipeline():
    """Load the review CSV, classify it batch by batch, and save the labelled rows.

    Batches are handled one after another; the concurrent HTTP calls happen
    inside ``process_batch``. A batch that raises is reported and skipped.
    The output CSV is written only if at least one batch produced results.

    Reads the module-level settings INPUT_CSV_PATH, REVIEW_COLUMN_NAME,
    BATCH_SIZE and OUTPUT_CSV_PATH. Returns ``None``; progress and errors are
    printed.
    """
    print(f"Chargement du dataset depuis : {INPUT_CSV_PATH}")
    try:
        df = pd.read_csv(INPUT_CSV_PATH)
    except FileNotFoundError:
        print(f"üõë ERREUR: Fichier introuvable √† {INPUT_CSV_PATH}. V√©rifiez le chemin.")
        return

    if REVIEW_COLUMN_NAME not in df.columns:
        print(f"üõë ERREUR: Colonne '{REVIEW_COLUMN_NAME}' non trouv√©e dans le CSV.")
        print(f"Colonnes disponibles : {list(df.columns)}")
        return

    total_reviews = len(df)
    print(f"Nombre total de reviews √† inf√©rer : {total_reviews}")

    # Split the frame into positional slices of at most BATCH_SIZE rows.
    list_of_batches = [df.iloc[i:i + BATCH_SIZE] for i in range(0, total_reviews, BATCH_SIZE)]
    print(f"Divis√© en {len(list_of_batches)} lots de taille {BATCH_SIZE}.")

    start_time = time.time()

    all_results_dfs = []
    # Counts rows actually classified, so a skipped batch no longer inflates the progress figures.
    processed_count = 0

    print(f"\nüöÄ D√©marrage de l'inf√©rence en lots s√©quentiels...")

    for i, batch in enumerate(list_of_batches):
        try:
            result_df = process_batch(batch)
        except Exception as e:
            # Covers failures raised inside process_batch itself (outside grequests.map).
            print(f"‚ö†Ô∏è ERREUR CRITIQUE lors du traitement du lot {i}: {e}. Ce lot sera ignor√©.")
            continue

        all_results_dfs.append(result_df)
        processed_count += len(result_df)

        # Progress report after every batch.
        progress_percent = (i + 1) / len(list_of_batches) * 100
        elapsed = time.time() - start_time
        speed = processed_count / elapsed if elapsed > 0 else 0
        print(f"Progression: {i + 1}/{len(list_of_batches)} lots ({progress_percent:.1f}%) | "
              f"{processed_count}/{total_reviews} reviews | Vitesse: {speed:.1f} req/s")

    if all_results_dfs:
        final_df = pd.concat(all_results_dfs, ignore_index=True)
        final_df.to_csv(OUTPUT_CSV_PATH, index=False)

        total_elapsed = time.time() - start_time
        # Guard against a zero-length interval on coarse clocks.
        average_speed = len(final_df) / total_elapsed if total_elapsed > 0 else 0

        print(f"\n--- Inf√©rence Termin√©e ---")
        print(f"Total des reviews class√©es : {len(final_df)}")
        print(f"Temps total √©coul√© : {total_elapsed:.2f} secondes")
        print(f"Vitesse moyenne : {average_speed:.2f} requ√™tes/seconde")
        print(f"‚úÖ R√©sultats sauvegard√©s dans : {OUTPUT_CSV_PATH}")
    else:
        print("üõë AUCUN r√©sultat n'a √©t√© trait√©.")


# Entry point: start the pipeline only when this code runs as the main module.
if __name__ == "__main__":
    run_inference_pipeline()

In [4]:
import pandas as pd
from collections import Counter

# Load the labelled reviews written by the classification run.
df_classified = pd.read_csv("reviews_classified_results.csv")

# NOTE: despite its name, `unique_values` is the full label column
# (one entry per review), not a de-duplicated list.
unique_values = df_classified["classification_result"]

# Tally how many reviews received each label; labels appear in first-seen order.
counter = Counter(unique_values)
for label, occurrences in counter.items():
    print(f"Nombre de valeurs pour {label} :  {occurrences}")

Nombre de valeurs pour yes :  1330
Nombre de valeurs pour ERROR_HTTP_HTTPError :  25
Nombre de valeurs pour no :  132


In [None]:
import pandas as pd
import httpx
import json
import time
import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict, Any, Tuple

# ==============================================================================
# 🛠️ Processing configuration
# ==============================================================================
INPUT_CSV_PATH = "../../data/processed/data_validated/validated_data_european_hotel_reviews.csv"
REVIEW_COLUMN_NAME = "review"
OUTPUT_CSV_PATH = "reviews_classified_results_async_retry.csv"

API_URL = "http://localhost:8000/generate"
HEADERS = {"Content-Type": "application/json"}

# Concurrency settings (tuned to reduce saturation; re-test after adjusting vLLM)
# NOTE(review): every batch opens its own HTTP client, so up to
# MAX_WORKERS_THREADPOOL * MAX_CONCURRENT_REQUESTS (32 * 18 = 576) connections
# may be open at once -- confirm this matches the intended load on vLLM.
MAX_CONCURRENT_REQUESTS = 18  # Connection cap per batch client; lowered to avoid saturating the vLLM pool
MAX_WORKERS_THREADPOOL = 32   # Number of batches processed in parallel
BATCH_SIZE = 250             # Reduced batch size

# Retry settings
MAX_RETRIES = 3              # Maximum number of attempts (first attempt + 2 retries)
BASE_RETRY_DELAY = 5         # Initial wait in seconds, doubled after each failed attempt (raised to relieve vLLM)
TIMEOUT = 180                # Timeout in seconds for each request (raised because of ReadTimeout errors)

MODEL_PARAMS = {
    "max_tokens": 1,
    "temperature": 0.0,
    "stop": ["\n", "\r"],
}

# ==============================================================================
# 🧠 Prompt construction
# ==============================================================================
def build_vllm_prompt(review_text: str) -> str:
    """Build the few-shot prompt asking whether a review involves a disability.

    The prompt has three "###"-headed sections separated by blank lines: the
    classifier instruction, five labelled examples, and the review to classify
    (wrapped in double quotes and followed by an arrow).

    Args:
        review_text: Review text, inserted verbatim between double quotes
            (no escaping is applied).

    Returns:
        The assembled prompt with surrounding whitespace stripped, so it ends
        with ``->`` and no trailing space.
    """
    # Fragments are concatenated without a separator; each carries its own trailing space.
    instruction_fragments = (
        "You are a strict business-travel-review classifier. Your task is to analyze a review and determine ",
        "whether the traveler(s) have any type of handicap or if the reviews contains a specific needs ",
        "associated with a disability (transporations, amenities, etc.). ",
        "Respond strictly with 'yes' if the review indicates a handicaped traveler or a special need related to handicap travelling, or 'no' if not. ",
        "ONE word only, no explanations or extra text.",
    )

    # (review, expected label) pairs, rendered as: Review: "<review>" -> <label>
    few_shot_pairs = (
        ("Plant to go to London in September Need information about Accessible Van in London airport", "yes"),
        ("The room was great, big enough to move around in my power chair in both the bedroom and bathroom", "yes"),
        ("I would like to sell my wheelchair.please contact me", "no"),
        ("It's new digital travel magazine targeted exclusively for travelers with disabilities.", "no"),
        ("Nice roll-in shower with a pull-down bench, but the amenities were again too high", "yes"),
    )
    example_lines = ["Here are some examples:"]
    example_lines.extend(f'Review: "{sample}" -> {label}' for sample, label in few_shot_pairs)

    sections = (
        "### SYSTEM INSTRUCTION ###\n" + "".join(instruction_fragments),
        "### EXAMPLES ###\n" + "\n".join(example_lines),
        f'### CLASSIFICATION TASK ###\nNow classify this review:\n"{review_text}" -> ',
    )

    return "\n\n".join(sections).strip()

# ==============================================================================
# 🚀 Asynchronous functions (async/await)
# ==============================================================================

async def classify_single_review(
    client: httpx.AsyncClient,
    review_data: Dict[str, Any],
    prompt: str,
) -> Tuple[Dict[str, Any], str]:
    """Classify one review through the vLLM API, retrying transient transport errors.

    Args:
        client: Async HTTP client used to POST to API_URL.
        review_data: Source row for the review; it is copied, not mutated.
        prompt: Full prompt text sent to the model.

    Returns:
        A ``(row_copy, label)`` tuple. ``label`` is the lower-cased first
        whitespace-separated token of the completion, or one of these markers:

        * ``ERROR_FORMAT_NO_TEXT`` -- the JSON body has no usable ``text`` entry.
        * ``ERROR_EMPTY_COMPLETION`` -- the completion is empty once the echoed
          prompt is removed.
        * ``ERROR_FINAL_RECOVERABLE_<Name>`` -- timeout/read/connect error on the
          last allowed attempt.
        * ``ERROR_HTTP_<status>`` -- non-success HTTP status (not retried).
        * ``ERROR_EXCEPTION_<Name>`` -- any other ``Exception`` (not retried).
        * ``ERROR_UNSPECIFIED_PROCESSING`` -- no attempt was made
          (``MAX_RETRIES <= 0``).
    """
    original_data = review_data.copy()
    payload = {"prompt": prompt, **MODEL_PARAMS}

    # Bound before the loop so the fallback return below stays valid even when
    # MAX_RETRIES <= 0 and the loop body never runs.
    classification = "ERROR_UNSPECIFIED_PROCESSING"

    for attempt in range(MAX_RETRIES):
        try:
            response = await client.post(
                API_URL,
                headers=HEADERS,
                json=payload,
                timeout=TIMEOUT,
            )
            response.raise_for_status()
            data = response.json()

            if 'text' not in data or not data['text']:
                # Malformed body: retrying would not help.
                return original_data, "ERROR_FORMAT_NO_TEXT"

            # Remove the echoed prompt, then keep the first whitespace-separated token.
            tokens = data['text'][0].replace(prompt, "").split()
            if not tokens:
                # Previously raised IndexError and surfaced as "ERROR_EXCEPTION_IndexError".
                return original_data, "ERROR_EMPTY_COMPLETION"
            return original_data, tokens[0].lower()

        # Transient transport failures: wait with exponential backoff, then retry.
        except (httpx.TimeoutException, httpx.ReadError, httpx.ConnectError) as e:
            error_type = e.__class__.__name__

            if attempt >= MAX_RETRIES - 1:
                # Out of attempts.
                return original_data, f"ERROR_FINAL_RECOVERABLE_{error_type}"

            delay = BASE_RETRY_DELAY * (2 ** attempt)
            print(f"  [ATTENTION] T√¢che √©chou√©e ({error_type}, Tentative {attempt + 1}/{MAX_RETRIES}). Attente de {delay:.1f}s avant re-tentative.")
            await asyncio.sleep(delay)

        # Non-success HTTP status: reported immediately, no retry.
        except httpx.HTTPStatusError as e:
            return original_data, f"ERROR_HTTP_{e.response.status_code}"

        except Exception as e:
            return original_data, f"ERROR_EXCEPTION_{e.__class__.__name__}"

    return original_data, classification  # Fallback when no attempt produced a result


async def process_batch_async(batch_data: pd.DataFrame) -> pd.DataFrame:
    """Classify every review of one batch concurrently and return the labelled rows.

    A dedicated ``httpx.AsyncClient`` is opened for the batch, with its
    connection pool capped at MAX_CONCURRENT_REQUESTS. One
    ``classify_single_review`` coroutine is created per row and all of them are
    awaited together.

    Args:
        batch_data: Rows to classify; must contain the REVIEW_COLUMN_NAME column.

    Returns:
        A DataFrame with the row values returned by ``classify_single_review``
        plus a ``classification_result`` column.
    """
    # The pool limit controls how many requests of this batch run at once.
    pool_limits = httpx.Limits(max_connections=MAX_CONCURRENT_REQUESTS, max_keepalive_connections=20)

    async with httpx.AsyncClient(limits=pool_limits) as client:
        pending = [
            classify_single_review(
                prompt=build_vllm_prompt(str(row[REVIEW_COLUMN_NAME])),
                review_data=row.to_dict(),
                client=client,
            )
            for _, row in batch_data.iterrows()
        ]

        # Run the whole batch together; results come back in submission order.
        outcomes = await asyncio.gather(*pending, return_exceptions=False)

        labelled_rows: List[Dict[str, Any]] = []
        for row_values, label in outcomes:
            row_values['classification_result'] = label
            labelled_rows.append(row_values)

        return pd.DataFrame(labelled_rows)


def run_batch_in_loop(batch_data: pd.DataFrame) -> pd.DataFrame:
    """Synchronous wrapper that runs ``process_batch_async`` for one batch.

    The coroutine is driven to completion with ``asyncio.run`` and its result
    is returned unchanged.
    """
    batch_coroutine = process_batch_async(batch_data)
    return asyncio.run(batch_coroutine)


# ==============================================================================
# 🏃 Execution loop
# ==============================================================================
def run_inference_pipeline():
    """Load the review CSV, classify its batches in worker threads, and save the results.

    Every batch is submitted to a ThreadPoolExecutor (MAX_WORKERS_THREADPOOL
    workers), each running ``run_batch_in_loop``. Results are collected in
    submission order; a batch whose future raises is reported and skipped.
    The output CSV is written only if at least one batch produced results.

    Reads the module-level settings INPUT_CSV_PATH, REVIEW_COLUMN_NAME,
    BATCH_SIZE, MAX_RETRIES, BASE_RETRY_DELAY, TIMEOUT and OUTPUT_CSV_PATH.
    Returns ``None``; progress and errors are printed.
    """
    print(f"Chargement du dataset depuis : {INPUT_CSV_PATH}")
    try:
        reviews_df = pd.read_csv(INPUT_CSV_PATH)
    except FileNotFoundError:
        print(f"üõë ERREUR: Fichier introuvable √† {INPUT_CSV_PATH}. V√©rifiez le chemin.")
        return

    if REVIEW_COLUMN_NAME not in reviews_df.columns:
        print(f"üõë ERREUR: Colonne '{REVIEW_COLUMN_NAME}' non trouv√©e dans le CSV.")
        print(f"Colonnes disponibles : {list(reviews_df.columns)}")
        return

    review_total = len(reviews_df)
    print(f"Nombre total de reviews √† inf√©rer : {review_total}")

    # Cut the frame into consecutive slices of at most BATCH_SIZE rows.
    batch_starts = range(0, review_total, BATCH_SIZE)
    batches = [reviews_df[start:start + BATCH_SIZE] for start in batch_starts]
    batch_total = len(batches)
    print(f"Divis√© en {batch_total} lots de taille {BATCH_SIZE}.")
    print(f"Syst√®me de Re-tentative activ√© : {MAX_RETRIES} tentatives max, d√©lai initial de {BASE_RETRY_DELAY}s, Timeout de {TIMEOUT}s.")

    started_at = time.time()
    completed_frames = []

    print("\nüöÄ D√©marrage de l'inf√©rence avec ThreadPoolExecutor...")

    with ThreadPoolExecutor(max_workers=MAX_WORKERS_THREADPOOL) as executor:
        # Submit everything up front; futures are then consumed in submission order.
        submitted = [executor.submit(run_batch_in_loop, batch) for batch in batches]

        for batch_index, future in enumerate(submitted):
            try:
                # Blocks until this batch has finished.
                completed_frames.append(future.result())

                # Progress is computed from the rows actually returned so far.
                rows_done = sum(map(len, completed_frames))
                percent_done = rows_done / review_total * 100
                elapsed = time.time() - started_at
                rate = rows_done / elapsed if elapsed > 0 else 0

                print(f"Progression: {len(completed_frames)}/{batch_total} lots ({percent_done:.1f}%) | "
                      f"{rows_done}/{review_total} reviews | Vitesse: {rate:.1f} req/s")

            except Exception as e:
                print(f"‚ö†Ô∏è ERREUR CRITIQUE lors du traitement du lot {batch_index}: {e}. Ce lot sera ignor√©.")

    if not completed_frames:
        print("üõë AUCUN r√©sultat n'a √©t√© trait√©.")
        return

    final_df = pd.concat(completed_frames, ignore_index=True)
    final_df.to_csv(OUTPUT_CSV_PATH, index=False)

    finished_at = time.time()

    print("\n--- Inf√©rence Termin√©e ---")
    print(f"Total des reviews class√©es : {len(final_df)}")
    print(f"Temps total √©coul√© : {finished_at - started_at:.2f} secondes")
    print(f"Vitesse moyenne : {len(final_df) / (finished_at - started_at):.2f} requ√™tes/seconde")
    print(f"‚úÖ R√©sultats sauvegard√©s dans : {OUTPUT_CSV_PATH}")


# Entry point: start the pipeline only when this code runs as the main module.
if __name__ == "__main__":
    run_inference_pipeline()

Chargement du dataset depuis : ../../data/processed/data_validated/validated_data_european_hotel_reviews.csv
Nombre total de reviews √† inf√©rer : 15885
Divis√© en 64 lots de taille 250.
Syst√®me de Re-tentative activ√© : 3 tentatives max, d√©lai initial de 5s, Timeout de 180s.

üöÄ D√©marrage de l'inf√©rence avec ThreadPoolExecutor...
  [ATTENTION] T√¢che √©chou√©e (ReadError, Tentative 1/3). Attente de 5.0s avant re-tentative.
  [ATTENTION] T√¢che √©chou√©e (ReadError, Tentative 1/3). Attente de 5.0s avant re-tentative.
  [ATTENTION] T√¢che √©chou√©e (ReadError, Tentative 1/3). Attente de 5.0s avant re-tentative.
  [ATTENTION] T√¢che √©chou√©e (ReadError, Tentative 1/3). Attente de 5.0s avant re-tentative.
  [ATTENTION] T√¢che √©chou√©e (ReadError, Tentative 1/3). Attente de 5.0s avant re-tentative.
  [ATTENTION] T√¢che √©chou√©e (ReadError, Tentative 1/3). Attente de 5.0s avant re-tentative.
  [ATTENTION] T√¢che √©chou√©e (ReadError, Tentative 1/3). Attente de 5.0s avant re-tentati

In [None]:
import pandas as pd
from collections import Counter

# Read the file written by the retry-enabled pipeline cell. The path must match
# its OUTPUT_CSV_PATH; the previous value ("reviews_classified_results_async.csv")
# pointed at a different file than the one that cell writes.
df_classified = pd.read_csv("reviews_classified_results_async_retry.csv")

# NOTE: despite its name, `unique_values` is the full label column
# (one entry per review), not a de-duplicated list.
unique_values = df_classified["classification_result"]

# Tally how many reviews received each label; labels appear in first-seen order.
counter = Counter(unique_values)
for k, v in counter.items():
    print(f"Nombre de valeurs pour {k} :  {v}")