In [1]:
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using", device, "device")

Using cuda device


In [None]:
import torch
from transformers import pipeline
from tqdm import tqdm
import pandas as pd

model_id = "meta-llama/Llama-3.2-3B"

pipe = pipeline(
    "text-generation", 
    model=model_id, 
    torch_dtype=torch.bfloat16, 
    device_map="auto"
)

pipe("Negative reviews of hotels")

# Define prompts for different star ratings
prompts = {
    1: "Write a very negative hotel review, describing poor service, dirty rooms, rude staff, and long wait times. Make it very clear that the guest had an unpleasant experience. In the subject line of your email, you should include the words “I’m writing this review because…” This will make it easier for the hotel to identify the review as a complaint.",
    2: "Write a negative hotel review, mentioning average service, some cleanliness issues, and minor inconvenience during the stay.",
    3: "Write a neutral hotel review, highlighting some good aspects but mentioning a few flaws.",
    4: "Write a positive hotel review, mentioning clean rooms, good service, and pleasant stay.",
    5: "Write an excellent hotel review praising the hotel staff, cleanliness, amenities, and overall experience."
}

# Generate fake reviews
reviews = []
number_of_reviews = 10
total_reviews = len(prompts) * number_of_reviews  # 20 reviews per rating

with tqdm(total=total_reviews) as pbar:
    for rating, prompt in prompts.items():
        for _ in range(number_of_reviews):  # Generate 20 reviews for each rating
            generated = pipe(
                prompt,
                max_new_tokens=200,  # Set max length of generated text
                do_sample=True,      # Enable sampling for diverse outputs
                top_k=50,            # Use top-k sampling
                top_p=0.9,           # Narrow the sampling to the top 90% most likely tokens
                temperature=1,       # Randomness in output
                truncation=True,     # Explicitly enable truncation
                pad_token_id=50256   # Padding token ID for models like GPT-2/3
            )
            # Remove the prompt from the generated text
            full_text = generated[0]["generated_text"]
            review = full_text[len(prompt):].strip()  # Remove the prompt from the start
            reviews.append({"review": review, "rating": rating})
            pbar.update(1)

import pandas as pd
df = pd.DataFrame(reviews)
display(df)

In [1]:
import torch
from transformers import pipeline
from tqdm import tqdm
import pandas as pd


device = "cuda" if torch.cuda.is_available() else "cpu"
display(f"Device: {device}")
model_id = "meta-llama/Llama-3.2-3B-Instruct"
display(f"Model: {model_id}")

pipe = pipeline(
    model=model_id, 
    torch_dtype=torch.bfloat16, 
    device_map="auto",  #"balanced_low_0",
)

# Define review prompts for different ratings (with roles for the system and user)
review_prompts = [
    {"rating": 1, "prompt": "Write a very negative hotel review with poor service, dirty rooms, rude staff, and long wait times."},  # Rating 1 (Very Negative)
    {"rating": 2, "prompt": "Write a negative hotel review mentioning some problems, like slow service or slightly dirty rooms, but not extreme."},  # Rating 2 (Negative)
    {"rating": 3, "prompt": "Write a neutral hotel review describing an average experience with no strong feelings about the stay."},  # Rating 3 (Neutral)
    #{"rating": 4, "prompt": "Write a positive hotel review praising the clean rooms, friendly staff, and great service."},  # Rating 4 (Positive)
    #{"rating": 5, "prompt": "Write a very positive hotel review emphasizing an excellent stay, top-notch service, and cleanliness."}  # Rating 5 (Very Positive)
]
# Initialize a list to store the generated reviews
generated_reviews = []
number_of_reviews = 10
# Loop through each review prompt and generate text for each
for review in tqdm(review_prompts, desc="Generating reviews", ncols=100):
    for _ in range(number_of_reviews):  # Generate 10 reviews per rating
        # Create the role-based structured prompt
        messages = [
            {"role": "system", "content": "You are a guest how is writing a review of a hotel you visit. Use the prompt as referemce but do not start with the same sentence"},
            {"role": "user", "content": review["prompt"]}
        ]
        
        # Generate the review text using the pipeline
        outputs = pipe(
            messages,
            max_new_tokens=300,
            num_return_sequences=1,  # Only generate one review per prompt
            do_sample=True,
            temperature=1.0,
            top_p=1,
            pad_token_id=50256,
        )
        
        # Extract the generated review text
        generated_text =  outputs[0]['generated_text'][2]['content'].replace("\n", "")
        # Store the generated review along with the rating in the list
        generated_reviews.append({
            "rating": review["rating"], 
            "review": generated_text
        })

# Create a DataFrame from the list of generated reviews
df_reviews = pd.DataFrame(generated_reviews)
# Display the DataFrame
print(df_reviews)
# Optionally, save the DataFrame to a CSV file
df_reviews.to_csv("hotel_reviews.csv", index=False)

'Device: cuda'

'Model: meta-llama/Llama-3.2-3B-Instruct'

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generating reviews:  33%|███████████████                              | 1/3 [01:02<02:05, 62.86s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating reviews: 100%|█████████████████████████████████████████████| 3/3 [03:01<00:00, 60.55s/it]

    rating                                             review
0        1  **Avoid This Place at All Costs - A Nightmaris...
1        1  Title: A Disaster from Start to Finish - Avoid...
2        1  I've had the misfortune of staying at this hot...
3        1  **1/5 Stars - A Descent into Chaos at the 'Com...
4        1  Title: A Stay to Forget: A Disaster of Epic Pr...
5        1  Title: A Disappointing Stay at This Once-Promi...
6        1  Title: A Scathing Disappointment - Avoid This ...
7        1  **A Disaster at the Horizon Hotel**I'm still t...
8        1  Title: Avoid This Hotel at All Costs - Overpri...
9        1  I'm still trying to shake off the lingering fe...
10       2  Title: A Disappointing Stay at the Oakwood Hot...
11       2  I recently spent a weekend at the Oakwood Hote...
12       2  My recent stay at this hotel was a decent expe...
13       2  Average Experience at This AccommodationI rece...
14       2  **Disappointing Stay at Oakwood Hotel**I recen...
15      




In [1]:
import torch
from transformers import pipeline
from tqdm import tqdm
import pandas as pd

# Configuración del modelo
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Inicialización del pipeline con optimización de memoria
pipe = pipeline(
    model=model_id, 
    torch_dtype=torch.bfloat16, 
    device_map="auto",
)

# Prompts para generar reviews
review_prompts = [
    {"rating": 1, "prompt": "Write a very negative hotel review with poor service, dirty rooms, rude staff, and long wait times."},  
    {"rating": 2, "prompt": "Write a negative hotel review mentioning some problems, like slow service or slightly dirty rooms, but not extreme."},  
    #{"rating": 3, "prompt": "Write a neutral hotel review describing an average experience with no strong feelings about the stay."},  
]

# Cantidad de reviews a generar por rating
number_of_reviews = 250

# Preparación de mensajes para el dataset
data = []
for review in review_prompts:
    for _ in range(number_of_reviews):
        messages = [
            {"role": "system", "content": "You are a guest writing a review of a hotel you visited. Use the prompt as a reference but do not start with the same sentence."},
            {"role": "user", "content": review["prompt"]}
        ]
        data.append({"rating": review["rating"], "messages": messages})

# Generación en lote utilizando tqdm para el progreso
batch_size = 16  # Ajusta según la capacidad de tu GPU
generated_reviews = []

for i in tqdm(range(0, len(data), batch_size), desc="Generating reviews", ncols=100):
    batch = data[i:i+batch_size]
    messages_batch = [item["messages"] for item in batch]
    ratings_batch = [item["rating"] for item in batch]

    # Generar en lote
    outputs = pipe(
        messages_batch,
        max_new_tokens=300,
        num_return_sequences=1,
        do_sample=True,
        temperature=1.0,
        top_p=1,
        pad_token_id=50256,
    )
    
    # Extraer y almacenar resultados
    for output, rating in zip(outputs, ratings_batch):
        generated_text = output[0]['generated_text'][2]['content'].replace("\n\n", " ")
        generated_text = output[0]['generated_text'][2]['content'].replace("\n", " ")
        generated_reviews.append({"rating": rating, "review": generated_text})

# Crear un DataFrame con los resultados
df_reviews = pd.DataFrame(generated_reviews)

# Guardar en un archivo CSV
df_reviews.to_csv("hotel_reviews.csv", index=False)

print(df_reviews)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generating reviews:  31%|█████████████▏                            | 10/32 [16:46<36:42, 100.10s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating reviews: 100%|██████████████████████████████████████████| 32/32 [55:28<00:00, 104.00s/it]

     rating                                             review
0         1  Title: A Disaster of a Stay: Avoid This Hotel ...
1         1  Title: A Disaster of a Stay - Avoid This Hotel...
2         1  I had the misfortune of staying at this hotel ...
3         1  Title: An Unacceptable Nightmare - Avoid This ...
4         1  Title: AVOID THIS HOTEL AT ALL COSTS  During m...
..      ...                                                ...
495       2  My recent stay at the Oakwood Hotel was a mixe...
496       2  My recent stay at the Grand Plaza Hotel was a ...
497       2  Title: A Solid but Not Stellar Stay  I recentl...
498       2  **A Solid but Not Stellar Stay**  During my re...
499       2  My recent stay at this hotel was a mixed bag, ...

[500 rows x 2 columns]



