In [1]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import ast
import re
import torch
import csv
import pandas as pd

# CSV of store categories; load_categories() reads its "category" column.
categories_path = "./helper_data/categories.csv"
# Model identifier handed to AutoTokenizer / AutoModelForCausalLM.from_pretrained().
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# NOTE(review): not referenced by any cell visible in this notebook; the
# "irelevant" spelling presumably matches the file name on disk — confirm before renaming.
irrelevant_reviews_path = "./raw/irelevant_reviews.csv"
# CSV that the generation loop appends the generated indirect reviews to.
indirect_reviews_path = "./raw/indirect_reviews.csv"

In [2]:
def load_categories(path):
    """Return the values of the ``category`` column of a CSV file as a list.

    Args:
        path: Location of a CSV file that has a ``category`` column.

    Returns:
        A list with one entry per CSV row, in file order.
    """
    return pd.read_csv(path)["category"].tolist()

def generate_specific_prompt(category):
    """Build the prompt requesting five negative, category-specific reviews.

    Args:
        category: Store category name inserted into the ``Category:`` line.

    Returns:
        The complete prompt text with ``category`` substituted in.
    """
    # The template has a single placeholder, so str.format renders the same
    # text an f-string would.
    template = """
You are a creative assistant tasked with writing **realistic customer reviews** for a store of a specific category. 
**Only content relevant to this exact category is allowed.** Any review that could apply to a different category should be considered irrelevant and must NOT be included.
You are to focus on making only negative reviews.

Category: "{category}"

Instructions:
1. Generate exactly 5 unique reviews.
2. Each review should be short to medium length (1-3 sentences) and feel like it was written by a real customer.
3. Reviews must be **highly specific to this category** — mention details, products, services, or experiences that only a store in this category would provide.
4. Do NOT include content that could apply to any other category.
5. Only generate NEGATIVE reviews.
6. Return ONLY a Python-style list of strings.
7. Do NOT include explanations, numbering, or any text outside the list.

Example output format:

["Review 1 highly specific to the category",
 "Review 2 highly specific to the category",
 "Review 3 highly specific to the category",
 "Review 4 highly specific to the category",
 "Review 5 highly specific to the category"]
"""
    return template.format(category=category)

def generate_indirect_prompt(category):
    """Build the prompt requesting five indirect (hearsay-style) negative reviews.

    Args:
        category: Store category name inserted into the ``Category:`` line.

    Returns:
        The complete prompt text with ``category`` substituted in.
    """
    # The template has a single placeholder, so str.format renders the same
    # text an f-string would.
    template = """
You are a creative assistant tasked with writing **realistic customer reviews** for a store of a specific category. 
These reviews should be **indirect negative reviews**, written as if the reviewer did **not personally experience** the store but is repeating what they heard or read online. Use phrases like "I heard that", "I saw online that", "People say", etc.

Category: "{category}"

Instructions:
1. Generate exactly 5 unique indirect negative reviews.
2. Each review should be short to medium length (1-3 sentences) and feel like a real customer comment.
3. Reviews must be **highly specific to this category** — mention details, products, services, or experiences that only a store in this category would provide.
4. Do NOT include content that could apply to any other category.
5. Return ONLY a Python-style list of strings.
6. Do NOT include explanations, numbering, or any text outside the list.

Example output format:

["I heard that the grooming services at this pet salon are often delayed, and some pets come back unkempt.",
 "I saw online that the museum exhibits were outdated and not very interactive, according to other visitors.",
 "People say that the bakery frequently runs out of popular pastries, which is disappointing.",
 "I read that the café has inconsistent quality in their drinks and long wait times.",
 "I heard that the hardware store has poor customer service and limited stock."]
"""
    return template.format(category=category)

def generate_prompts_grouped(
    categories_path,
    max_group_prompt=100,
):
    """Yield indirect-review prompts in batches.

    Args:
        categories_path: CSV path passed to ``load_categories``.
        max_group_prompt: Batch size at which a batch is emitted.

    Yields:
        Lists of ``(prompt, category)`` tuples. Every batch except possibly the
        last is emitted as soon as it holds ``max_group_prompt`` items; a
        non-empty remainder is emitted at the end.
    """
    pending = []
    for name in load_categories(categories_path):
        pending.append((generate_indirect_prompt(name), name))
        if len(pending) < max_group_prompt:
            continue
        yield pending
        # Start a fresh list so the batch already handed out is not mutated.
        pending = []

    if pending:
        yield pending

def parse_list(s: str, length: int | None = None):
    start = s.find('[')
    end = s.rfind(']')
    
    if start == -1 or end == -1 or start > end:
        return None
    
    list_str = s[start:end+1]
    try:
        extracted_list = ast.literal_eval(list_str)
        
        if not isinstance(extracted_list, list):
            return None
        
        is_valid = all(isinstance(x, str) and len(x.strip()) > 0 for x in extracted_list)
        
        if length is not None and len(extracted_list) != length:
            return None

        if is_valid:
            return extracted_list
        return None
    
    except (ValueError, SyntaxError):
        return None

def reviews_to_csv(output_path, reviews, append=True):
    """Write review rows to a fully-quoted CSV with ``text``/``category`` columns.

    Args:
        output_path: Destination CSV file.
        reviews: Iterable of dicts with the keys ``"text"`` and ``"category"``.
        append: When true, rows are added to the end of an existing file;
            when false, the file is overwritten.

    The header row is written when the file is overwritten and also when
    appending to a file that is new or empty, so the output always starts with
    a header exactly once.
    """
    fieldnames = ["text", "category"]
    mode = "a" if append else "w"

    with open(output_path, mode, newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        # In append mode the stream starts at the end of the file, so position
        # 0 means nothing has been written yet and the header is still needed.
        if not append or f.tell() == 0:
            writer.writeheader()
        writer.writerows(reviews)

def clean_output(prompt: str, output_str: str) -> str:
    """Drop a leading copy of ``prompt`` from ``output_str`` and trim whitespace.

    Args:
        prompt: Text that may be echoed at the very start of the output.
        output_str: Decoded model output.

    Returns:
        ``output_str`` without the leading prompt (if it was there), stripped
        of surrounding whitespace.
    """
    return output_str.removeprefix(prompt).strip()

In [3]:
# Load the tokenizer and the half-precision model onto device 0; the EOS token
# doubles as the padding token for batched tokenisation.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map={"": 0})

# One generate() call per batch of up to 64 prompts; each batch's parsed
# reviews are appended to the output CSV before moving on.
for batch in generate_prompts_grouped(categories_path=categories_path, max_group_prompt=64):
    prompts, categories = (list(column) for column in zip(*batch))
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=256)

    # Both the error counter and the row buffer are per batch.
    errors = 0
    rows_to_write = []
    for prompt, category, output in zip(prompts, categories, outputs):
        decoded = tokenizer.decode(output, skip_special_tokens=True)
        reviews = parse_list(clean_output(prompt, decoded), 5)

        # Outputs that do not parse into exactly five reviews are dropped.
        if reviews is None:
            errors += 1
        else:
            rows_to_write.extend(
                {"text": review, "category": category} for review in reviews
            )

    print(f"Writing {len(rows_to_write)} rows to {indirect_reviews_path}, errors omitted: {errors}")
    reviews_to_csv(indirect_reviews_path, rows_to_write)

    # Release this batch's tensors before the next batch is tokenised.
    del inputs, outputs
    torch.cuda.empty_cache()

print(f"Classification complete. Results saved to {indirect_reviews_path}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 320 rows to ./raw/indirect_reviews.csv, errors omitted: 0
Writing 80 rows to ./raw/indirect_reviews.csv, errors omitted: 0
Classification complete. Results saved to ./raw/indirect_reviews.csv


In [5]:
import pandas as pd
import ast
import random
import csv

def assign_gmap_ids(locations_path, ads_path, indirect_path, irelevant_path,
                    output_ads, output_indirect, output_irelevant):
    locations = pd.read_csv(locations_path)
    locations['category'] = locations['category'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    
    # --- ADS REVIEWS ---
    ads = pd.read_csv(ads_path)
    ads['gmap_id'] = ads.apply(lambda _: random.choice(locations['gmap_id']), axis=1)
    ads.to_csv(output_ads, index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)
    
    # --- INDIRECT REVIEWS ---
    indirect = pd.read_csv(indirect_path)
    def assign_indirect(row):
        candidates = locations[locations['category'].apply(lambda cats: isinstance(cats, list) and row['category'] in cats)]
        if not candidates.empty:
            return random.choice(candidates['gmap_id'].tolist())
        else:
            return None 
    indirect['gmap_id'] = indirect.apply(assign_indirect, axis=1)
    indirect.to_csv(output_indirect, index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)
    
    # --- IRRELEVANT REVIEWS ---
    irelevant = pd.read_csv(irelevant_path)
    def assign_irelevant(row):
        candidates = locations[locations['category'].apply(lambda cats: isinstance(cats, list) and row['category'] not in cats)]
        if not candidates.empty:
            return random.choice(candidates['gmap_id'].tolist())
        else:
            return None
    irelevant['gmap_id'] = irelevant.apply(assign_irelevant, axis=1)
    irelevant.to_csv(output_irelevant, index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)
    
    print("Assignment complete!")

# Every input and output of this step lives in the same processed-data folder.
# NOTE(review): machine-specific absolute path; consider a configurable data directory.
processed_dir = r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\processed"

assign_gmap_ids(
    locations_path=processed_dir + r"\locations.csv",
    ads_path=processed_dir + r"\ads_reviews.csv",
    indirect_path=processed_dir + r"\indirect_reviews.csv",
    irelevant_path=processed_dir + r"\irelevant_reviews.csv",
    output_ads=processed_dir + r"\ads_reviews_assigned.csv",
    output_indirect=processed_dir + r"\indirect_reviews_assigned.csv",
    output_irelevant=processed_dir + r"\irelevant_reviews_assigned.csv",
)


Assignment complete!


In [6]:
# Paths to the _assigned CSVs
assigned_ads = r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\processed\ads_reviews_assigned.csv"
assigned_indirect = r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\processed\indirect_reviews_assigned.csv"
assigned_irelevant = r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\processed\irelevant_reviews_assigned.csv"

# Output paths without "_assigned"
output_ads, output_indirect, output_irelevant = (
    assigned_path.replace("_assigned", "")
    for assigned_path in (assigned_ads, assigned_indirect, assigned_irelevant)
)

# Shared CSV writer settings: no index column, UTF-8, every field quoted.
csv_options = {"index": False, "encoding": "utf-8", "quoting": csv.QUOTE_ALL}

# --- ADS REVIEWS --- (copied as-is)
ads = pd.read_csv(assigned_ads)
ads.to_csv(output_ads, **csv_options)

# --- INDIRECT REVIEWS --- (the "category" column is dropped if present)
indirect = pd.read_csv(assigned_indirect).drop(columns=['category'], errors='ignore')
indirect.to_csv(output_indirect, **csv_options)

# --- IRRELEVANT REVIEWS --- (the "category" column is dropped if present)
irelevant = pd.read_csv(assigned_irelevant).drop(columns=['category'], errors='ignore')
irelevant.to_csv(output_irelevant, **csv_options)

print("CSV files updated successfully!")

CSV files updated successfully!
