In [1]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import ast
import re
import torch
import csv
import pandas as pd
from faker import Faker
import random

# CSV of store categories; load_categories() reads its "category" column.
categories_path = "./helper_data/categories.csv"
# Hugging Face model id passed to AutoTokenizer / AutoModelForCausalLM.from_pretrained.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# Raw output CSVs, one per kind of synthetic text.
# NOTE(review): "irelevant" is misspelled in the file name; the later cells read the same
# spelling from disk, so renaming it here alone would break them.
# NOTE(review): only ads_reviews_path is referenced by the generation cell visible in this
# notebook; the other two paths look like leftovers from earlier runs — confirm before removing.
irrelevant_reviews_path = "./raw/irelevant_reviews.csv"
indirect_reviews_path = "./raw/indirect_reviews.csv"
ads_reviews_path = "./raw/ads_reviews.csv"

In [2]:
def load_categories(path):
    """Return the values of the ``category`` column of the CSV at ``path`` as a list.

    Raises ``KeyError`` when the file has no ``category`` column.
    """
    category_column = pd.read_csv(path)["category"]
    return category_column.to_list()

# Shared Faker instance; every prompt builder below calls fake.name() to put a different
# persona name at the top of the prompt. No seed is set in the visible code, so the names
# (and therefore the exact prompts) differ between runs.
fake = Faker()

def generate_specific_positive_prompt(category):
    """Build the prompt that asks for five POSITIVE, category-specific customer reviews.

    A random persona name from the module-level ``fake`` is interpolated into the first
    line, and ``category`` is interpolated into the ``Category:`` line. The prompt asks the
    model to answer with a Python-style list of five strings.
    """
    persona = fake.name()
    return f"""
You are, {persona}, a creative assistant tasked with writing **realistic customer reviews** for a store of a specific category. 
**Only content relevant to this exact category is allowed.** Any review that could apply to a different category should be considered irrelevant and must NOT be included.
You are to focus on making only POSITIVE reviews.

Category: "{category}"

Instructions:
1. Generate exactly 5 unique reviews.
2. Each review should be short to medium length (1-3 sentences) and feel like it was written by a real customer.
3. Reviews must be **highly specific to this category** — mention details, products, services, or experiences that only a store in this category would provide.
4. Do NOT include content that could apply to any other category.
5. Only generate POSITIVE reviews.
6. Invent realistic store names, products, or services if needed. Do NOT use generic placeholders like "XYZ company".
7. Return ONLY a Python-style list of strings.
8. Do NOT include explanations, numbering, or any text outside the list.

Example output format:

[
    "<Review text 1>",
    "<Review text 2>",
    "<Review text 3>",
    "<Review text 4>",
    "<Review text 5>"
]
"""

def generate_specific_negative_prompt(category):
    """Build the prompt that asks for five NEGATIVE, category-specific customer reviews.

    A random persona name from the module-level ``fake`` is interpolated into the first
    line, and ``category`` is interpolated into the ``Category:`` line. The prompt asks the
    model to answer with a Python-style list of five strings.
    """
    persona = fake.name()
    return f"""
You are, {persona}, a creative assistant tasked with writing **realistic customer reviews** for a store of a specific category. 
**Only content relevant to this exact category is allowed.** Any review that could apply to a different category should be considered irrelevant and must NOT be included.
You are to focus on making only negative reviews.

Category: "{category}"

Instructions:
1. Generate exactly 5 unique reviews.
2. Each review should be short to medium length (1-3 sentences) and feel like it was written by a real customer.
3. Reviews must be **highly specific to this category** — mention details, products, services, or experiences that only a store in this category would provide.
4. Do NOT include content that could apply to any other category.
5. Only generate NEGATIVE reviews.
6. Invent realistic store names, products, or services if needed. Do NOT use generic placeholders like "XYZ company".
7. Return ONLY a Python-style list of strings.
8. Do NOT include explanations, numbering, or any text outside the list.

Example output format:

[
    "<Review text 1>",
    "<Review text 2>",
    "<Review text 3>",
    "<Review text 4>",
    "<Review text 5>"
]
"""

def generate_indirect_positive_prompt(category):
    """Build the prompt that asks for five indirect (second-hand) positive reviews.

    A random persona name from the module-level ``fake`` is interpolated into the first
    line, and ``category`` is interpolated into the ``Category:`` line. The prompt asks for
    store-name-free reviews returned as a Python-style list of five strings.
    """
    persona = fake.name()
    return f"""
You are, {persona}, a creative assistant tasked with writing **realistic customer reviews** for a store of a specific category. 
The reviews should be indirect, written as if the reviewer is reporting what they heard from others, read online, or observed second-hand. Encourage creative indirect phrasing rather than repeating specific examples.

Category: "{category}"

Instructions:
1. Generate exactly 5 unique indirect positive reviews.
2. Each review should be short to medium length (1-3 sentences) and feel like a real customer comment.
3. Reviews must be **highly specific to this category** — mention details, products, services, or experiences that only a store in this category would provide.
4. Do NOT include content that could apply to any other category.
5. Do not use any store names and make it general so that it can be applied to any store in the category.
6. Return ONLY a Python-style list of strings.
7. Do NOT include explanations, numbering, or any text outside the list.
8. Use your own natural expressions to indicate that the information is second-hand or from other sources. Do not copy fixed phrases. Make each review unique in wording and tone.

Example output format:

[
    "<Review text 1>",
    "<Review text 2>",
    "<Review text 3>",
    "<Review text 4>",
    "<Review text 5>"
]
"""

def generate_indirect_negative_prompt(category):
    """Build the prompt that asks for five indirect (second-hand) negative reviews.

    A random persona name from the module-level ``fake`` is interpolated into the first
    line, and ``category`` is interpolated into the ``Category:`` line. The prompt asks for
    store-name-free reviews returned as a Python-style list of five strings.
    """
    persona = fake.name()
    return f"""
You are, {persona}, a creative assistant tasked with writing **realistic customer reviews** for a store of a specific category. 
The reviews should be indirect, written as if the reviewer is reporting what they heard from others, read online, or observed second-hand. Encourage creative indirect phrasing rather than repeating specific examples.

Category: "{category}"

Instructions:
1. Generate exactly 5 unique indirect negative reviews.
2. Each review should be short to medium length (1-3 sentences) and feel like a real customer comment.
3. Reviews must be **highly specific to this category** — mention details, products, services, or experiences that only a store in this category would provide.
4. Do NOT include content that could apply to any other category.
5. Do not use any store names and make it general so that it can be applied to any store in the category.
6. Return ONLY a Python-style list of strings.
7. Do NOT include explanations, numbering, or any text outside the list.
8. Use your own natural expressions to indicate that the information is second-hand or from other sources. Do not copy fixed phrases. Make each review unique in wording and tone.

Example output format:

[
    "<Review text 1>",
    "<Review text 2>",
    "<Review text 3>",
    "<Review text 4>",
    "<Review text 5>"
]
"""

def generate_advert_service_prompt(category):
    """Build the prompt that asks for five location/service-based promotional ads.

    A random persona name from the module-level ``fake`` and the given ``category`` are
    both interpolated into the first line. The prompt asks the model to answer with a
    Python-style list of five strings.
    """
    persona = fake.name()
    return f"""
You are {persona}, a creative assistant tasked with writing **positive advertisements** for other stores or websites inspired by a store in the category: "{category}". 
These ads should be **directly promotional**, highlighting features, products, or services that make the store appealing to potential customers.

Instructions:
1. Generate exactly 5 unique positive ads.
2. Each ad should be short to medium length (1-3 sentences) and feel like a real marketing message.
3. Ads must be **highly specific to this category** — mention products, services, store features, or website offerings that only a business in this category would provide.
4. Invent realistic store names, products, or services if needed. Do NOT use generic placeholders like "XYZ company".
5. Return ONLY a Python-style list of strings.
6. Do NOT include explanations, numbering, or any text outside the list.
7. Ads should be location/service based.

Example output format:

[
    "<Review text 1>",
    "<Review text 2>",
    "<Review text 3>",
    "<Review text 4>",
    "<Review text 5>"
]
"""

def generate_advert_website_prompt(category):
    """Build the prompt that asks for five website-based promotional ads.

    A random persona name from the module-level ``fake`` and the given ``category`` are
    both interpolated into the first line. The prompt asks the model to answer with a
    Python-style list of five strings.
    """
    persona = fake.name()
    return f"""
You are {persona}, a creative assistant tasked with writing **positive advertisements** for other stores or websites inspired by a store in the category: "{category}". 
These ads should be **directly promotional**, highlighting features, products, or services that make the store appealing to potential customers.

Instructions:
1. Generate exactly 5 unique positive ads.
2. Each ad should be short to medium length (1-3 sentences) and feel like a real marketing message.
3. Ads must be **highly specific to this category** — mention products, services, store features, or website offerings that only a business in this category would provide.
4. Invent realistic store names, products, or services if needed. Do NOT use generic placeholders like "XYZ company".
5. Return ONLY a Python-style list of strings.
6. Do NOT include explanations, numbering, or any text outside the list.
7. Ads should be website based.

Example output format:

[
    "<Review text 1>",
    "<Review text 2>",
    "<Review text 3>",
    "<Review text 4>",
    "<Review text 5>"
]
"""

def generate_prompts_grouped(
    categories_path, 
    max_group_prompt=100,
):
    """Yield batches of ``(prompt, category)`` tuples for the advert prompts.

    The categories are read once with ``load_categories``. One service-based advert
    prompt is produced for every category first, then one website-based advert prompt
    for every category.

    Args:
        categories_path: CSV path handed to ``load_categories``.
        max_group_prompt: maximum number of prompts per yielded batch.

    Yields:
        Lists of at most ``max_group_prompt`` ``(prompt, category)`` tuples. The last
        batch of each pass may be shorter. Every prompt appears in exactly one batch.
    """
    categories_list = load_categories(categories_path)
    prompt_builders = (generate_advert_service_prompt, generate_advert_website_prompt)

    for build_prompt in prompt_builders:
        # A fresh list per pass: the previous version kept appending to the list it had
        # just yielded as the left-over service batch, so those prompts were generated
        # (and written to the CSV) a second time together with the first website prompts.
        batch = []
        for category in categories_list:
            batch.append((build_prompt(category), category))
            if len(batch) >= max_group_prompt:
                yield batch
                batch = []

        if batch:
            yield batch

def parse_list(s: str, length: int | None = None):
    start = s.find('[')
    end = s.rfind(']')
    
    if start == -1 or end == -1 or start > end:
        return None
    
    list_str = s[start:end+1]
    try:
        extracted_list = ast.literal_eval(list_str)
        
        if not isinstance(extracted_list, list):
            return None
        
        is_valid = all(isinstance(x, str) and len(x.strip()) > 0 for x in extracted_list)
        
        if length is not None and len(extracted_list) != length:
            return None

        if is_valid:
            return extracted_list
        return None
    
    except (ValueError, SyntaxError):
        return None

def reviews_to_csv(output_path, reviews, append=True):
    """Write review rows to a fully quoted CSV with the columns ``text`` and ``category``.

    Args:
        output_path: destination CSV file.
        reviews: iterable of dicts with the keys ``text`` and ``category``.
        append: when true (default) rows are appended to an existing file; otherwise
            the file is overwritten.

    The header row is written whenever the file is empty after opening, i.e. always when
    overwriting and also when appending to a new or empty file. Previously append mode
    never wrote a header, so a file created purely by appends had its first review read
    as column names by ``pd.read_csv``.
    """
    fieldnames = ["text", "category"]
    mode = "a" if append else "w"

    with open(output_path, mode, newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        # In append mode the stream is positioned at the end of the file, so a position
        # of 0 means there is no content (and therefore no header) yet.
        if f.tell() == 0:
            writer.writeheader()
        writer.writerows(reviews)

def clean_output(prompt: str, output_str: str) -> str:
    """Drop a leading copy of ``prompt`` from ``output_str`` and trim the whitespace.

    When ``output_str`` does not begin with ``prompt`` it is only stripped.
    """
    return output_str.removeprefix(prompt).strip()

In [3]:
# Load the tokenizer and model once; the model has no dedicated pad token, so EOS is reused.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
# Decoder-only models continue from the last position of each row, so batched prompts
# must be padded on the left; set it explicitly instead of relying on the checkpoint default.
tokenizer.padding_side = "left"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map={"": 0})

for batch in generate_prompts_grouped(
    categories_path=categories_path,
    max_group_prompt=64
):
    prompts = [prompt for prompt, _ in batch]
    categories = [category for _, category in batch]
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
    outputs = model.generate(**inputs, 
        max_new_tokens=384,
        pad_token_id=tokenizer.eos_token_id
    )

    # generate() returns the (padded) prompt tokens followed by the new tokens. Cutting at
    # the prompt length keeps only the completion and does not depend on the decoded text
    # reproducing the prompt character for character.
    prompt_token_count = inputs["input_ids"].shape[1]

    errors = 0
    rows_to_write = []
    for prompt_idx, output in enumerate(outputs):
        output_str = tokenizer.decode(output[prompt_token_count:], skip_special_tokens=True)
        # Still strips whitespace (and a prompt echo, should the model repeat it verbatim).
        output_str = clean_output(prompts[prompt_idx], output_str)
        reviews = parse_list(output_str, 5)

        # Outputs that do not parse into exactly five strings are counted and skipped.
        if reviews is None:
            errors += 1
            continue 

        for review in reviews:
            rows_to_write.append({
                "text": review,
                "category": categories[prompt_idx],
            })

    print(f"Writing {len(rows_to_write)} rows to {ads_reviews_path}, errors omitted: {errors}")
    reviews_to_csv(ads_reviews_path, rows_to_write)

    # Release the batch tensors before the next batch to keep GPU memory flat.
    del inputs, outputs
    torch.cuda.empty_cache()

# This cell generates text; the previous message called it "Classification".
print(f"Generation complete. Results saved to {ads_reviews_path}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Writing 285 rows to ./raw/ads_reviews.csv, errors omitted: 7
Writing 280 rows to ./raw/ads_reviews.csv, errors omitted: 8
Writing 305 rows to ./raw/ads_reviews.csv, errors omitted: 3
Writing 285 rows to ./raw/ads_reviews.csv, errors omitted: 7
Writing 310 rows to ./raw/ads_reviews.csv, errors omitted: 2
Writing 285 rows to ./raw/ads_reviews.csv, errors omitted: 7
Writing 285 rows to ./raw/ads_reviews.csv, errors omitted: 7
Writing 295 rows to ./raw/ads_reviews.csv, errors omitted: 5
Writing 290 rows to ./raw/ads_reviews.csv, errors omitted: 6
Writing 300 rows to ./raw/ads_reviews.csv, errors omitted: 4
Writing 305 rows to ./raw/ads_reviews.csv, errors omitted: 3
Writing 295 rows to ./raw/ads_reviews.csv, errors omitted: 5
Writing 290 rows to ./raw/ads_reviews.csv, errors omitted: 6
Writing 285 rows to ./raw/ads_reviews.csv, errors omitted: 7
Writing 70 rows to ./raw/ads_reviews.csv, errors omitted: 2
Writing 305 rows to ./raw/ads_reviews.csv, errors omitted: 3
Writing 315 rows to ./raw

In [4]:
import pandas as pd
import ast
import random
import csv

def assign_gmap_ids(locations_path, ads_path, indirect_path, irelevant_path,
                    output_ads, output_indirect, output_irelevant):
    """Attach a randomly chosen ``gmap_id`` to every synthetic review and save the results.

    Args:
        locations_path: CSV with a ``gmap_id`` column and a ``category`` column whose
            string cells are Python list literals (parsed with ``ast.literal_eval``).
        ads_path: CSV of ad texts; each row gets a ``gmap_id`` drawn from all locations.
        indirect_path: CSV with a ``category`` column; each row gets the ``gmap_id`` of a
            location whose category list contains the row's category.
        irelevant_path: CSV with a ``category`` column; each row gets the ``gmap_id`` of a
            location whose category list does NOT contain the row's category.
        output_ads, output_indirect, output_irelevant: destination CSV paths.

    Rows without any eligible location get an empty ``gmap_id``. Locations whose category
    cell is not a list are never eligible for the indirect/irrelevant assignment.
    Selection uses the global ``random`` state. A malformed category literal in the
    locations file raises from ``ast.literal_eval``.
    """
    locations = pd.read_csv(locations_path)
    locations['category'] = locations['category'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    # Plain Python lists: random.choice on a Series only works through label lookup on a
    # default RangeIndex, and per-row DataFrame filtering rescans every location.
    all_gmap_ids = locations['gmap_id'].tolist()
    categorised_locations = [
        (gmap_id, cats)
        for gmap_id, cats in zip(all_gmap_ids, locations['category'])
        if isinstance(cats, list)
    ]
    candidate_cache = {}

    def pick_gmap_id(category, want_match):
        """Pick a random location id whose category membership equals ``want_match``."""
        key = (category, want_match)
        if key not in candidate_cache:
            # Candidates keep the order of the locations file, as the row filter did.
            candidate_cache[key] = [
                gmap_id
                for gmap_id, cats in categorised_locations
                if (category in cats) == want_match
            ]
        candidates = candidate_cache[key]
        return random.choice(candidates) if candidates else None

    # --- ADS REVIEWS ---
    ads = pd.read_csv(ads_path)
    ads['gmap_id'] = [random.choice(all_gmap_ids) for _ in range(len(ads))]
    ads.to_csv(output_ads, index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)

    # --- INDIRECT REVIEWS ---
    indirect = pd.read_csv(indirect_path)
    indirect['gmap_id'] = [pick_gmap_id(category, True) for category in indirect['category']]
    indirect.to_csv(output_indirect, index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)

    # --- IRRELEVANT REVIEWS ---
    irelevant = pd.read_csv(irelevant_path)
    irelevant['gmap_id'] = [pick_gmap_id(category, False) for category in irelevant['category']]
    irelevant.to_csv(output_irelevant, index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)

    print("Assignment complete!")

# Run the assignment: read the location table and the three raw review CSVs, and write
# the gmap_id-augmented copies into the "processed" folder.
# NOTE(review): every path is an absolute path on one developer's Windows machine, so the
# cell only runs there as written. The first cell addresses the same raw files relatively
# ("./raw/..."); presumably these should be derived from a shared base directory — confirm
# the intended working directory before changing them.
assign_gmap_ids(
    locations_path=r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\processed\locations.csv",
    ads_path=r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\raw\ads_reviews.csv",
    indirect_path=r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\raw\indirect_reviews.csv",
    irelevant_path=r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\raw\irelevant_reviews.csv",
    output_ads=r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\processed\ads_reviews.csv",
    output_indirect=r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\processed\indirect_reviews.csv",
    output_irelevant=r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\processed\irelevant_reviews.csv"
)


Assignment complete!


In [5]:
# Processed CSVs (with gmap_id assigned) that are rewritten in place below.
assigned_ads = r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\processed\ads_reviews.csv"
assigned_indirect = r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\processed\indirect_reviews.csv"
assigned_irelevant = r"C:\Users\ian\Desktop\Coding\ReviewClassification\model\data\processed\irelevant_reviews.csv"


def _rewrite_csv_in_place(csv_path, columns_to_drop=None):
    """Read ``csv_path``, drop ``columns_to_drop`` where present, write it back, return the frame."""
    frame = pd.read_csv(csv_path)
    if columns_to_drop:
        # errors='ignore' makes re-running the cell harmless once the column is gone.
        frame = frame.drop(columns=columns_to_drop, errors='ignore')
    frame.to_csv(csv_path, index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)
    return frame


# Ads keep every column (the file is only re-saved); the indirect and irrelevant review
# files lose their 'category' column.
ads = _rewrite_csv_in_place(assigned_ads)
indirect = _rewrite_csv_in_place(assigned_indirect, columns_to_drop=['category'])
irelevant = _rewrite_csv_in_place(assigned_irelevant, columns_to_drop=['category'])

print("CSV files updated successfully!")

CSV files updated successfully!
