In [5]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import ast
import re
import torch
import csv
import pandas as pd

location_path = "./raw/meta-other.json"
review_path = "./raw/review-other.json"
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
review_classification_path = "./raw/reviews_classified.csv"

In [2]:
def load_location_dict(path):
    meta = {}
    drop_keys = {"latitude", "longitude", "url"}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            for key in drop_keys:
                obj.pop(key, None)
            meta[obj["gmap_id"]] = obj
    return meta

def load_reviews_dict(path):
    meta = []
    drop_keys = {"user_id", "time"}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            for key in drop_keys:
                obj.pop(key, None)
            meta.append(obj)
    return meta

def load_reviews_gen(path, start=0, end=1000):
    drop_keys = {"user_id", "time"}
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i < start:
                continue
            if end is not None and i >= end:
                break
            obj = json.loads(line)
            for key in drop_keys:
                obj.pop(key, None)
            yield obj

def load_reviews_grouped(path, start=0, end=1000, max_group=5):
    drop_keys = {"user_id", "time"}
    current_group = []
    current_gmap_id = None

    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i < start:
                continue
            if end is not None and i >= end:
                break

            obj = json.loads(line)
            for key in drop_keys:
                obj.pop(key, None)

            gmap_id = obj.get("gmap_id")

            if gmap_id != current_gmap_id:
                if current_group:
                    yield current_group
                current_group = [obj]
                current_gmap_id = gmap_id
            else:
                current_group.append(obj)
                if len(current_group) >= max_group:
                    yield current_group
                    current_group = []
                    current_gmap_id = None

        if current_group:
            yield current_group

In [3]:
def generate_prompt(reviews, location):
    if isinstance(reviews, dict):
        reviews = [reviews]
    review_texts = "\n\n".join(
        f"Review {i+1}:\n\"\"\"{r.get('text', '')}\"\"\"" for i, r in enumerate(reviews)
    )
    location_context = str(location)

    prompt = f"""
You are a content moderation assistant. Your job is to determine, for each review provided, 
whether it is relevant to the given location and complies with the policies listed below.

Here are the reviews:

{review_texts}

Location context:

\"\"\"{location_context}\"\"\"

Policies to enforce:
1. No advertisements or promotional content.
2. No irrelevant content (e.g., reviews about unrelated topics).
3. No rants or complaints from users who have not visited the place.

Instructions:
- Evaluate each review independently.
- If a review is relevant and complies with all policies, assign 1.
- If a review is irrelevant or violates any policy, assign 0.
- ONLY return a single Python-style list of integers, one element per review, in the SAME ORDER as the reviews.
- Do not include any text, explanations, or punctuation outside of the list.

Example output format (for four reviews):

[1, 0, 1, 1]
"""

    return prompt

def generate_prompts_grouped(
    review_path, 
    location_path, 
    start=0, 
    end=1000, 
    max_group_prompt=100, 
    max_group_review=5
):
    location_dict = load_location_dict(location_path)

    batch = []
    for group in load_reviews_grouped(review_path, start, end, max_group_review):
        gmap_id = group[0]["gmap_id"]
        location = location_dict.get(gmap_id, None)
        if location:
            prompt = generate_prompt(group, location)
            batch.append((prompt, group))
            if len(batch) >= max_group_prompt:
                yield batch
                batch = []
        else:
            continue

    if batch:
        yield batch

def parse_list(s: str, length: int | None = None):
    match = re.search(r'\[.*\]', s, re.DOTALL)
    if not match:
        return None
    
    list_str = match.group(0)
    try:
        extracted_list = ast.literal_eval(list_str)
        
        if not isinstance(extracted_list, list):
            return None
        
        is_valid = all(isinstance(x, (int, float)) and x in (0, 1) for x in extracted_list)
        
        if length is not None and len(extracted_list) != length:
            return None

        if is_valid:
            return [float(x) for x in extracted_list]
        return None
    
    except (ValueError, SyntaxError):
        return None

def reviews_to_csv(output_path, reviews, append=True):
    fieldnames = [
        "name", "rating", "text",
        "resp", "gmap_id", "label"
    ]
    mode = "a" if append else "w"

    with open(output_path, mode, newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        if not append:  
            writer.writeheader()
        for review in reviews:
            writer.writerow(review)

def clean_output(prompt: str, output_str: str) -> str:
    if output_str.startswith(prompt):
        return output_str[len(prompt):].strip()
    return output_str.strip()

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map={"": 0})

for batch in generate_prompts_grouped(
    review_path=review_path,
    location_path=location_path,
    start=0,
    end=50000,
    max_group_prompt=16,
    max_group_review=5
):
    prompts = [prompt for prompt, _ in batch]
    groups = [group for _, group in batch]
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=48)

    errors = 0
    rows_to_write = []
    for prompt_idx, output in enumerate(outputs):
        output_str = tokenizer.decode(output, skip_special_tokens=True)
        output_str = clean_output(prompts[prompt_idx], output_str)
        labels = parse_list(output_str, len(groups[prompt_idx]))

        if labels is None:
            errors += 1
            continue 

        for review, label in zip(groups[prompt_idx], labels):
            rows_to_write.append({
                "name": review.get("name", ""),
                "rating": review.get("rating", ""),
                "text": review.get("text", ""),
                "resp": review.get("resp", ""),
                "gmap_id": review.get("gmap_id", ""),
                "label": label,
            })

    print(f"Writing {len(rows_to_write)} rows to {review_classification_path}, errors omitted: {errors}")
    reviews_to_csv(review_classification_path, rows_to_write)

    del inputs, outputs
    torch.cuda.empty_cache()

print(f"Classification complete. Results saved to {review_classification_path}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 63 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 67 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 64 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 69 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 58 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 64 rows to ./raw/reviews_classified.csv, errors omitted: 1


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 65 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 78 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 73 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 71 rows to ./raw/reviews_classified.csv, errors omitted: 1


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 76 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 66 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 75 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 63 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 68 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 74 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 70 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 68 rows to ./raw/reviews_classified.csv, errors omitted: 1


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 67 rows to ./raw/reviews_classified.csv, errors omitted: 1


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 78 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 78 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 75 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 73 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 78 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 72 rows to ./raw/reviews_classified.csv, errors omitted: 1


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 63 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 79 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 77 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 80 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 72 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 74 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 73 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 59 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 77 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 78 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 80 rows to ./raw/reviews_classified.csv, errors omitted: 0


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Writing 79 rows to ./raw/reviews_classified.csv, errors omitted: 0


In [9]:
df = pd.read_csv(review_classification_path)

nan_count = df["text"].isna().sum()
print(f"Number of NaN 'text' rows: {nan_count}")

df = df.dropna(subset=["text"])
count_1 = (df["label"] == 1.0).sum()
count_0 = (df["label"] == 0.0).sum()

print(f"Number of reviews labeled 1: {count_1}")
print(f"Number of reviews labeled 0: {count_0}")

Number of NaN 'text' rows: 13209
Number of reviews labeled 1: 26822
Number of reviews labeled 0: 2686


In [11]:
positive_df = df[df["label"] == 1.0][["text", "gmap_id"]]
positive_df.to_csv(
    "positive_reviews.csv",
    index=False,
    encoding="utf-8",
    quoting=csv.QUOTE_ALL
)