In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import csv
from tqdm import tqdm
import pandas as pd

In [19]:
# 加载文本分析工具
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Load the model and tokenizer
model_name = "tabularisai/multilingual-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)


Using device: cuda


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)

In [20]:
#load dataset
reviews_path = '../city_filter/reviews_Indianapolis.csv'

In [28]:
# predict sentiment
def predict_sentiment_batch(text_list, batch_size=32):
    """
    批量预测文本的情感分数
    :param text_list: 文本列表
    :param batch_size: 批量大小
    :return: 情感分数列表
    """
    all_probs = []

    for i in tqdm(range(0, len(text_list), batch_size)):
        batch = text_list[i:i + batch_size]
        try:
            inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}  # 输入放入 GPU
            with torch.no_grad():
                outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            all_probs.extend(probs.cpu())  # 放回 CPU，方便后续处理
        except Exception as e:
            print(f"Error in batch {i}: {e}")
            all_probs.extend([None] * len(batch))  # 出错时填充 None

    return all_probs

In [31]:
text = ["I love this place!"]
sentiments = predict_sentiment_batch(text)
print(sentiments)

100%|██████████| 1/1 [00:00<00:00, 223.83it/s]

[tensor([0.0193, 0.0216, 0.0845, 0.2901, 0.5844])]





In [32]:
# use slide window to process long text
def slide_window_analysis_batch(text, window_size=512, step_size=256):
    """
    Split long text into overlapping chunks and analyze sentiment using batch GPU processing.
    """
    sentiment_map = {
        0: 'Very Negative',
        1: 'Negative',
        2: 'Neutral',
        3: 'Positive',
        4: 'Very Positive'
    }

    tokens = tokenizer.encode(text)

    # 如果长度小于512，直接分析
    if len(tokens) < window_size:
        probs = predict_sentiment_batch([text])[0]
        if probs is None:
            return None
        sentiment = sentiment_map[int(torch.argmax(probs))]
        return sentiment

    # 滑动窗口切分 chunk
    chunks = []
    for i in range(0, len(tokens), step_size):
        chunk = tokens[i:i + window_size]
        if not chunk:
            break
        chunks.append(tokenizer.decode(chunk))

    # 确保最后一块也在
    if len(tokens) > 0:
        last_chunk = tokenizer.decode(tokens[-window_size:])
        if last_chunk not in chunks:
            chunks.append(last_chunk)

    # 批量分析所有 chunk
    chunk_probs = predict_sentiment_batch(chunks)

    # 过滤掉 None（出错的部分）
    valid_probs = [p for p in chunk_probs if p is not None]
    if not valid_probs:
        return None

    # 平均合并所有 chunk 的预测
    avg_probs = torch.mean(torch.stack(valid_probs), dim=0)
    sentiment = sentiment_map[int(torch.argmax(avg_probs))]
    return sentiment



In [34]:
long_text = ["I've been visiting this restaurant for years, and honestly, it's a bit of a mixed bag. The ambiance is cozy, and the staff are usually friendly, though sometimes service can be a little slow, especially on weekends. I really enjoy the pasta—they make it fresh, and you can taste the difference. Their tiramisu is easily one of the best in town, super creamy and not too sweet. That said, their coffee could use some work—it’s often too bitter or burnt-tasting. Prices have gone up lately, which is understandable, but still noticeable. On my last visit, they accidentally messed up my order, but the manager came over personally to apologize and comped the dish. That level of customer service keeps me coming back. Overall, while it's not perfect, it’s one of the more reliable places around, and I always leave feeling satisfied. Definitely worth a try if you’re in the neighborhood and craving Italian."]
print(slide_window_analysis_batch(long_text[0], window_size=512, step_size=256))

100%|██████████| 1/1 [00:00<00:00, 187.17it/s]

Neutral





In [35]:
try_path = '../yelp_reviews_first100.csv'

In [38]:
# 分析评论
def review_analysis_optimized(reviews_path, output_path, batch_size=32):
    """
    Analyze reviews from a CSV file using GPU-accelerated batch processing and sliding window.
    Each 'text' is replaced with its sentiment label.

    :param reviews_path: str, path to input CSV with reviews
    :param output_path: str, path to save the output CSV with sentiments
    :param batch_size: int, number of reviews to process at once
    """
    import csv

    processed_rows = []
    batch_texts = []
    batch_rows = []

    with open(reviews_path, 'r', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames
        total_lines = sum(1 for _ in open(reviews_path, 'r', encoding='utf-8')) - 1  # skip header

        infile.seek(0)
        reader = csv.DictReader(infile)

        for row in tqdm(reader, total=total_lines, desc="Analyzing"):
            text = row['text'].strip()
            batch_texts.append(text)
            batch_rows.append(row)

            # 满一批就处理
            if len(batch_texts) == batch_size:
                sentiments = [slide_window_analysis_batch(text) for text in batch_texts]
                for r, s in zip(batch_rows, sentiments):
                    r['text'] = s if s is not None else "Unknown"
                    processed_rows.append(r)
                batch_texts, batch_rows = [], []

        # 处理剩下不足一批的
        if batch_texts:
            sentiments = [slide_window_analysis_batch(text) for text in batch_texts]
            for r, s in zip(batch_rows, sentiments):
                r['text'] = s if s is not None else "Unknown"
                processed_rows.append(r)

    # 写出新 CSV
    with open(output_path, 'w', encoding='utf-8', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(processed_rows)

    print(f"情感分析完成，总共处理 {len(processed_rows)} 条记录，结果已保存至: {output_path}")

            

In [40]:
output_path = "reviews_Indianapolis_analyzed.csv"
review_analysis_optimized(reviews_path=reviews_path, output_path=output_path)

100%|██████████| 1/1 [00:00<00:00, 202.04it/s] ?it/s]
100%|██████████| 1/1 [00:00<00:00, 155.97it/s]
100%|██████████| 1/1 [00:00<00:00, 245.15it/s]
100%|██████████| 1/1 [00:00<00:00, 257.87it/s]
100%|██████████| 1/1 [00:00<00:00, 192.63it/s]
100%|██████████| 1/1 [00:00<00:00, 218.72it/s]
100%|██████████| 1/1 [00:00<00:00, 264.34it/s]
100%|██████████| 1/1 [00:00<00:00, 254.60it/s]
100%|██████████| 1/1 [00:00<00:00, 266.93it/s]
100%|██████████| 1/1 [00:00<00:00, 187.94it/s]
100%|██████████| 1/1 [00:00<00:00, 246.83it/s]
100%|██████████| 1/1 [00:00<00:00, 262.21it/s]
100%|██████████| 1/1 [00:00<00:00, 265.93it/s]
100%|██████████| 1/1 [00:00<00:00, 249.29it/s]
100%|██████████| 1/1 [00:00<00:00, 249.08it/s]
100%|██████████| 1/1 [00:00<00:00, 249.57it/s]
100%|██████████| 1/1 [00:00<00:00, 248.10it/s]
100%|██████████| 1/1 [00:00<00:00, 253.42it/s]
100%|██████████| 1/1 [00:00<00:00, 252.32it/s]
100%|██████████| 1/1 [00:00<00:00, 269.40it/s]
100%|██████████| 1/1 [00:00<00:00, 266.73it/s]
100%|█

KeyboardInterrupt: 