In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import csv
from tqdm import tqdm
import pandas as pd

In [2]:
# Load the text-analysis tools: the pretrained tokenizer and the
# sequence-classification model identified by model_name.
# Both objects are module-level globals read by predict_sentiment.
model_name = "tabularisai/multilingual-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


In [3]:
# Location of the reviews CSV to analyze. Nothing is loaded here; the file is
# opened later, when this path is passed to review_analysis.
reviews_path = '../city_filter/reviews_Indianapolis.csv'

In [4]:
# predict sentiment
def predict_sentiment(text):
    """
    Score text with the pre-trained sentiment classifier.
    params:
        text: the input handed to the tokenizer (a string, or a batch of
              strings); anything beyond 512 tokens is truncated
    returns:
        probabilities: softmax of the model logits over the last dimension,
                       ordered [Very Negative, Negative, Neutral, Positive, Very Positive]
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        logits = model(**encoded).logits
    return torch.softmax(logits, dim=-1)

In [5]:
# Smoke test: score one short sentence and print its class probabilities.
text = "I love this place!"
sentiments = predict_sentiment(text)
print(sentiments)

tensor([[0.0193, 0.0216, 0.0845, 0.2901, 0.5844]])


In [6]:
# use a sliding window to process long text
def _window_starts(n_tokens, window, step):
    """Return the start offsets of the windows to analyze.

    Windows begin every `step` tokens. The final window is placed flush with
    the end of the sequence, so every window is exactly `window` tokens long
    and the tail of the text is not repeated in extra, shorter windows.
    Expects n_tokens > window and step > 0.
    """
    last_start = n_tokens - window
    starts = list(range(0, last_start, step))
    starts.append(last_start)
    return starts


def slide_window_analysis(text, window_size=512, step_size=256):
    """
    Classify the sentiment of a text of any length.

    Text that fits into one model input is scored directly. Longer text is
    split into overlapping token windows, each window is scored with
    predict_sentiment, and the class probabilities are averaged.
    params:
        text: string to analyze
        window_size: maximum tokens per window, counting the special tokens
                     the tokenizer adds around each input (capped at 512)
        step_size: stride in tokens between the starts of consecutive
                   windows; neighbouring windows overlap by roughly
                   window_size - step_size tokens. A stride larger than the
                   window leaves tokens between windows unanalyzed.
    returns:
        sentiment: sentiment label
    raises:
        ValueError: for long text, if step_size is not positive or
                    window_size leaves no room for content tokens
    """
    sentiment_map = {
        0: 'Very Negative',
        1: 'Negative',
        2: 'Neutral',
        3: 'Positive',
        4: 'Very Positive'
    }
    # Must stay equal to the max_length predict_sentiment passes to the tokenizer.
    model_max_tokens = 512

    # Tokenize WITHOUT special tokens. predict_sentiment re-tokenizes every
    # chunk and adds them itself; if they were kept here they would be decoded
    # into the chunk text and counted twice, pushing full windows past the limit.
    n_special = tokenizer.num_special_tokens_to_add()
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Short text: fits into a single model input, analyze directly.
    if len(tokens) + n_special <= model_max_tokens:
        probabilities = predict_sentiment(text)
        return sentiment_map[int(torch.argmax(probabilities))]

    # Long text: sliding-window analysis.
    if step_size <= 0:
        raise ValueError("step_size must be a positive integer")
    # Content tokens per window, leaving room for the special tokens.
    window = min(window_size, model_max_tokens) - n_special
    if window <= 0:
        raise ValueError("window_size is too small to hold any content tokens")

    # Score each window separately.
    sentiment_probabilities = [
        predict_sentiment(tokenizer.decode(tokens[start:start + window]))
        for start in _window_starts(len(tokens), window, step_size)
    ]

    # Combine the windows by averaging their probabilities, then pick the top class.
    mean_probabilities = torch.mean(torch.stack(sentiment_probabilities), dim=0)
    return sentiment_map[int(torch.argmax(mean_probabilities))]


In [10]:
# Demo: classify a multi-sentence review that mixes praise and complaints.
# NOTE(review): this sample is probably well under 512 tokens, in which case it
# exercises the direct path of slide_window_analysis rather than the windowing — confirm.
long_text = ["I've been visiting this restaurant for years, and honestly, it's a bit of a mixed bag. The ambiance is cozy, and the staff are usually friendly, though sometimes service can be a little slow, especially on weekends. I really enjoy the pasta—they make it fresh, and you can taste the difference. Their tiramisu is easily one of the best in town, super creamy and not too sweet. That said, their coffee could use some work—it’s often too bitter or burnt-tasting. Prices have gone up lately, which is understandable, but still noticeable. On my last visit, they accidentally messed up my order, but the manager came over personally to apologize and comped the dish. That level of customer service keeps me coming back. Overall, while it's not perfect, it’s one of the more reliable places around, and I always leave feeling satisfied. Definitely worth a try if you’re in the neighborhood and craving Italian."]
print(slide_window_analysis(long_text[0], window_size=512, step_size=256))

Neutral


In [11]:
# Path to an alternative reviews CSV (presumably a 100-row sample, judging by
# the file name). Not referenced by any other cell visible in this notebook.
try_path = '../yelp_reviews_first100.csv'

In [7]:
# Analyze reviews
def _count_csv_rows(path):
    """Return the number of data rows (header excluded) in a CSV file.

    Rows are counted with the csv parser rather than by line, so a quoted
    field that contains line breaks still counts as one row.
    """
    with open(path, 'r', encoding='utf-8', newline='') as handle:
        return sum(1 for _ in csv.DictReader(handle))


def review_analysis(reviews_path, output_path):
    """
    Analyze reviews from a CSV file and save the results to a new CSV file.
    Each 'text' is replaced with its sentiment label; all other columns are
    copied through unchanged.

    Rows are written as soon as they are labeled, to '<output_path>.partial',
    which is renamed to output_path once every row has been processed. If the
    run fails part-way, the rows finished so far remain in the .partial file.

    :param reviews_path: str, path to input CSV with reviews
    :param output_path: str, path to save the output CSV with sentiments
    :raises ValueError: if the input file is empty or has no header row
    :raises KeyError: if a row has no 'text' column
    """
    import os  # local import: only needed for the final rename

    # Size the progress bar from the actual file instead of a fixed number.
    total_rows = _count_csv_rows(reviews_path)
    partial_path = f"{output_path}.partial"

    with open(reviews_path, 'r', encoding='utf-8', newline='') as infile:
        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames
        if fieldnames is None:
            raise ValueError(f"{reviews_path} is empty or has no header row")

        # Stream to a side file so the input is never clobbered mid-run, even
        # when output_path and reviews_path are the same file.
        with open(partial_path, 'w', encoding='utf-8', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
            for row in tqdm(reader, total=total_rows):
                # Replace the original text with its sentiment label.
                row['text'] = slide_window_analysis(row['text'])
                writer.writerow(row)

    os.replace(partial_path, output_path)

    print(f"情感分析完成，结果已保存至: {output_path}")
            

In [None]:
# Run the sentiment analysis over the reviews CSV at reviews_path and write
# the labeled copy to output_path (relative to the current working directory).
output_path = "reviews_Indianapolis_analyzed.csv"
review_analysis(reviews_path=reviews_path,output_path=output_path)