In [None]:
import re
import pandas as pd

from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, pipeline

model_name = "cardiffnlp/twitter-roberta-base-sentiment"

# tokenizer = AutoTokenizer.from_pretrained(model_name)
classifier = pipeline("sentiment-analysis", model=model_name)
# config = AutoConfig.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name)

label_map = {
    'LABEL_0': 'NEGATIVE',
    'LABEL_1': 'NEUTRAL',
    'LABEL_2': 'POSITIVE'
}

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'LABEL_2', 'score': 0.9889271259307861}]


In [98]:
def remove_html(text):
    clean = re.sub(r'<[^>]+>', ' ', text)
    return re.sub(r'\s+', ' ', clean).strip()

def clean_review(text):
    if pd.isna(text):
        return None
    text = remove_html(text)
    return text

def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "error"


def classify_sentiment(text):
    try:
        return label_map[classifier(text[:514])[0]['label']]
    except Exception as e:
        print(f"Error processing: {text[:60]}... -> {e}")
        return "error"

* PREPROCESSING

In [81]:
df = pd.read_csv("data/reviews.csv")
df.dropna(subset=['comments'], inplace=True)

df['comments'] = df['comments'].apply(clean_review)

df = df[df['comments'].apply(detect_language) == 'en']

SENTIMENT ANALYSIS

In [99]:
df['sentiment'] = df['comments'].apply(classify_sentiment)

Error processing: 公寓地段非常好，距离最近的地铁站步行五分钟，去City Hall两站地铁，去Galma Stan三站地铁，去Vasa M... -> The expanded size of the tensor (583) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 583].  Tensor sizes: [1, 514]
Error processing: 房間整潔，位置良好，離地鐵真的近，有電梯，附近生活機能還不錯 沒看清楚是我的疏失，對不起，廚房可以使用但是不能用爐子，就... -> The expanded size of the tensor (564) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 564].  Tensor sizes: [1, 514]
Error processing: 저렴한 가격임에도 불구하고 진짜 너무 좋은 방이었어요. 지하철 2분거리에 웬만한 곳들은 지하철 30분 이내로... -> The expanded size of the tensor (517) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 517].  Tensor sizes: [1, 514]
Error processing: Eva is very friendly and humorous. The house was more spacio... -> The expanded size of the tensor (528) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 528].  Tensor sizes: [1, 514]
Error processing: One star I can give. 很糟糕的经历。1如果您是女

In [101]:
df.to_csv("data/sentiment_reviews.csv", index=False)
df[df['sentiment'] == 'NEGATIVE'][['comments', 'sentiment']].to_csv('data/negative_reviews.csv', index=False)
df[df['sentiment'] == 'POSITIVE'][['comments', 'sentiment']].to_csv('data/positive_reviews.csv', index=False)