In [None]:
import re
import pandas as pd
from tqdm import tqdm

from transformers import pipeline

# Hugging Face checkpoint used for sentiment classification.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"

# Build the classification pipeline once so every later cell reuses it.
# device=0 places the model on the first GPU (this notebook assumes one exists).
classifier = pipeline(
    task="sentiment-analysis",
    model=model_name,
    device=0,
)

# The pipeline output is looked up by its generic LABEL_<n> id; this table
# turns those ids into the readable class names used in the rest of the notebook.
label_map = dict([
    ('LABEL_0', 'NEGATIVE'),
    ('LABEL_1', 'NEUTRAL'),
    ('LABEL_2', 'POSITIVE'),
])

from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

1. **Helper Functions**  
   Several utility functions are defined to support preprocessing and classification:
   - `remove_html(text)`: Strips HTML tags from the input text.
   - `clean_review(text)`: Returns `None` for missing values; otherwise cleans the text with the `remove_html` function.
   - `detect_language(text)`: Detects the language of the review to filter non-English entries.
   - `classify_sentiment(text)`: Uses a pretrained Hugging Face model to classify the sentiment of the review as positive, neutral, or negative.


In [None]:
def remove_html(text):
    """Strip HTML-style tags from ``text`` and normalise its whitespace.

    Every ``<...>`` tag is replaced by a single space so that words on either
    side of a tag do not run together. Runs of whitespace are then collapsed
    to one space and the ends are trimmed.

    Args:
        text: String that may contain HTML markup.

    Returns:
        The tag-free, whitespace-normalised string.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    whitespace_pattern = re.compile(r'\s+')

    without_tags = tag_pattern.sub(' ', text)
    collapsed = whitespace_pattern.sub(' ', without_tags)
    return collapsed.strip()

def clean_review(text):
    """Clean a single review comment.

    Args:
        text: Raw comment, or a missing value (``None``/``NaN``/``pd.NA``).

    Returns:
        ``None`` when ``text`` is missing; otherwise the comment with HTML
        tags removed by ``remove_html``.
    """
    # Missing values are passed through as None rather than being cleaned.
    return None if pd.isna(text) else remove_html(text)

def detect_language(text):
    """Detect the language of ``text`` with langdetect.

    Args:
        text: Review text to inspect.

    Returns:
        The value returned by ``detect(text)``, or the string ``"error"``
        when langdetect raises ``LangDetectException``.
    """
    try:
        language = detect(text)
    except LangDetectException:
        # Use a sentinel so one undetectable review does not abort a whole
        # DataFrame.apply; callers can filter on it.
        language = "error"
    return language


def classify_sentiment(text):
    """Classify the sentiment of a single review.

    The text is truncated by the tokenizer to the model's 512-token limit
    rather than by slicing characters: a character slice throws away most of
    a long review and still does not guarantee the token limit is respected.

    Args:
        text: Review text to classify.

    Returns:
        The readable label from ``label_map`` for the top prediction, or the
        string ``"error"`` if classification fails for any reason (including
        a non-string ``text`` or a label missing from ``label_map``).
    """
    try:
        prediction = classifier(text, truncation=True, max_length=512)[0]
        return label_map[prediction['label']]
    except Exception as e:
        # str() keeps the handler itself from raising when `text` is not a
        # string (e.g. None/NaN), which previously escaped as a TypeError.
        preview = str(text)[:60]
        print(f"Error processing: {preview}... -> {e}")
        return "error"

2. **Preprocessing**  
   Raw review data is loaded and cleaned:
   - Read the CSV file: `data/reviews.csv`.
   - Remove entries with missing comments.
   - Apply `clean_review` to clean the review.
   - Filter out reviews that are not written in English using `detect_language`.

In [None]:
# langdetect is randomised by default, so the same text can be assigned
# different languages on different runs. Pinning the seed makes the English
# filter below keep the same rows on every run.
DetectorFactory.seed = 0

# Load the raw reviews, drop rows without a comment, and strip HTML from the
# remaining comments. Chaining avoids mutating a frame in place, so the cell
# can be re-run safely.
df = (
    pd.read_csv("data/reviews.csv")
    .dropna(subset=['comments'])
    .assign(comments=lambda frame: frame['comments'].apply(clean_review))
)

# Keep English reviews only. .copy() yields an independent frame so that
# columns added in later cells do not trigger SettingWithCopyWarning.
df = df[df['comments'].apply(detect_language) == 'en'].copy()

3. **Sentiment Analysis**  
   The cleaned and filtered English reviews are passed in batches to a Hugging Face transformer pipeline running on GPU to classify each review's sentiment.

In [None]:
# Full review texts; truncation to the model's 512-token limit is done by the
# tokenizer below. Slicing characters would discard most of a long review
# and would not guarantee the token limit anyway.
texts = df['comments'].tolist()
batch_size = 32
results = []

for i in tqdm(range(0, len(texts), batch_size), desc="Classifying"):
    batch = texts[i:i + batch_size]
    try:
        # batch_size must be passed explicitly: without it the pipeline runs
        # a list input one item at a time.
        batch_results = classifier(
            batch,
            batch_size=batch_size,
            truncation=True,
            max_length=512,
        )
        # Labels missing from label_map are recorded as 'unknown'.
        batch_labels = [label_map.get(result['label'], 'unknown') for result in batch_results]
    except Exception as e:
        print(f"Error processing batch {i//batch_size}: {e}")
        # Mark every review in the failed batch so `results` stays aligned
        # with the rows of `df`.
        batch_labels = ['error'] * len(batch)
    results.extend(batch_labels)

# assign() returns a new frame, which avoids SettingWithCopyWarning when `df`
# is a filtered slice of the raw data.
df = df.assign(sentiment=results)

4. **Saving Results**  
   Reviews whose predicted sentiment is not neutral are saved to `data/sentiment_reviews.csv`. The negative and positive reviews are additionally written to their own files (`data/negative_reviews.csv` and `data/positive_reviews.csv`) for further analysis.

In [None]:
# Compute each subset once and reuse it for both the export and the summary.
# NOTE(review): the non-neutral subset also contains rows labelled 'error' or
# 'unknown' if any were produced upstream — confirm that is intended.
non_neutral_reviews = df[df['sentiment'] != 'NEUTRAL']
negative_reviews = df[df['sentiment'] == 'NEGATIVE']
positive_reviews = df[df['sentiment'] == 'POSITIVE']

# The combined file keeps every column; the per-class files keep only the
# comment text and its label.
export_columns = ['comments', 'sentiment']
non_neutral_reviews.to_csv('data/sentiment_reviews.csv', index=False)
negative_reviews[export_columns].to_csv('data/negative_reviews.csv', index=False)
positive_reviews[export_columns].to_csv('data/positive_reviews.csv', index=False)

print("Sentiment classification completed and saved to sentiment_reviews.csv")
print("Negative reviews saved to negative_reviews.csv size:", len(negative_reviews.index))
print("Positive reviews saved to positive_reviews.csv size:", len(positive_reviews.index))