# Sentiment Analysis Using RoBERTa
Load the necessary packages

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import torch
from collections import defaultdict
import pickle
import os
import datetime
from tqdm import tqdm
from collections import defaultdict
import tensorflow as tf

The next cell loads the model and picks the device: it runs the analysis on the GPU when CUDA is available and falls back to the CPU otherwise. I have CUDA installed on my machine, but I had to run this script from the command line, inside its own conda environment; that was the only way to get the GPU to do this task successfully. Running on the CPU takes much longer.

In [2]:
# RoBERTa: checkpoint identifier handed to both `from_pretrained` calls below.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Use the first CUDA device when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# Move the model onto the selected device.
model = model.to(device)

cpu


This reads in the data, loading only the necessary columns. I also initialize some variables here.

In [3]:
# Load only the two columns the analysis reads: the video id and its captions.
file_path_df = r'new_data.feather'
columns_to_read = ['videoId', 'caption_text']
df = pd.read_feather(file_path_df, columns=columns_to_read)

# Shared state updated by process_rows:
#   video_sentiment_scores - per-video list of results, keyed by video id
#   processed_count        - number of rows processed without an error
video_sentiment_scores = defaultdict(list)
processed_count = 0

This defines the function that is used to collect the sentiment for each chunk.

In [4]:
def chunk_text_to_window_size_and_predict_proba(input_ids, attention_mask, total_len, model, window_length=510):
    """Run ``model`` over a token sequence in consecutive, non-overlapping windows.

    The sequence is cut into windows of at most ``window_length`` content
    tokens. Every window is wrapped with the model's own start/end special
    tokens before it is passed to the model, so each forward pass sees a
    complete, correctly delimited sequence and no content token is dropped.

    Args:
        input_ids: Token ids of the text, WITHOUT special tokens.
        attention_mask: Attention-mask values aligned with ``input_ids``.
        total_len: Number of leading tokens of ``input_ids`` to process
            (values larger than ``len(input_ids)`` are clamped).
        model: Callable model accepting ``input_ids`` and ``attention_mask``
            keyword tensors and returning an object with a ``logits``
            attribute. Its ``config.bos_token_id`` / ``config.eos_token_id``
            are used as the wrapping tokens when present.
        window_length: Maximum number of content tokens per window. The
            default of 510 leaves room for the two special tokens.

    Returns:
        A list of ``(logits, token_length)`` tuples, one per window, where
        ``token_length`` is the number of content tokens in that window
        (special tokens excluded). Empty when there is nothing to process.

    Raises:
        ValueError: If ``window_length`` is smaller than 1.
    """
    if window_length < 1:
        # A non-positive window would never advance through the sequence.
        raise ValueError("window_length must be a positive integer")

    # Take the special-token ids from the model configuration instead of
    # hard-coding BERT's [CLS]/[SEP] ids (101/102); those ids are only kept
    # as a fallback for models whose config does not declare bos/eos ids.
    config = getattr(model, "config", None)
    bos_token_id = getattr(config, "bos_token_id", None)
    eos_token_id = getattr(config, "eos_token_id", None)
    if bos_token_id is None:
        bos_token_id = 101
    if eos_token_id is None:
        eos_token_id = 102

    # Inputs must live on the same device as the model's weights.
    first_param = next(iter(model.parameters()), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    input_ids = list(input_ids)
    attention_mask = list(attention_mask)
    total_len = min(total_len, len(input_ids))

    logits_list = []
    for start in range(0, total_len, window_length):
        end = min(start + window_length, total_len)

        # Wrap this window (not the whole text) with the special tokens.
        input_ids_chunk = [bos_token_id] + input_ids[start:end] + [eos_token_id]
        attention_mask_chunk = [1] + attention_mask[start:end] + [1]

        # Number of content tokens covered by this window.
        token_length = end - start

        input_dict = {
            'input_ids': torch.tensor(
                input_ids_chunk, dtype=torch.long, device=target_device
            ).unsqueeze(0),
            'attention_mask': torch.tensor(
                attention_mask_chunk, dtype=torch.int, device=target_device
            ).unsqueeze(0),
        }

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**input_dict)

        logits_list.append((outputs.logits, token_length))

    return logits_list

This defines the function that processes a range of rows. It shows a progress bar so I can follow the progress (44,100 videos to do).

In [5]:
def process_rows(start_index, end_index):
    """Score the captions of rows ``start_index``..``end_index`` (inclusive).

    For every row of the module-level ``df`` in the range, the caption text
    is tokenized without special tokens, passed to
    ``chunk_text_to_window_size_and_predict_proba`` and the returned
    per-window results are appended to ``video_sentiment_scores`` under the
    row's ``videoId``. ``processed_count`` is incremented for each row that
    completes without an error.

    A row that raises is reported on stdout and skipped, so a single bad
    caption does not abort the run.

    Args:
        start_index: Positional index of the first row to process.
        end_index: Positional index of the last row to process (inclusive).
    """
    global processed_count
    global video_sentiment_scores

    # Initialize the tqdm progress bar
    with tqdm(total=end_index - start_index + 1, desc='Processing texts', unit='texts') as pbar:
        for index in range(start_index, end_index + 1):
            try:
                row = df.iloc[index]
                text = row['caption_text']
                video_id = row['videoId']

                # Tokenize the text to obtain input_ids and attention_mask
                tokens = tokenizer.encode_plus(text, add_special_tokens=False)
                input_ids = tokens['input_ids']
                attention_mask = tokens['attention_mask']

                # Chunk the text and predict probabilities for each segment
                proba_list = chunk_text_to_window_size_and_predict_proba(
                    input_ids, attention_mask, len(input_ids), model
                )

                # Append the segments to this video's list, creating it on
                # first use (works for a plain dict as well as a defaultdict).
                video_sentiment_scores.setdefault(video_id, []).extend(proba_list)
                processed_count += 1

            except Exception as e:
                # Deliberate best effort: report the failing row and move on.
                print(f"Error processing text at index {index}: {e}")

            finally:
                # Count every attempted row so the bar still reaches 100%
                # when some rows fail.
                pbar.update(1)

This defines the first and last row index to process, calls `process_rows` to run the analysis, and then saves the collected scores to a pickle file.  
For this example I went through 51 videos (indices 0 to 50). The final index of the full dataset is 44099.

In [6]:
# Range of rows to process (both ends inclusive).
start_index = 0
end_index = 50

# Resolve the output location and make sure the folder exists BEFORE the
# long-running inference, so a missing directory cannot cost the results.
# NOTE(review): this raw string keeps the doubled backslashes literally;
# confirm that is the intended path spelling.
save_folder = r'C:\\Users\\goral\\Documents\\Python\\Scripts\\SAVES'
file_name = r"video_sentiment_scores_local_sample"
save_path = os.path.join(save_folder, file_name + ".pkl")
os.makedirs(save_folder, exist_ok=True)

process_rows(start_index, end_index)

# Convert every stored tensor to a NumPy array on the CPU so the pickle does
# not contain torch tensors; the token length of each entry is kept as is.
video_sentiment_scores_cpu = defaultdict(list)

for video_id, sentiment_scores in video_sentiment_scores.items():
    video_sentiment_scores_cpu[video_id] = [
        (scores.cpu().numpy(), token_length) for scores, token_length in sentiment_scores
    ]

with open(save_path, 'wb') as file:
    pickle.dump(video_sentiment_scores_cpu, file)

print(f"Video sentiment scores saved to: {save_path}")

Processing texts: 100%|██████████| 51/51 [00:09<00:00,  5.29texts/s]

Video sentiment scores saved to: video_sentiment_scores_local_sample.pkl



