## Aggregating RoBERTa scores 
Load the necessary packages and load the pickle file with the gathered sentiment scores.

In [11]:
import pickle
import torch
import pandas as pd
import numpy as np
import csv

# Location of the pickled sentiment scores (relative to the working directory)
load_path = r'video_sentiment_scores_local_gpu.pkl'

# Deserialize the stored scores into memory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# load_path should only ever point at a file from a trusted source.
with open(load_path, mode='rb') as pickle_file:
    loaded_video_sentiment_scores = pickle.load(pickle_file)

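Print the entries for three sample videos to inspect the structure of the loaded data: each video ID maps to a list of `(scores, weight)` tuples, one tuple per chunk.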

In [12]:
# Video IDs whose stored entries should be displayed
keys_to_print = ['Xd9Aj9PYats', 'TGiyIQ2SC9U', 'a_DKejYUb6E']

# Show the type and contents for each requested ID; report any ID that is absent
for key in keys_to_print:
    if key not in loaded_video_sentiment_scores:
        print(f"Key '{key}' not found.")
        continue

    value = loaded_video_sentiment_scores[key]
    print(f"Key: {key}, Type: {type(value)}")
    print(f"Value: {value}\n")

Key: Xd9Aj9PYats, Type: <class 'list'>
Value: [(array([[-2.8199172 ,  0.23089172,  2.9052267 ]], dtype=float32), 180)]

Key: TGiyIQ2SC9U, Type: <class 'list'>
Value: [(array([[ 0.7475943,  0.5757689, -1.5683148]], dtype=float32), 510), (array([[ 0.40991086,  0.77049816, -1.4032675 ]], dtype=float32), 510), (array([[ 0.6675381 ,  0.64852756, -1.5578878 ]], dtype=float32), 510), (array([[ 0.7406259,  0.6555336, -1.598153 ]], dtype=float32), 93)]

Key: a_DKejYUb6E, Type: <class 'list'>
Value: [(array([[-2.425388 ,  1.211941 ,  1.1355975]], dtype=float32), 510), (array([[-0.8975626,  1.7862841, -1.2017012]], dtype=float32), 47)]



First create an empty list. Then, for each video, weight each chunk's scores by the chunk's token count and divide by the total token count to get a token-weighted average. Finally, append each video's averages to the list.

In [13]:
def summarize_video_chunks(scores):
    """Collapse one video's chunk-level scores into a token-weighted average.

    Parameters
    ----------
    scores : list of (np.ndarray, number) tuples
        One tuple per chunk. Columns 0, 1 and 2 of the array's first row are
        read as the negative, neutral and positive scores; the second tuple
        element is the chunk's weight (its token count).

    Returns
    -------
    dict or None
        A dict with keys 'neg', 'neu', 'pos' (averages formatted as strings
        with 8 decimals), 'total_tokens' (sum of all chunk weights) and
        'chunk_count' (number of chunks in ``scores``). None is returned when
        ``scores`` is not a non-empty list or when no usable chunk carries a
        positive total weight, because no average can be computed then.
    """
    if not isinstance(scores, list) or len(scores) == 0:
        return None

    negative_sum = neutral_sum = positive_sum = 0.0
    usable_weight = 0  # weight of the chunks that actually enter the sums

    for chunk_scores, weight in scores:
        # Require a 2-D array with at least one row and three columns; a 1-D or
        # empty array is skipped instead of raising on shape[1] / [0, 0].
        usable = (
            isinstance(chunk_scores, np.ndarray)
            and chunk_scores.ndim == 2
            and chunk_scores.shape[0] >= 1
            and chunk_scores.shape[1] >= 3
        )
        if not usable:
            continue

        # Convert to Python floats so the accumulation runs in double precision
        # regardless of how the installed NumPy promotes float32 scalars.
        negative_sum += float(chunk_scores[0, 0]) * weight
        neutral_sum += float(chunk_scores[0, 1]) * weight
        positive_sum += float(chunk_scores[0, 2]) * weight
        usable_weight += weight

    # Divide by the weight of the chunks that were summed, not by the weight of
    # every chunk: otherwise a skipped chunk would shrink the average. A zero
    # denominator means there is nothing to average.
    if usable_weight <= 0:
        return None

    return {
        'neg': format(negative_sum / usable_weight, '.8f'),
        'neu': format(neutral_sum / usable_weight, '.8f'),
        'pos': format(positive_sum / usable_weight, '.8f'),
        'total_tokens': sum(weight for _, weight in scores),
        'chunk_count': len(scores),
    }


aggregated_elements_list = []
# IDs of videos for which no average could be computed, kept for inspection
skipped_video_ids = []

# loaded_video_sentiment_scores maps each video ID to a list of (scores, weight) tuples
for video_id, scores in loaded_video_sentiment_scores.items():
    video_summary = summarize_video_chunks(scores)
    if video_summary is None:
        skipped_video_ids.append(video_id)
    else:
        aggregated_elements_list.append((video_id, video_summary))


Print the first 20 aggregated entries as a quick check of the end result.

In [14]:
# Number of leading entries to show in the preview
preview_size = 20

# Print each (video_id, summary) tuple on its own line
for item in aggregated_elements_list[:preview_size]:
    print(item)


('Xd9Aj9PYats', {'neg': '-2.81991720', 'neu': '0.23089172', 'pos': '2.90522671', 'total_tokens': 180, 'chunk_count': 1})
('mj3KdTI-MJc', {'neg': '-2.17857313', 'neu': '0.63513565', 'pos': '1.47983086', 'total_tokens': 278, 'chunk_count': 1})
('NB1MbFGLP-4', {'neg': '-3.39951777', 'neu': '0.45537752', 'pos': '3.10518146', 'total_tokens': 111, 'chunk_count': 1})
('BNzOf0LMMmU', {'neg': '-3.14023209', 'neu': '0.71023858', 'pos': '2.36409974', 'total_tokens': 79, 'chunk_count': 1})
('jmSIiXWTuXU', {'neg': '-2.24710751', 'neu': '0.80607367', 'pos': '1.01895618', 'total_tokens': 78, 'chunk_count': 1})
('hptX6cW2ECo', {'neg': '-2.48175192', 'neu': '1.62193692', 'pos': '0.59548968', 'total_tokens': 119, 'chunk_count': 1})
('GRs2eMGVQzk', {'neg': '-2.43378901', 'neu': '0.42441916', 'pos': '2.10528517', 'total_tokens': 178, 'chunk_count': 1})
('G3ZfdXs0-9I', {'neg': '-1.24455845', 'neu': '0.80801034', 'pos': '0.13362151', 'total_tokens': 93, 'chunk_count': 1})
('Qsp9UZE4meE', {'neg': '-0.4126936

Then save the output into a CSV to be used in the analysis.

In [15]:
csv_file_path = 'aggregated_elements_list.csv'

# Metric columns in output order. The header and every data row are both built
# from this one list so they cannot drift apart.
metric_columns = ['neg', 'neu', 'pos', 'total_tokens', 'chunk_count']

# newline='' lets the csv module control line endings; the explicit encoding
# keeps the file identical across platforms instead of using the locale default.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Write header
    csv_writer.writerow(['video_id', *metric_columns])

    # Write one row per video: the ID followed by its metrics in column order
    csv_writer.writerows(
        [video_id, *(aggregated_data[column] for column in metric_columns)]
        for video_id, aggregated_data in aggregated_elements_list
    )