In [20]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from tqdm import tqdm
import torch.nn.functional as F


import gensim.downloader
from scipy.sparse import hstack

import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

import emoji

import time

# Importing Data

In [10]:
annotated_data = pd.read_csv('data/incomplete_annotations_data.csv')
full_data = pd.read_csv('data/merged_comments2.csv')

In [13]:
print(annotated_data.shape)
annotated_data = annotated_data.rename(columns={'Brand': 'brand', 'Search Term': 'search_term', 'Comment': 'comment', 'Metadata': 'additional_info', 'Source': 'source', 'Subjectivity': 'subjectivity', 'Polarity': 'polarity'})
annotated_data.head()

(2683, 7)


Unnamed: 0,brand,search_term,comment,source,additional_info,subjectivity,polarity
0,Nike,waste,Designing products with sustainability in mind...,Twitter,"{'Name': 'Angla Sicurella', 'Handle': '@AnglaS...",0.0,
1,Nike,waste,Kirby would have been a waste of time - why ev...,Twitter,"{'Name': 'LisaKingWheless', 'Handle': '@Lisapc...",1.0,0.0
2,Nike,waste,I wouldn’t spend another dollar at that theate...,Twitter,"{'Name': 'Sheila McSheilerton', 'Handle': '@sh...",1.0,0.0
3,Nike,waste,Call them back and tell them they’re lying bec...,Twitter,"{'Name': 'UncleChrissy', 'Handle': '@uncle_chr...",1.0,0.0
4,Nike,waste,I’m really sitting here going in on myself..li...,Twitter,"{'Name': 'Jade ☥', 'Handle': '@jmerarity', 'Ti...",1.0,1.0


In [14]:
full_data.head()

Unnamed: 0,comment_id,brand,search_term,comment,source,additional_info,likes,timestamp
0,1,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,
1,2,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,
2,3,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,
3,4,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,
4,5,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,


In [4]:
print(full_data.shape)
full_data.head()

(41172, 7)


Unnamed: 0,Brand,Search Term,Comment,Source,Metadata,likes,timestamp
0,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,
1,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,
2,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,
3,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,
4,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,


In [15]:
full_data.drop_duplicates(subset='comment', inplace=True)
annotated_data.drop_duplicates(subset='comment', inplace=True)

In [16]:
# Merge full_data with annotated_data based on 'Comment' column
merged_data = pd.merge(full_data, annotated_data[['comment','subjectivity','polarity']], on='comment', how='left')

merged_data.head()

Unnamed: 0,comment_id,brand,search_term,comment,source,additional_info,likes,timestamp,subjectivity,polarity
0,1,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,,
1,2,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,,
2,3,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,,
3,4,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,,
4,5,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,,


In [18]:
unannotated_data = merged_data[merged_data['subjectivity'].isnull()]

# Preprocessing

In [19]:
# Creating extra column for preprocessed text
unannotated_data['preprocessed_comment'] = unannotated_data['comment']

{"ain't": 'is not', "aren't": 'are not', "can't": 'cannot', "can't've": 'cannot have', "'cause": 'because', "could've": 'could have', "couldn't": 'could not', "couldn't've": 'could not have', "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hadn't've": 'had not have', "hasn't": 'has not', "haven't": 'have not', "he'd": 'he would', "he'd've": 'he would have', "he'll": 'he will', "he'll've": 'he he will have', "he's": 'he is', "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', "I'd": 'I would', "I'd've": 'I would have', "I'll": 'I will', "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have', "i'd": 'i would', "i'd've": 'i would have', "i'll": 'i will', "i'll've": 'i will have', "i'm": 'i am', "i've": 'i have', "isn't": 'is not', "it'd": 'it would', "it'd've": 'it would have', "it'll": 'it will', "it'll've": 'it will have', "it's": 'it is', "let's": 'let us', "ma'am": 'madam', "mayn't": 'may not', "might've": 'migh

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unannotated_data['preprocessed_comment'] = unannotated_data['comment']


In [21]:
# Normalizing emojis

def demojize_with_delimiters(text):
    return emoji.demojize(text, delimiters=(" ", " "))

unannotated_data['preprocessed_comment'] = unannotated_data['preprocessed_comment'].apply(lambda x: demojize_with_delimiters(x) if isinstance(x, str) else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unannotated_data['preprocessed_comment'] = unannotated_data['preprocessed_comment'].apply(lambda x: demojize_with_delimiters(x) if isinstance(x, str) else x)


In [22]:
# Lowercasing

unannotated_data['preprocessed_comment'] = unannotated_data['preprocessed_comment'].apply(lambda x: x.lower() if isinstance(x, str) else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unannotated_data['preprocessed_comment'] = unannotated_data['preprocessed_comment'].apply(lambda x: x.lower() if isinstance(x, str) else x)


In [23]:
# Removing stopwords
nltk.download('stopwords')
nltk.download('punkt')

def remove_stopwords(text):
    # Ensure the input is a string
    if isinstance(text, str):
        # Tokenize the text into words
        words = nltk.word_tokenize(text)

        # Get the list of stopwords
        stop_words = set(stopwords.words('english'))

        # Remove stopwords from the tokenized words
        filtered_words = [word for word in words if word.lower() not in stop_words]

        # Join the filtered words back into a single string
        filtered_text = ' '.join(filtered_words)

        return filtered_text
    else:
        return text

unannotated_data['preprocessed_comment'] = unannotated_data['preprocessed_comment'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Louis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Louis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unannotated_data['preprocessed_comment'] = unannotated_data['preprocessed_comment'].apply(remove_stopwords)


In [24]:
with open('abbreviations_list.pkl', 'rb') as file:
    abbreviations = pickle.load(file)

print(abbreviations)

# Function to manually tokenize text including punctuations
def custom_tokenize(text):
    # Regex pattern to match words (including contractions) and separate punctuation
    tokens = re.findall(r"[\w']+|[.,!?;]", text)
    return tokens

# Normalize slangs and abbreviations
def normalize_slangs_abbreviations_custom(text, slang_dict):
    if isinstance(text, str):
        tokens = custom_tokenize(text)
        normalized_tokens = [slang_dict.get(token.lower(), token) for token in tokens]
        # Reconstruct the text
        normalized_text = ' '.join(normalized_tokens).replace(" ,", ",").replace(" .", ".").replace(" !", "!").replace(" ?", "?")
        return normalized_text
    else:
        return text

unannotated_data['preprocessed_comment'] = unannotated_data['preprocessed_comment'].apply(lambda x: normalize_slangs_abbreviations_custom(x, abbreviations))

{"ain't": 'is not', "aren't": 'are not', "can't": 'cannot', "can't've": 'cannot have', "'cause": 'because', "could've": 'could have', "couldn't": 'could not', "couldn't've": 'could not have', "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hadn't've": 'had not have', "hasn't": 'has not', "haven't": 'have not', "he'd": 'he would', "he'd've": 'he would have', "he'll": 'he will', "he'll've": 'he he will have', "he's": 'he is', "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', "I'd": 'I would', "I'd've": 'I would have', "I'll": 'I will', "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have', "i'd": 'i would', "i'd've": 'i would have', "i'll": 'i will', "i'll've": 'i will have', "i'm": 'i am', "i've": 'i have', "isn't": 'is not', "it'd": 'it would', "it'd've": 'it would have', "it'll": 'it will', "it'll've": 'it will have', "it's": 'it is', "let's": 'let us', "ma'am": 'madam', "mayn't": 'may not', "might've": 'migh

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unannotated_data['preprocessed_comment'] = unannotated_data['preprocessed_comment'].apply(lambda x: normalize_slangs_abbreviations_custom(x, abbreviations))


In [25]:
unannotated_data

Unnamed: 0,comment_id,brand,search_term,comment,source,additional_info,likes,timestamp,subjectivity,polarity,preprocessed_comment
0,1,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,,,trash space hippie behind design nike i nike é...
1,2,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,,,trash space hippie behind design nike so lie?
2,3,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,,,trash space hippie behind design nike this pro...
3,4,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,,,trash space hippie behind design nike just got...
4,5,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,,,trash space hippie behind design nike i tried ...
...,...,...,...,...,...,...,...,...,...,...,...
41163,41164,Puma,quality,"What do Post Malone, Black Pumas and Lou Reed ...",Twitter,"{'Name': 'Ale𝕏 Gear & Tech', 'Handle': '@AlexG...",29,2023-11-04T12:47:12Z,,,"post malone, black pumas lou reed common? coll..."
41164,41165,Nike,quality,Are you comparing it to the price of the shamr...,Twitter,"{'Name': '', 'Handle': '@WeAreNDFans', 'Timest...",0,2022-09-19T14:58:24Z,,,"comparing price shamrock series?, fair compari..."
41166,41167,Uniqlo,price,Some people see their prices and compare them ...,Twitter,"{'Name': 'Miguel Marcos Martinez, FRSA, MCybSo...",2,2022-09-15T11:09:51Z,,,people see prices compare uniqlo whoever. comp...
41167,41168,Nike,price,BASED ON POPULAR DEMANDS*NIKE AIR FORCE1 LOW M...,Twitter,"{'Name': 'Ibrahim', 'Handle': '@exclusive43779...",0,2024-03-29T17:53:13Z,,,based popular demands nike air force1 low mich...


# Ensemble Model

## Subjectivity Detection - RoBERTa

In [26]:
# Load unannotated data
unannotated_texts = unannotated_data['preprocessed_comment'].tolist()

# Tokenize unannotated texts
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
encoded_unannotated = tokenizer(unannotated_texts, padding=True, truncation=True, return_tensors='pt')

# Prepare DataLoader for unannotated data
unannotated_dataset = TensorDataset(encoded_unannotated['input_ids'], encoded_unannotated['attention_mask'])
unannotated_loader = DataLoader(unannotated_dataset, batch_size=16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model.load_state_dict(torch.load('models/best_roberta_adamw_subjectivity2.pth'))
model.to(device)
model.eval()

# Predict
start_time = time.time()

predictions = []
for batch in tqdm(unannotated_loader, desc="Predicting"):
    input_ids, attention_mask = batch
    
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).detach().cpu().numpy()
        predictions.extend(preds)


end_time = time.time()

# Save or process predictions
unannotated_data['subjectivity'] = predictions

print(f"Time taken to predict subjectivity labels: {end_time - start_time}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting: 100%|██████████| 2408/2408 [11:46<00:00,  3.41it/s]

Time taken to predict subjectivity labels: 706.2395353317261



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unannotated_data['subjectivity'] = predictions


12m 40.8s, 38515 records --> 0.0198s/rec

In [27]:
unannotated_data

Unnamed: 0,comment_id,brand,search_term,comment,source,additional_info,likes,timestamp,subjectivity,polarity,preprocessed_comment
0,1,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,0,,trash space hippie behind design nike i nike é...
1,2,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,0,,trash space hippie behind design nike so lie?
2,3,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,,trash space hippie behind design nike this pro...
3,4,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,,trash space hippie behind design nike just got...
4,5,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,0,,trash space hippie behind design nike i tried ...
...,...,...,...,...,...,...,...,...,...,...,...
41163,41164,Puma,quality,"What do Post Malone, Black Pumas and Lou Reed ...",Twitter,"{'Name': 'Ale𝕏 Gear & Tech', 'Handle': '@AlexG...",29,2023-11-04T12:47:12Z,0,,"post malone, black pumas lou reed common? coll..."
41164,41165,Nike,quality,Are you comparing it to the price of the shamr...,Twitter,"{'Name': '', 'Handle': '@WeAreNDFans', 'Timest...",0,2022-09-19T14:58:24Z,1,,"comparing price shamrock series?, fair compari..."
41166,41167,Uniqlo,price,Some people see their prices and compare them ...,Twitter,"{'Name': 'Miguel Marcos Martinez, FRSA, MCybSo...",2,2022-09-15T11:09:51Z,1,,people see prices compare uniqlo whoever. comp...
41167,41168,Nike,price,BASED ON POPULAR DEMANDS*NIKE AIR FORCE1 LOW M...,Twitter,"{'Name': 'Ibrahim', 'Handle': '@exclusive43779...",0,2024-03-29T17:53:13Z,0,,based popular demands nike air force1 low mich...


In [28]:
unannotated_data[unannotated_data['subjectivity'].isnull()]

Unnamed: 0,comment_id,brand,search_term,comment,source,additional_info,likes,timestamp,subjectivity,polarity,preprocessed_comment


## Polarity Detection

In [29]:
unannotated_polarity_data = unannotated_data[unannotated_data['subjectivity']==1]
unannotated_polarity_data.head()

Unnamed: 0,comment_id,brand,search_term,comment,source,additional_info,likes,timestamp,subjectivity,polarity,preprocessed_comment
2,3,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,,trash space hippie behind design nike this pro...
3,4,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,,trash space hippie behind design nike just got...
6,7,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,,trash space hippie behind design nike are plan...
8,9,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,,trash space hippie behind design nike sick des...
14,15,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,,trash space hippie behind design nike good int...


In [30]:
# Load pre-trained Word2Vec model
word_embeddings = gensim.downloader.load('word2vec-google-news-300')

In [31]:
# FEATURE EXTRACTION

# Extract comments and corresponding subjectivity labels
comments = unannotated_polarity_data['preprocessed_comment'].tolist()
labels = unannotated_polarity_data['polarity'].tolist()

# Convert text data to TF-IDF features
with open('models/tfidf_vectorizer.pkl', 'rb') as file:
    tfidf_vectorizer = pickle.load(file)
tfidf_features = tfidf_vectorizer.transform(comments)

# Convert text data to n-gram features
with open('models/ngram_vectorizer.pkl', 'rb') as file:
    ngram_vectorizer = pickle.load(file)
ngram_features = ngram_vectorizer.transform(comments)

# Convert each comment to a vector representation using word embeddings
comment_vectors = []
for comment in comments:
    words = comment.split()
    vectors = [word_embeddings[word] for word in words if word in word_embeddings]
    if vectors:
        comment_vectors.append(sum(vectors) / len(vectors))  # Average of word vectors in the comment
    else:
        comment_vectors.append([0] * 300)  # Use zero vector if no word found

# Combine features
combined_features = hstack((tfidf_features, ngram_features, comment_vectors))

In [32]:
with open('models/logistic_regression_polarity.pkl', 'rb') as file:
    logistic_regression_model = pickle.load(file)

In [33]:
# Predict labels
log_start_time = time.time()

predicted_labels = logistic_regression_model.predict(combined_features)

log_end_time = time.time()
print("Time taken to predict polarity labels using Logistic Regression:", log_end_time - log_start_time)

# Add predictions back to the DataFrame
unannotated_polarity_data['polarity'] = predicted_labels

Time taken to predict polarity labels using Logistic Regression: 0.14203262329101562


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unannotated_polarity_data['polarity'] = predicted_labels


In [34]:
unannotated_polarity_data

Unnamed: 0,comment_id,brand,search_term,comment,source,additional_info,likes,timestamp,subjectivity,polarity,preprocessed_comment
2,3,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,1.0,trash space hippie behind design nike this pro...
3,4,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,1.0,trash space hippie behind design nike just got...
6,7,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,1.0,trash space hippie behind design nike are plan...
8,9,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,0.0,trash space hippie behind design nike sick des...
14,15,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,0.0,trash space hippie behind design nike good int...
...,...,...,...,...,...,...,...,...,...,...,...
41156,41157,Mango,waste,This looks like comparison of rotten apple wit...,Twitter,"{'Name': 'UniquePushkar', 'Handle': '@PushkarU...",0,2024-03-28T07:02:23Z,1,0.0,looks like comparison rotten apple mangoevery ...
41161,41162,Nike,price,"Unfortunately the newer brands (Decente, Grays...",Twitter,"{'Name': 'Rob Pelton', 'Handle': '@rob_pelton'...",18,2024-01-28T16:19:14Z,1,0.0,"unfortunately newer brands decente, grayson, m..."
41162,41163,Puma,price,The Bradley is cheap in comparison to SPz Puma...,Twitter,"{'Name': 'Marcel Germann', 'Handle': '@marcel_...",0,2024-01-21T11:13:04Z,1,0.0,"bradley cheap comparison spz puma, successor s..."
41164,41165,Nike,quality,Are you comparing it to the price of the shamr...,Twitter,"{'Name': '', 'Handle': '@WeAreNDFans', 'Timest...",0,2022-09-19T14:58:24Z,1,0.0,"comparing price shamrock series?, fair compari..."


In [35]:
unannotated_polarity_data['polarity'].value_counts()

polarity
1.0    11599
0.0     5409
Name: count, dtype: int64

# Combining the data back together

unannotataed_polarity_data with unannotated_data

In [36]:
# Make a copy of unannotated_data to avoid modifying it directly
unannotated_data_merged = unannotated_data.copy()

# Update the 'Polarity' column in unannotated_data_merged with values from unannotated_polarity_data
unannotated_data_merged.update(unannotated_polarity_data[['polarity']])
unannotated_data_merged

Unnamed: 0,comment_id,brand,search_term,comment,source,additional_info,likes,timestamp,subjectivity,polarity,preprocessed_comment
0,1,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,0,,trash space hippie behind design nike i nike é...
1,2,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,0,,trash space hippie behind design nike so lie?
2,3,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,1.0,trash space hippie behind design nike this pro...
3,4,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1,1.0,trash space hippie behind design nike just got...
4,5,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,0,,trash space hippie behind design nike i tried ...
...,...,...,...,...,...,...,...,...,...,...,...
41163,41164,Puma,quality,"What do Post Malone, Black Pumas and Lou Reed ...",Twitter,"{'Name': 'Ale𝕏 Gear & Tech', 'Handle': '@AlexG...",29,2023-11-04T12:47:12Z,0,,"post malone, black pumas lou reed common? coll..."
41164,41165,Nike,quality,Are you comparing it to the price of the shamr...,Twitter,"{'Name': '', 'Handle': '@WeAreNDFans', 'Timest...",0,2022-09-19T14:58:24Z,1,0.0,"comparing price shamrock series?, fair compari..."
41166,41167,Uniqlo,price,Some people see their prices and compare them ...,Twitter,"{'Name': 'Miguel Marcos Martinez, FRSA, MCybSo...",2,2022-09-15T11:09:51Z,1,0.0,people see prices compare uniqlo whoever. comp...
41167,41168,Nike,price,BASED ON POPULAR DEMANDS*NIKE AIR FORCE1 LOW M...,Twitter,"{'Name': 'Ibrahim', 'Handle': '@exclusive43779...",0,2024-03-29T17:53:13Z,0,,based popular demands nike air force1 low mich...


In [37]:
unannotated_data_merged[unannotated_data_merged['subjectivity'].isnull()]

Unnamed: 0,comment_id,brand,search_term,comment,source,additional_info,likes,timestamp,subjectivity,polarity,preprocessed_comment


unannotated_data and annotated_data with full_data

In [40]:
full_data.head()

Unnamed: 0,comment_id,brand,search_term,comment,source,additional_info,likes,timestamp
0,1,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,
1,2,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,
2,3,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,
3,4,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,
4,5,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,


In [51]:
# Make a copy of unannotated_data to avoid modifying it directly
full_data_with_labels = full_data.copy()
full_data_with_labels['subjectivity'] = np.nan
full_data_with_labels['polarity'] = np.nan

# Update the 'Polarity' column in unannotated_data_merged with values from unannotated_polarity_data
full_data_with_labels.update(unannotated_data_merged[['subjectivity','polarity']])
# full_data_with_labels.update(annotated_data[['subjectivity','polarity']])
full_data_with_labels

Unnamed: 0,comment_id,brand,search_term,comment,source,additional_info,likes,timestamp,subjectivity,polarity
0,1,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,0.0,
1,2,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,0.0,
2,3,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1.0,1.0
3,4,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1.0,1.0
4,5,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,0.0,
...,...,...,...,...,...,...,...,...,...,...
41167,41168,Nike,price,BASED ON POPULAR DEMANDS*NIKE AIR FORCE1 LOW M...,Twitter,"{'Name': 'Ibrahim', 'Handle': '@exclusive43779...",0,2024-03-29T17:53:13Z,0.0,
41168,41169,Gucci,quality,Being a YouTuber is the apex of human achievem...,Twitter,"{'Name': 'Neon Thunderbird', 'Handle': '@NeonT...",1,2024-03-30T08:38:43Z,,
41169,41170,Gucci,price,"Joe Biden didn't make you eat at a popular, fa...",Twitter,"{'Name': 'Douglas Wiegand', 'Handle': '@Eeelpo...",2,2024-03-26T18:48:42Z,,
41170,41171,Gucci,price,I kind of blame the hype beast culture. When a...,Twitter,"{'Name': 'kimxmartin', 'Handle': '@kimxmarting...",1,2024-03-26T12:40:22Z,,


In [77]:
full_data_with_labels_2 = full_data_with_labels.merge(annotated_data[['comment','subjectivity','polarity']], on='comment', how='left', suffixes=('_unannotated', '_annotated'))

# for row, index in full_data_with_labels_2.iterrows():
#     if not isinstance(row['subjectivity_unannotated'], str):
#         full_data_with_labels_2.loc[index, 'subjectivity_unannotated'] = row['subjectivity_annotated']
#         full_data_with_labels_2.loc[index, 'polarity_unannotated'] = row['polarity_annotated']

full_data_with_labels_2['subjectivity_unannotated'] = full_data_with_labels_2['subjectivity_unannotated'].fillna(full_data_with_labels_2['subjectivity_annotated'])
full_data_with_labels_2['polarity_unannotated'] = full_data_with_labels_2['polarity_unannotated'].fillna(full_data_with_labels_2['polarity_annotated'])


full_data_with_labels_2 = full_data_with_labels_2.drop(columns=['subjectivity_annotated', 'polarity_annotated'])
full_data_with_labels_2 = full_data_with_labels_2.rename(columns={'subjectivity_unannotated': 'subjectivity', 'polarity_unannotated': 'polarity'})

full_data_with_labels_2


Unnamed: 0,comment_id,brand,search_term,comment,source,additional_info,likes,timestamp,subjectivity,polarity
0,1,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,0.0,
1,2,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,0.0,
2,3,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1.0,1.0
3,4,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,1.0,1.0
4,5,Nike,waste,From Trash to Space Hippie | Behind the Design...,Youtube,"{""url"": ""https://www.youtube.com//watch?v=i3n_...",0,,0.0,
...,...,...,...,...,...,...,...,...,...,...
41167,41168,Nike,price,BASED ON POPULAR DEMANDS*NIKE AIR FORCE1 LOW M...,Twitter,"{'Name': 'Ibrahim', 'Handle': '@exclusive43779...",0,2024-03-29T17:53:13Z,0.0,
41168,41169,Gucci,quality,Being a YouTuber is the apex of human achievem...,Twitter,"{'Name': 'Neon Thunderbird', 'Handle': '@NeonT...",1,2024-03-30T08:38:43Z,1.0,0.0
41169,41170,Gucci,price,"Joe Biden didn't make you eat at a popular, fa...",Twitter,"{'Name': 'Douglas Wiegand', 'Handle': '@Eeelpo...",2,2024-03-26T18:48:42Z,1.0,0.0
41170,41171,Gucci,price,I kind of blame the hype beast culture. When a...,Twitter,"{'Name': 'kimxmartin', 'Handle': '@kimxmarting...",1,2024-03-26T12:40:22Z,1.0,0.0


In [78]:
full_data_with_labels_2[full_data_with_labels_2['subjectivity'].isnull()]

Unnamed: 0,comment_id,brand,search_term,comment,source,additional_info,likes,timestamp,subjectivity,polarity


In [79]:
full_data_with_labels.to_csv('data/complete_annotations_data2.csv', index=False)