In [None]:
# Install required packages
# gspread is the Google Sheets client used below; pandas handles the tabular data.
# NOTE(review): torch, transformers, scipy, tqdm and nltk are imported later but are
# not installed here — presumably they are preinstalled in the Colab runtime; confirm.
!pip install --quiet gspread pandas

In [None]:
# Mount Google Drive
# Exposes the user's Drive under /content/drive (Colab only). Presumably this is
# where the service-account credentials file is kept — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import gspread
import pandas as pd
import numpy as np
import torch
import re
from google.colab import files
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import nltk
from scipy.special import softmax
from pathlib import Path

## IMPORTANT NOTE

The following notebook accesses a private Google Sheet containing
hydrated Twitter data.

Due to Twitter/X Terms of Service and privacy constraints,
raw tweet text and user location data are **not included**
in this public repository. Credentials and spreadsheet keys
are therefore intentionally omitted.

To reproduce the experiment:

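1. Hydrate the published Tweet IDs using the Twitter API to retrieve  
   the **Tweet Text** and **Location** fields.
2. Store the hydrated data in the first worksheet of a Google Sheet, using the
   column headers `Tweet ID`, `Tweet Text`, and `Location`. The notebook writes
   its results to the third worksheet, so the spreadsheet needs at least three.
3. Provide your own Google Service Account credentials, and share the
   spreadsheet with that service account so it can read and write it.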

**All subsequent analyses assume that these steps have been completed.**


In [None]:
# Use the path to your credentials file in Drive
# Creates the gspread client from a service-account JSON key file.
# The filename below is a placeholder and must be replaced before running.
gc = gspread.service_account(filename="PATH_TO_YOUR_CREDENTIALS.json")

In [None]:
# Open your Google Sheet by key
# "YOUR_SPREADSHEET_KEY" is a placeholder for the key of your own spreadsheet.
sh = gc.open_by_key("YOUR_SPREADSHEET_KEY")
# Worksheet index 0 is the first tab; it is the input that the next cells read.
worksheet = sh.get_worksheet(0)

In [None]:
# Get records
# Reads the whole input worksheet; the result is turned into a DataFrame below.
# NOTE(review): gspread may convert numeric-looking cells to numbers by default,
# so columns such as 'Tweet ID' may not arrive as strings — confirm.
res = worksheet.get_all_records()

In [None]:
# convert to DataFrame
df = pd.DataFrame(res)
# Preview the first rows. The rendered output contains the sheet's raw columns
# (including 'Tweet Text' and 'Location'), so clear it before sharing the notebook.
df.head()

In [None]:
# Tweet texts as a plain Python list, in sheet row order (input to the classifier).
tweets = df['Tweet Text'].tolist()

In [None]:
# Preview only the first few tweets: echoing the whole list floods the cell
# output and embeds every raw tweet text in the saved notebook.
tweets[:5]

In [None]:
# Number of tweets that will be classified.
print(len(tweets))

In [None]:
# Tweet IDs in the same row order as `tweets`, re-attached to the predictions later.
Tweet_ID = df['Tweet ID'].tolist()

In [None]:
# Location field in the same row order as `tweets`, re-attached to the predictions later.
Location = df['Location'].tolist()

In [None]:
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

In [None]:
def preprocess(text):
    """Mask user mentions and links in a tweet before tokenization.

    Every space-separated token that starts with '@' (and is longer than a
    bare '@') is replaced by '@user'; every token that starts with 'http' is
    replaced by 'http'. All other tokens are left untouched.

    Args:
        text: The tweet text. Non-string values (for example a number read
            from a spreadsheet cell) are converted with ``str()``; ``None``
            is treated as an empty string.

    Returns:
        The masked text, with the tokens re-joined by single spaces.
    """
    if text is None:
        text = ""
    elif not isinstance(text, str):
        # A numeric-looking cell can arrive as int/float, which has no .split().
        text = str(text)

    def mask(token):
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    # Split on single spaces only, so runs of spaces survive the round trip.
    return " ".join(mask(token) for token in text.split(" "))

In [None]:
# Identifier of the pretrained sentiment model. It is a plain string: the
# original f-prefix had nothing to interpolate.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Tokenizer, config and classification model are all loaded from the same identifier.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Move the model to the selected device and switch it to evaluation mode for inference.
model = model.to(device)
model.eval()

In [None]:
# --- Check label mapping ---
print(config.id2label)

# The scoring loop unpacks the probabilities positionally as
# (negative, neutral, positive) and later cells filter on these lowercase
# label names, so fail fast if the checkpoint uses another order or naming.
assert [config.id2label[i] for i in range(3)] == ["negative", "neutral", "positive"], (
    f"Unexpected label mapping: {config.id2label}"
)

In [None]:
# Per-tweet result accumulators; each name is bound to its own empty list.
(
    sentiment_predictions,  # predicted class label
    tweet_logits,           # raw logits
    tweet_probs,            # probabilities
    tweet_s,                # p_pos - p_neg
    tweet_sadj,             # (1-p_neu)*(p_pos - p_neg)
) = ([] for _ in range(5))

In [None]:
# Classify every tweet and record its label, logits, probabilities and scores.

# Start from empty accumulators so that re-running this cell cannot append a
# second copy of the results (which would break the length match with `tweets`).
for accumulator in (sentiment_predictions, tweet_logits, tweet_probs, tweet_s, tweet_sadj):
    accumulator.clear()

for i, raw_text in enumerate(tweets):
    # A numeric-looking sheet cell can arrive as int/float; preprocess() needs a str.
    text = preprocess(str(raw_text))
    # Inputs longer than 256 tokens are truncated.
    encoded_input = tokenizer(text, padding=True, truncation=True, max_length=256, return_tensors='pt').to(device)
    with torch.no_grad():
        output = model(**encoded_input)

    # Raw logits BEFORE softmax (index 0: the batch holds a single tweet)
    logits = output.logits[0].cpu().numpy()

    # --- Convert to probabilities ---
    probs = softmax(logits)  # converts logits → probabilities
    # Positional unpacking assumes the class order negative / neutral / positive;
    # compare with the config.id2label mapping printed earlier.
    p_neg, p_neu, p_pos = probs

    # Polarity score, then the same score down-weighted by the neutral probability.
    s = p_pos - p_neg
    s_adj = (1.0 - p_neu) * s

    # --- Class label based on maximum probability ---
    pred_label = config.id2label[int(np.argmax(probs))]

    # --- Save results ---
    sentiment_predictions.append(pred_label)
    tweet_logits.append(logits)
    tweet_probs.append(probs)
    tweet_s.append(s)
    tweet_sadj.append(s_adj)

    print(f"{i}: Label={pred_label}, Score={s_adj:.3f}, Probabilities={probs}")

In [None]:
# Sanity check: report how many entries each result list holds.
result_lists = {
    "sentiment_predictions": sentiment_predictions,
    "tweet_logits": tweet_logits,
    "tweet_probs": tweet_probs,
    "tweet_s": tweet_s,
    "tweet_sadj": tweet_sadj,
}
for name, lst in result_lists.items():
    print(name, len(lst))

In [None]:
# One row per tweet: inputs, model outputs and derived scores side by side.
# Despite their names, the two "*_logit" score columns hold probability-based
# scores rather than logits.
result_columns = {
    'tweet_text': tweets,
    'pred_label': sentiment_predictions,  # label with the highest probability
    'raw_logit': tweet_logits,            # raw model logits
    'probabilities': tweet_probs,         # softmax of the logits
    'tweet_logit': tweet_s,               # p_pos - p_neg
    'final_tweet_logit': tweet_sadj,      # (1 - p_neu) * (p_pos - p_neg)
    'Tweet_ID': Tweet_ID,
    'Location': Location,
}
data = pd.DataFrame(result_columns)

In [None]:
# Inspect the assembled results.
# NOTE(review): the rendered output includes the 'tweet_text' and 'Location'
# columns — clear it before publishing the notebook.
data

In [None]:
# Keep only the columns that are exported. The explicit .copy() makes df1 an
# independent frame, so later column assignments on it (such as the 'Tweet_ID'
# string conversion) cannot trigger pandas' SettingWithCopyWarning.
df1 = data[['tweet_text','pred_label','final_tweet_logit','Location','Tweet_ID']].copy()

In [None]:
# Split the results into one frame per predicted sentiment label.
positive, neutral, negative = (
    df1[df1['pred_label'] == label]
    for label in ('positive', 'neutral', 'negative')
)

In [None]:
# Report how many tweets fall into each predicted class.
for caption, subset in (
    ("positive:", positive),
    ("neutral:", neutral),
    ("negative :", negative),
):
    print(caption, len(subset))

In [None]:
# Convert the Tweet IDs to strings before export. .assign() returns a new frame
# instead of writing into df1, which is a selection of `data`; a direct column
# assignment there raises pandas' SettingWithCopyWarning.
df1 = df1.assign(Tweet_ID=df1['Tweet_ID'].astype(str))

In [None]:
# Store the DataFrame (df1) into a Google Sheet
sheets = sh.get_worksheet(2)
sheets.update([df1.columns.values.tolist()] + df1.values.tolist())