In [1]:
import pandas as pd
import numpy as np
import csv

from glob import glob
import shutil

import re

import torch

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

from scipy.special import softmax

import urllib.request

from tqdm.auto import tqdm
tqdm.pandas()

## Reading the data

In [2]:
frames = []

for file in tqdm(glob('UkraineTweets/*')):
    print(f"Reading {file}")
    df = pd.read_csv(file, usecols=['tweetid', 'text', 'hashtags', 'language'])  # Filtering columns
    df = df.loc[df['language'] == 'en'].reset_index(drop=True)  # Filtering language
    df['date'] = re.findall(r"[A-Z]{3}[0-9]{2}",file)[0]
    frames.append(df)

combined_df = pd.concat(frames, axis=0, ignore_index=True)

  0%|          | 0/13 [00:00<?, ?it/s]

Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR03.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR08.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_FEB27.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR06.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_FEB28_part1.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_FEB28_part2.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR02.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR10.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR09.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR01.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR07.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR04.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR05.csv


In [3]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3735462 entries, 0 to 3735461
Data columns (total 5 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   tweetid   int64 
 1   text      object
 2   hashtags  object
 3   language  object
 4   date      object
dtypes: int64(1), object(4)
memory usage: 142.5+ MB


In [4]:
combined_df['hashtags'] = combined_df.hashtags.progress_map(lambda x: [i['text'] for i in eval(x)])  # Keeping only hashtags

  0%|          | 0/3735462 [00:00<?, ?it/s]

In [5]:
combined_df.head()

Unnamed: 0,tweetid,text,hashtags,language,date
0,1499174584720969730,Map situation in #Ukraine after the seventh da...,"[Ukraine, RussiaUkraineConflict]",en,MAR03
1,1499174584976826368,#Ukraine: Let's just say it's not just the TB-...,[Ukraine],en,MAR03
2,1499174585073242116,⚡️The SWIFT company confirmed that it will dis...,"[EU, Russian]",en,MAR03
3,1499174585987600384,#Ukraine: Ukrainian forces recovered a Eniks E...,[Ukraine],en,MAR03
4,1499174586159665155,Volunteers needed for a rapid-response #DH #Di...,"[DH, DigitalHumanities, CulturalHeritage]",en,MAR03


## Preprocessing the data 

In [6]:
def preprocess(text):
    new_text = []
 
 
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

combined_df['text'] = combined_df['text'].progress_apply(preprocess)

  0%|          | 0/3735462 [00:00<?, ?it/s]

## Sentiment Analysis

In [7]:
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

shutil.rmtree(MODEL, ignore_errors=True)  # Deleting the older version

tokenizer = AutoTokenizer.from_pretrained(MODEL, model_max_length=512)

Downloading label mapping

In [8]:
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

Loading the model

In [9]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)
model.save_pretrained(MODEL)

Checking the sentiment in batches

In [10]:
BATCH_SIZE = 10

scores_all = np.empty((0,len(labels)))
text_all = combined_df['text'].to_list()
n = len(text_all)
with torch.no_grad():
    for start_idx in tqdm(range(0, n, BATCH_SIZE)):
        end_idx = min(start_idx+BATCH_SIZE, n)
        encoded_input = tokenizer(text_all[start_idx:end_idx], return_tensors='pt', padding=True, truncation=True).to(device)
        output = model(**encoded_input)
        scores = output[0].detach().cpu().numpy()
        scores = softmax(scores, axis=1)
        scores_all = np.concatenate((scores_all, scores), axis=0)
        del encoded_input, output, scores
        torch.cuda.empty_cache()
    

combined_df[labels] = pd.DataFrame(scores_all, columns=labels)

  0%|          | 0/373547 [00:00<?, ?it/s]

In [11]:
combined_df.to_csv("Tweets - Sentiment Analysis (RoBERTa) Raw Values.csv", index=False)

## Emotion Analysis

In [12]:
task='emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

shutil.rmtree(MODEL, ignore_errors=True)  # Deleting the older version

tokenizer = AutoTokenizer.from_pretrained(MODEL, model_max_length=512)

Downloading label mapping

In [13]:
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

Loading the model

In [14]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)
model.save_pretrained(MODEL)

Checking the emotion in batches

In [15]:
BATCH_SIZE = 10

scores_all = np.empty((0,len(labels)))
text_all = combined_df['text'].to_list()
n = len(text_all)
with torch.no_grad():
    for start_idx in tqdm(range(0, n, BATCH_SIZE)):
        end_idx = min(start_idx+BATCH_SIZE, n)
        encoded_input = tokenizer(text_all[start_idx:end_idx], return_tensors='pt', padding=True, truncation=True).to(device)
        output = model(**encoded_input)
        scores = output[0].detach().cpu().numpy()
        scores = softmax(scores, axis=1)
        scores_all = np.concatenate((scores_all, scores), axis=0)
        del encoded_input, output, scores
        torch.cuda.empty_cache()
    

combined_df[labels] = pd.DataFrame(scores_all, columns=labels)

  0%|          | 0/373547 [00:00<?, ?it/s]

In [16]:
combined_df.to_csv("Tweets - Sentiment Analysis (RoBERTa) Raw Values.csv", index=False)