In [1]:
# Enable the ipywidgets notebook extension (widgetsnbextension) for this Jupyter install.
# NOTE(review): presumably needed so tqdm.auto progress bars render as widgets — confirm.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


## Imports

In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import pipeline
from transformers.pipelines.pt_utils import Dataset, KeyDataset
import time
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class ListDataset(Dataset):
    """Expose a plain Python list through the ``Dataset`` interface.

    The wrapped list is kept by reference on ``original_list``; indexing and
    ``len()`` are forwarded to it unchanged.
    """

    def __init__(self, original_list):
        # Keep a reference to the caller's list; no copy is made.
        self.original_list = original_list

    def __getitem__(self, i):
        """Return the element stored at position ``i`` of the wrapped list."""
        items = self.original_list
        return items[i]

    def __len__(self):
        """Return the number of elements in the wrapped list."""
        items = self.original_list
        return len(items)

In [4]:

def _preprocess_tweet(tweet):
    """Mask user mentions and links in one tweet.

    The tweet is split on single spaces. Any word that starts with ``@`` and is
    longer than one character becomes ``@user``; any word containing ``http``
    is cut at that point and ends in the literal ``http``.
    """
    cleaned_words = []
    for word in tweet.split(' '):
        if word.startswith('@') and len(word) > 1:  # a bare "@" is not a mention
            word = '@user'
        elif "http" in word:
            # Keep whatever precedes the link (e.g. punctuation), drop the URL itself.
            word = word[:word.index("http")] + "http"
        cleaned_words.append(word)
    return " ".join(cleaned_words)


def run_sentiment_analysis_and_save(
    path_to_tweets,
    model_path="cardiffnlp/twitter-roberta-base-sentiment-latest",
    name_appendix="",
    date_column=None,
    text_column="text",
):
    """Classify every tweet in a CSV file and save the scores next to it.

    Reads ``path_to_tweets``, drops rows whose text is not a string, masks
    mentions and links, runs the ``model_path`` pipeline over the cleaned text
    and writes date, cleaned text and the pipeline output to
    ``<input path without .csv><name_appendix>_with_sentiment.csv``. The
    DataFrame index is written as the first CSV column. Username and language
    columns of the input are not carried over.

    Args:
        path_to_tweets: Path of the input CSV file.
        model_path: Model identifier passed to ``pipeline`` as both model and
            tokenizer.
        name_appendix: Text inserted before ``_with_sentiment.csv`` in the
            output file name, so several models can score the same input
            without overwriting each other.
        date_column: Name of the input column holding the tweet timestamp.
            When ``None`` the first of ``"Date"`` and ``"tweetcreatedts"``
            present in the file is used. The output column is always ``"Date"``.
        text_column: Name of the input column holding the tweet text.

    Returns:
        A DataFrame with the columns ``"Date"`` and ``"text"`` (pre-processed).
        It does NOT contain the model output; that is only written to the CSV.

    Raises:
        KeyError: If the text or date column is missing from the input file.
    """
    input_csv = pd.read_csv(path_to_tweets)

    if date_column is None:
        date_candidates = ("Date", "tweetcreatedts")
        date_column = next((name for name in date_candidates if name in input_csv.columns), None)
        if date_column is None:
            raise KeyError(
                f"None of the date columns {date_candidates} found in {path_to_tweets}; "
                "pass date_column explicitly"
            )
    for column in (text_column, date_column):
        if column not in input_csv.columns:
            raise KeyError(f"Column {column!r} not found in {path_to_tweets}")

    tweet_text = input_csv[text_column].to_list()
    tweet_date = input_csv[date_column].to_list()

    # Rows without text (e.g. NaN from an empty cell) cannot be classified; drop
    # them together with their date so both lists stay aligned.
    valid_rows = [(text, date) for text, date in zip(tweet_text, tweet_date) if isinstance(text, str)]
    filtered_tweets = [text for text, _ in valid_rows]
    filtered_dates = [date for _, date in valid_rows]
    print('Removed ', len(tweet_text) - len(filtered_tweets), 'invalid tweets')

    pre_processed = [_preprocess_tweet(tweet) for tweet in filtered_tweets]
    print('Loaded tweets at ' + path_to_tweets)

    # max_length/truncation are forwarded so over-long inputs are truncated
    # instead of failing inside the model.
    sentiment_pipeline = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path, max_length=512, truncation=True)

    print('Running Sentiment Analysis...')
    start_time = time.perf_counter()
    # Nothing to classify when every row was filtered out; skip the model call.
    result = sentiment_pipeline(pre_processed) if pre_processed else []
    end_time = time.perf_counter()
    print('Time elapsed: ', end_time - start_time, ' seconds')

    data_given = pd.DataFrame({"Date": filtered_dates, "text": pre_processed})
    # Both frames carry a fresh 0..n-1 index, so join() lines up row i of the
    # input with prediction i.
    result_df = data_given.join(pd.DataFrame(result))

    # rsplit strips only the last ".csv", so a ".csv" earlier in the path
    # (e.g. in a directory name) no longer truncates the output location.
    output_path = path_to_tweets.rsplit('.csv', 1)[0] + name_appendix + '_with_sentiment.csv'
    result_df.to_csv(output_path)
    return data_given
    

## Q1

In [32]:
# Q1: score the combined Q1 tweet file with the function's default model.
# The scores are saved as all_tweets_with_sentiment.csv in the same folder.
q1_path = '../DataCollection/scrapping/output/q1/all_tweets.csv'
run_sentiment_analysis_and_save(q1_path)

Removed  1 invalid tweets
Loaded tweets at ../DataCollection/scrapping/output/q1/all_tweets.csv


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Running Sentiment Analysis...
Time elapsed:  2803.428108215332  seconds


Unnamed: 0,Date,text
0,2021-12-24 23:57:40+00:00,@user Ukraine is a liability for both Russia a...
1,2021-12-24 23:57:27+00:00,.... because the world must show that Russia’s...
2,2021-12-24 23:57:25+00:00,Russia should be stopped in Ukraine &amp; Ukra...
3,2021-12-24 23:55:36+00:00,@user The Russia-connected sources I read expr...
4,2021-12-24 23:55:16+00:00,@user @user @user @user @user @user @user @use...
...,...,...
75204,2022-05-23 22:58:29+00:00,@user Why? Why should they? give it another fe...
75205,2022-05-23 22:58:27+00:00,@user Root of inflation is energy and food pri...
75206,2022-05-23 22:58:22+00:00,@user Ukraine is not Iraq. This piece was oppo...
75207,2022-05-23 22:58:21+00:00,The US was defeated by the Taliban. It has no ...


### Emotions

In [8]:
# Q1 emotions: same input file as the Q1 sentiment run, scored with an emotion
# classifier. name_appendix keeps the output (all_tweets_emotions_with_sentiment.csv)
# from overwriting the sentiment results.
q1_path_emotions = '../DataCollection/scrapping/output/q1/all_tweets.csv'
model_path = "j-hartmann/emotion-english-distilroberta-base"
run_sentiment_analysis_and_save(q1_path_emotions, model_path, name_appendix="_emotions")

Removed  1 invalid tweets
Loaded tweets at ../DataCollection/scrapping/output/q1/all_tweets.csv


Downloading: 100%|█████████████████████████████████████████████████████████████████| 0.98k/0.98k [00:00<00:00, 501kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████| 313M/313M [01:13<00:00, 4.49MB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████| 294/294 [00:00<00:00, 146kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████| 780k/780k [00:02<00:00, 306kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████| 446k/446k [00:01<00:00, 295kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████| 1.29M/1.29M [00:04<00:00, 315kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████| 239/239 [00:00<00:00, 237kB/s]


Running Sentiment Analysis...
Time elapsed:  4630.809329509735  seconds


Unnamed: 0,Date,text
0,2021-12-24 23:57:40+00:00,@user Ukraine is a liability for both Russia a...
1,2021-12-24 23:57:27+00:00,.... because the world must show that Russia’s...
2,2021-12-24 23:57:25+00:00,Russia should be stopped in Ukraine &amp; Ukra...
3,2021-12-24 23:55:36+00:00,@user The Russia-connected sources I read expr...
4,2021-12-24 23:55:16+00:00,@user @user @user @user @user @user @user @use...
...,...,...
75204,2022-05-23 22:58:29+00:00,@user Why? Why should they? give it another fe...
75205,2022-05-23 22:58:27+00:00,@user Root of inflation is energy and food pri...
75206,2022-05-23 22:58:22+00:00,@user Ukraine is not Iraq. This piece was oppo...
75207,2022-05-23 22:58:21+00:00,The US was defeated by the Taliban. It has no ...


## Q2

### English

In [13]:
# Q2 / English: tweets about NATO, scored with the function's default model.
q2_path_1 = '../DataCollection/scrapping/output/q2/nato_english.csv'
run_sentiment_analysis_and_save(q2_path_1)

Removed  0 invalid tweets
Loaded tweets at ../DataCollection/scrapping/output/q2/nato_english.csv


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Running Sentiment Analysis...
Time elapsed:  1418.7985889911652  seconds


Unnamed: 0,Date,text
0,2021-12-24 23:59:17+00:00,"@user @user @user My guess is, he just don't w..."
1,2021-12-24 23:54:44+00:00,@user @user @user @user @user Show me support ...
2,2021-12-24 23:53:40+00:00,@user @user @user What makes NATO more than th...
3,2021-12-24 23:51:47+00:00,@user A NATO invention into Ukraine due to a r...
4,2021-12-24 23:51:44+00:00,@user @user I haven't been counting. But there...
...,...,...
37565,2022-05-23 23:20:58+00:00,@user @user @user Nato IS the future architect...
37566,2022-05-23 23:20:44+00:00,@user @user @user @user See? Clear. And applie...
37567,2022-05-23 23:20:43+00:00,"A combined U.S. and NATO ""no go"" zones for Rus..."
37568,2022-05-23 23:20:10+00:00,@user All of this by the way assumes that the ...


In [14]:
# Q2 / English: tweets about Putin, scored with the function's default model.
q2_path_2 = '../DataCollection/scrapping/output/q2/putin_english.csv'
run_sentiment_analysis_and_save(q2_path_2)

Removed  0 invalid tweets
Loaded tweets at ../DataCollection/scrapping/output/q2/putin_english.csv


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Running Sentiment Analysis...
Time elapsed:  1426.9919798374176  seconds


Unnamed: 0,Date,text
0,2021-12-24 23:59:55+00:00,@user Putin is better than even Ardern and Tru...
1,2021-12-24 23:59:29+00:00,@user You long for the days when Putin was pul...
2,2021-12-24 23:58:14+00:00,"@user … which, funnily enough, sounds a lot li..."
3,2021-12-24 23:58:00+00:00,"@user He's only playing around, Putin (a forme..."
4,2021-12-24 23:57:58+00:00,@user @user @user We're not going that far! \n...
...,...,...
37666,2022-05-23 23:48:02+00:00,@user 6 months ago you had no problem pushing ...
37667,2022-05-23 23:48:02+00:00,"Mur fuck lilek innifsek, Putin!\n(Maltese)\n\n..."
37668,2022-05-23 23:47:59+00:00,@user They’re going to need it for me. It’s a ...
37669,2022-05-23 23:47:56+00:00,@user @user @user @user This might be the most...


In [15]:
# Q2 / English: tweets about Zelensky, scored with the function's default model.
q2_path_3 = '../DataCollection/scrapping/output/q2/zelensky_english.csv'
run_sentiment_analysis_and_save(q2_path_3)

Removed  0 invalid tweets
Loaded tweets at ../DataCollection/scrapping/output/q2/zelensky_english.csv


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Running Sentiment Analysis...
Time elapsed:  1252.6350831985474  seconds


Unnamed: 0,Date,text
0,2021-12-24 23:29:03+00:00,@user Zelensky has just arrested his political...
1,2021-12-24 23:27:20+00:00,@user @user @user @user @user Trump blocked pa...
2,2021-12-24 23:05:47+00:00,@user Or at least that the call to Volodymyr Z...
3,2021-12-24 23:04:54+00:00,.. @user of the Constitutional Rights of Ukrai...
4,2021-12-24 22:02:47+00:00,I was very glad to join a call this AM with #U...
...,...,...
33452,2022-05-23 22:34:58+00:00,Ukraine was/is a corrupt country.. It’s been u...
33453,2022-05-23 22:34:55+00:00,"@user Criminal Zelensky, at it again."
33454,2022-05-23 22:34:45+00:00,@user Revenge from ucranian goverment against ...
33455,2022-05-23 22:34:44+00:00,@user @user @user @user @user @user Its freaki...


### Russian

In [9]:
# Q2 / Russian: tweets about NATO, scored with a Russian sentiment model.
# `model_path` is a notebook-global that the Putin and Zelensky Russian cells
# below reuse, so this cell has to run before them.
model_path = "Tatyana/rubert-base-cased-sentiment-new"
q2_path_4 = '../DataCollection/scrapping/output/q2/nato_russian.csv'
run_sentiment_analysis_and_save(q2_path_4, model_path)

Removed  0 invalid tweets
Loaded tweets at ../DataCollection/scrapping/output/q2/nato_russian.csv


Downloading: 100%|████████████████████████████████████████████████████████████████████████████| 943/943 [00:00<?, ?B/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████| 679M/679M [06:15<00:00, 1.90MB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████| 499/499 [00:00<?, ?B/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████| 1.34M/1.34M [00:04<00:00, 293kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████| 112/112 [00:00<00:00, 14.0kB/s]


Running Sentiment Analysis...
Time elapsed:  3889.592109441757  seconds


Unnamed: 0,Date,text
0,2021-12-24 23:50:03+00:00,"@user @user @user О нет, НАТО не могло! Не мог..."
1,2021-12-24 23:47:44+00:00,@user Зря вы так. Вы думаете их трое поэтому с...
2,2021-12-24 23:42:35+00:00,@user Майкл почему вы прибалтику считаете мише...
3,2021-12-24 23:34:57+00:00,"@user Да, по шевелениям НАТО можно фактически ..."
4,2021-12-24 23:24:19+00:00,@user На суде у сисяна в поддержке были предст...
...,...,...
37578,2022-05-23 15:49:12+00:00,@user @user Опоздала Машка. Ниша занята супруг...
37579,2022-05-23 15:43:47+00:00,@user @user @user Ты идиотка вообще? Зачем пок...
37580,2022-05-23 15:43:41+00:00,* С первого выстрела: ВСУ уничтожили командный...
37581,2022-05-23 15:42:22+00:00,"@user Во-вторых, иди на хуй."


In [10]:
# Q2 / Russian: tweets about Putin.
# Relies on the global `model_path` assigned in the NATO (Russian) cell above;
# run that cell first, otherwise whatever model_path was set last is used.
q2_path_5 = '../DataCollection/scrapping/output/q2/putin_russian.csv'
run_sentiment_analysis_and_save(q2_path_5, model_path)

Removed  0 invalid tweets
Loaded tweets at ../DataCollection/scrapping/output/q2/putin_russian.csv
Running Sentiment Analysis...
Time elapsed:  3600.1475546360016  seconds


Unnamed: 0,Date,text
0,2021-12-24 23:59:34+00:00,"@user Не знаю. Не проверял.\nТак, что тебе Пут..."
1,2021-12-24 23:59:04+00:00,"@user Да продолжать не о чем, ты то какое отно..."
2,2021-12-24 23:55:26+00:00,"@user Меня устраивает.\nТак, что тебе Путин сд..."
3,2021-12-24 23:51:39+00:00,"@user ""Ты виноват лишь тем, что хочется мне ку..."
4,2021-12-24 23:50:55+00:00,"@user А, что в этом необычного?\nТы лучше скаж..."
...,...,...
37328,2022-05-23 19:56:46+00:00,@user Путин черт
37329,2022-05-23 19:55:26+00:00,@user Я из Донецка. Мне постоянно страшно слыш...
37330,2022-05-23 19:55:00+00:00,24.05.2022\nЗапомним этот день\nПотому что чер...
37331,2022-05-23 19:54:48+00:00,"Прочитала, что Путин при рождении был записан,..."


In [11]:
# Q2 / Russian: tweets about Zelensky.
# Relies on the global `model_path` assigned in the NATO (Russian) cell above;
# run that cell first. The recorded run of this cell took about 21,500 s (~6 h).
q2_path_6 = '../DataCollection/scrapping/output/q2/zelensky_russian.csv'
run_sentiment_analysis_and_save(q2_path_6, model_path)

Removed  0 invalid tweets
Loaded tweets at ../DataCollection/scrapping/output/q2/zelensky_russian.csv
Running Sentiment Analysis...
Time elapsed:  21506.704849481583  seconds


Unnamed: 0,Date,text
0,2021-12-24 23:39:32+00:00,@user Когда уже у нас в Нижнем Зеленский съезд...
1,2021-12-24 23:16:10+00:00,@user @user @user @user читала и смотрела и ...
2,2021-12-24 22:57:12+00:00,@user А что тут понимать? Зеленский переиграл ...
3,2021-12-24 22:29:49+00:00,"@user @user Карлан ебанный, тебе не в западло ..."
4,2021-12-24 22:19:39+00:00,"@user @user @user Лох, умри в муках как и никч..."
...,...,...
36527,2022-05-23 17:00:40+00:00,Говорят о трибунале для совершивших военные пр...
36528,2022-05-23 16:59:35+00:00,"Президент Украины Владимир Зеленский, предоста..."
36529,2022-05-23 16:56:55+00:00,зеленский в молодости это украинский чонгук an...
36530,2022-05-23 16:56:41+00:00,Зеленский мог бы спокойно с 24 февраля разъезж...


## Q3

I am making some significant changes here.
First, I am using the data that I scraped myself, which contains only the news headlines and nothing else (no replies or retweets).
It includes every tweet posted by these channels from December to April, so I will later have to manually pick out the tweets related to the war in Ukraine.


In [29]:
# Q3: Fox News channel tweets, scored with the function's default model.
# NOTE(review): the recorded output below reports 'data/q3/FoxNews_Sheikh.csv',
# so this path was changed after the last run — confirm which location is current.
q3_path_1 = '../DataCollection/scrapping/output/q3/FoxNews_Sheikh.csv'
run_sentiment_analysis_and_save(q3_path_1)

Removed  0 invalid tweets
Loaded tweets at data/q3/FoxNews_Sheikh.csv


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Running Sentiment Analysis...
Time elapsed:  728.9498617649078  seconds


Unnamed: 0,Date,text
0,2022-04-29 23:50:08+00:00,OPINION: @user @user Biden thinks student loan...
1,2022-04-29 23:40:07+00:00,Clinton campaign seeks to block Durham access ...
2,2022-04-29 23:30:00+00:00,NYC bystander stabbed by group outside club in...
3,2022-04-29 23:20:00+00:00,Met Gala 2022 'Gilded Glamour' theme gets mixe...
4,2022-04-29 23:10:00+00:00,Elon Musk and Amber Heard: What we learned thi...
...,...,...
22500,2021-12-01 00:57:39+00:00,Gutfeld: 'It's going to be a war' between arme...
22501,2021-12-01 00:47:31+00:00,Illegal immigrant posed as rideshare driver an...
22502,2021-12-01 00:35:05+00:00,Salvation Army pulls controversial racism guid...
22503,2021-12-01 00:20:03+00:00,'Sex and the City' spin-off releases full trai...


In [30]:
# Q3: New York Times channel tweets, scored with the function's default model.
# NOTE(review): this cell reads from 'data/q3/' while the other cells use
# '../DataCollection/scrapping/output/...' — confirm the intended location.
q3_path_2 = 'data/q3/NYT_Sheikh.csv'
run_sentiment_analysis_and_save(q3_path_2)

Removed  0 invalid tweets
Loaded tweets at data/q3/NYT_Sheikh.csv


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Running Sentiment Analysis...
Time elapsed:  1622.3399460315704  seconds


Unnamed: 0,Date,text
0,2022-04-29 23:40:08+00:00,Linemen and receivers took center stage in the...
1,2022-04-29 23:00:10+00:00,"In Opinion\n\nJ.D. Vance's ""Trumpian turn has ..."
2,2022-04-29 22:40:05+00:00,The pandemic has upended the rigid 9-to-5 work...
3,2022-04-29 22:00:16+00:00,As a Manhattan grand jury wraps up its review ...
4,2022-04-29 21:53:06+00:00,"“If Mariupol is hell, Azovstal is worse.” The ..."
...,...,...
12388,2021-12-01 00:50:07+00:00,After thirteen cases of the Omicron variant we...
12389,2021-12-01 00:40:03+00:00,"""The last time I was inside the walls of Oxfor..."
12390,2021-12-01 00:30:09+00:00,Detectives investigating the deadly shooting o...
12391,2021-12-01 00:15:08+00:00,"Josh Duggar, who gained celebrity on the TLC r..."
