In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import preprocessor as p
import re
import json
import numpy as np
import warnings
from transformers import pipeline
import torch
from torch.utils.data import DataLoader, Dataset
from detoxify import Detoxify

pd.set_option('display.max_colwidth', None)
tqdm.pandas()
warnings.filterwarnings("ignore", category=UserWarning)

In [4]:
all_files = []
for dirname, _, filenames in os.walk('/vol/bitbucket/es1519/detecting-hidden-purpose-in-nlp-models/detoxify/war_data/data/other'):
    for filename in filenames:
        full_path=os.path.join(dirname, filename)
        all_files.append(full_path)

tmp_df_list = []
for file in all_files:
    print(f"Reading in {file}")
    tmp_df = pd.read_csv(file, compression="gzip", header=0, index_col=0)
    print(f"\t{len(tmp_df)} entries")
    tmp_df_list.append(tmp_df)

print("Concatenating the DataFrames")
# concatenate the dataframes in the temp list row-wise
data = pd.concat(tmp_df_list, axis=0)
print("Concatenation complete!")

Reading in /vol/bitbucket/es1519/detecting-hidden-purpose-in-nlp-models/detoxify/war_data/data/other/0829_UkraineCombinedTweetsDeduped.csv.gzip
	53402 entries
Reading in /vol/bitbucket/es1519/detecting-hidden-purpose-in-nlp-models/detoxify/war_data/data/other/0828_UkraineCombinedTweetsDeduped.csv.gzip
	36972 entries
Reading in /vol/bitbucket/es1519/detecting-hidden-purpose-in-nlp-models/detoxify/war_data/data/other/0821_UkraineCombinedTweetsDeduped.csv.gzip
	47708 entries
Reading in /vol/bitbucket/es1519/detecting-hidden-purpose-in-nlp-models/detoxify/war_data/data/other/0820_UkraineCombinedTweetsDeduped.csv.gzip
	44364 entries
Reading in /vol/bitbucket/es1519/detecting-hidden-purpose-in-nlp-models/detoxify/war_data/data/other/0823_UkraineCombinedTweetsDeduped.csv.gzip
	50253 entries
Reading in /vol/bitbucket/es1519/detecting-hidden-purpose-in-nlp-models/detoxify/war_data/data/other/0822_UkraineCombinedTweetsDeduped.csv.gzip
	46979 entries
Reading in /vol/bitbucket/es1519/detecting-hid

In [5]:
print(f"There are {data['language'].nunique()} unique languages in this DataFrame.")
print(data["language"].unique())
print(f"{round(data.loc[data['language']=='en'].shape[0]/data.shape[0]*100, 2)}% of the tweets are in English.")

There are 63 unique languages in this DataFrame.
['en' 'es' 'ht' 'und' 'in' 'ar' 'fr' 'ja' 'fi' 'cs' 'it' 'uk' 'bg' 'ko'
 'pt' 'tr' 'ru' 'nl' 'ur' 'th' 'lv' 'de' 'zh' 'el' 'bn' 'da' 'sl' 'ro'
 'ka' 'pl' 'iw' 'vi' 'te' 'sr' 'ta' 'eu' 'tl' 'sv' 'is' 'ca' 'hi' 'lt'
 'gu' 'no' 'kn' 'et' 'ckb' 'am' 'fa' 'hu' 'cy' 'ml' 'my' 'ne' 'ps' 'mr'
 'pa' 'or' 'hy' 'sd' 'si' 'km' 'dv']
47.89% of the tweets are in English.


In [8]:
prev_size = len(data)
# drop rows with missing values in the 'renderedContent' column
data = data.dropna(subset=['text'])
# drop all rows with non english text
data = data[data['language'] == 'en'].drop(columns=['language'])
change = prev_size - len(data)
print(f"Dropped {change} rows")
print(f"{len(data)} entries remain")

Dropped 351894 rows
323375 entries remain


In [9]:
prev_size = len(data)
dupe_mask = data['text'].duplicated(keep=False)
data = data[~dupe_mask]
change = prev_size - len(data)
print(f"Dropped {change} duplicated rows")
print(f"{len(data)} tweets remain in the dataset")

Dropped 9075 duplicated rows
314300 tweets remain in the dataset


In [11]:
# Define a regular expression pattern to match hashtags
pattern = r'#(\w+)'

# Extract hashtags from the renderedContent column and concatenate them into a single list
hashtags = []
for text in data['text']:
    hashtags += re.findall(pattern, text)

# Count the frequency of each hashtag
hashtag_counts = pd.Series(hashtags).value_counts()

# Print the top 10 most common hashtags
print("Ten most common hashtags in the text:")
print(hashtag_counts.head(25))

most_common_hashtag = hashtag_counts.iloc[:25]

Ten most common hashtags in the text:
Ukraine                    100557
Russia                      52820
RussiaIsATerroristState     21695
Biden                       20332
StandWithUkraine            19958
Putin                       17961
UkraineRussiaWar            17043
Russian                     16967
SlavaUkraini                11969
China                       11380
UkraineWar                  11323
ukraine                     11044
russia                       9659
Kherson                      9100
Ukrainian                    8847
NATO                         8828
USA                          8714
RussiaUkraineWar             7110
war                          6793
UkraineRussianWar            5901
VMAs                         5694
ArmUkraineNow                5022
Crimea                       4625
Trump                        4541
UkraineWillWin               4463
dtype: int64


In [12]:
# Define a regular expression pattern to match hashtags
pattern = r'@(\w+)'

# Extract hashtags from the renderedContent column and concatenate them into a single list
mentions = []
for text in data['text']:
    mentions += re.findall(pattern, text)

# Count the frequency of each mention
mention_counts = pd.Series(mentions).value_counts()

# Print the top 10 most common mentions
print("Ten most common mentions in the text:")
print(mention_counts.head(10))
most_common_mentions = mention_counts.iloc[:10]

Ten most common mentions in the text:
POTUS              4058
mfa_russia         2522
ZelenskyyUa        2489
YouTube            2216
JYPETWICE          1987
UN                 1660
NATO               1490
BorisJohnson       1448
KyivIndependent    1386
RussiaUN           1345
dtype: int64


In [14]:
def remove_unnecessary(text):
    text = text.replace("\n", " ")
    text = text.replace("&amp;", " ")
    for hashtag in most_common_hashtag.keys():
        text = text.replace(f"#{hashtag}", " ".join(re.findall('[A-Z][^A-Z]*', hashtag)))
    for mention in most_common_mentions.keys():
        text = text.replace(f'@{mention}', mention)
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.NUMBER, p.OPT.EMOJI, p.OPT.SMILEY)
    result = p.clean(text)
    return result

data["cleanedTweet"] = data["text"].progress_map(remove_unnecessary)

100%|██████████| 314300/314300 [00:41<00:00, 7548.59it/s]


In [15]:
prev_size = len(data)
dupe_mask = data['cleanedTweet'].duplicated(keep=False)
data = data[~dupe_mask]
change = prev_size - len(data)
print(f"Dropped {change} duplicated rows")
print(f"{len(data)} tweets remain in the dataset")

Dropped 43066 duplicated rows
271234 tweets remain in the dataset


In [16]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [17]:
candidate_labels = ['USA started the war between Russia and Ukraine',
                    'POTUS started the war between Russia and Ukraine',
                    'Joe Biden started the war between Russia and Ukraine',
                    'CIA started the war between Russia and Ukraine',
                    'USA influenced the war between Russia and Ukraine',
                    'POTUS influenced the war between Russia and Ukraine',
                    'Joe Biden influenced the war between Russia and Ukraine',
                    'CIA influenced the war between Russia and Ukraine',
                    ]

In [24]:
print("Starting analysis")
tweets_blaming_america = {}
for tweet in tqdm(data.sample(50, random_state=42)['cleanedTweet']):
    result = classifier(tweet, candidate_labels, multi_label=True)
    if any(val > 0.75 for val in result['scores']):
        print(tweet.replace("\n", " "))
        for label, score in zip(result['labels'], result['scores']):
            print(f"\t{label}: {score}")

Starting analysis


100%|██████████| 50/50 [00:59<00:00,  1.18s/it]
