In [57]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import preprocessor as p
import re
import json
import numpy as np
import warnings
from transformers import pipeline
import torch
from torch.utils.data import DataLoader, Dataset
from detoxify import Detoxify

pd.set_option('display.max_colwidth', None)
tqdm.pandas()
warnings.filterwarnings("ignore", category=UserWarning)

In [16]:
all_files = []
for dirname, _, filenames in os.walk('data/'):
    for filename in filenames:
        full_path=os.path.join(dirname, filename)
        all_files.append(full_path)

In [17]:
tmp_df_list = []
for file in all_files:
    print(f"Reading in {file}")
    tmp_df = pd.read_csv(file)
    print(f"\t{len(tmp_df)} entries")
    tmp_df_list.append(tmp_df)

print("Concatenating the DataFrames")
data = pd.concat(tmp_df_list, axis=0)
print("Concatenation complete!")


Reading in data/Russian_border_Ukraine.csv
	53040 entries
Reading in data/Ukraine_nato.csv
	245232 entries
Reading in data/Russia_invade.csv
	170835 entries
Reading in data/StandWithUkraine.csv
	148145 entries
Reading in data/Ukraine_troops.csv
	172714 entries
Reading in data/Russian_troops.csv
	128405 entries
Reading in data/Ukraine_war.csv
	231624 entries
Reading in data/Ukraine_border.csv
	166610 entries
Concatenating the DataFrames
Concatenation complete!


In [18]:
print(data.info(max_cols=29))


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1316605 entries, 0 to 166609
Data columns (total 29 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   _type             1316605 non-null  object 
 1   url               1316605 non-null  object 
 2   date              1316605 non-null  object 
 3   content           1316605 non-null  object 
 4   renderedContent   1316605 non-null  object 
 5   id                1316605 non-null  int64  
 6   user              1316605 non-null  object 
 7   replyCount        1316605 non-null  int64  
 8   retweetCount      1316605 non-null  int64  
 9   likeCount         1316605 non-null  int64  
 10  quoteCount        1316605 non-null  int64  
 11  conversationId    1316605 non-null  int64  
 12  lang              1316605 non-null  object 
 13  source            1316605 non-null  object 
 14  sourceUrl         1316605 non-null  object 
 15  sourceLabel       1316605 non-null  object 
 16  o

In [19]:
data["date"] = pd.to_datetime(data["date"])

earliest_tweet = data["date"].min()
latest_tweet = data["date"].max()

print(f"The earliest tweet was at {earliest_tweet}, and the latest was at {latest_tweet}")

The earliest tweet was at 2021-12-31 00:00:30+00:00, and the latest was at 2022-03-05 23:59:59+00:00


In [20]:
print(f"There are {data['lang'].nunique()} unique languages in this DataFrame.")
print(data["lang"].unique())
print(f"{round(data.loc[data['lang']=='en'].shape[0]/data.shape[0]*100, 2)}% of the tweets are in English.")

There are 61 unique languages in this DataFrame.
['en' 'es' 'und' 'hi' 'in' 'de' 'ja' 'pl' 'et' 'zh' 'ro' 'nl' 'tr' 'pa'
 'da' 'pt' 'tl' 'eu' 'fr' 'no' 'cs' 'ru' 'fi' 'it' 'sv' 'ca' 'kn' 'sl'
 'ta' 'ar' 'ko' 'ur' 'bn' 'gu' 'sr' 'th' 'lt' 'uk' 'el' 'cy' 'vi' 'lv'
 'hu' 'ht' 'km' 'fa' 'ml' 'am' 'ne' 'my' 'mr' 'te' 'or' 'ps' 'ka' 'iw'
 'bg' 'dv' 'is' 'sd' 'si']
91.67% of the tweets are in English.


In [21]:
prev_size = len(data)
# drop rows with missing values in the 'renderedContent' column
data = data.dropna(subset=['renderedContent'])
# drop all rows with non english text
data = data[data['lang'] == 'en'].drop(columns=['lang'])
change = prev_size - len(data)
print(f"Dropped {change} rows")
print(f"{len(data)} entries remain")

Dropped 109620 rows
1206985 entries remain


In [22]:
prev_size = len(data)
dupe_mask = data['renderedContent'].duplicated(keep=False)
data = data[~dupe_mask]
change = prev_size - len(data)
print(f"Dropped {change} duplicated rows")
print(f"{len(data)} tweets remain in the dataset")

Dropped 406566 duplicated rows
800419 tweets remain in the dataset


In [23]:
# Define a regular expression pattern to match hashtags
pattern = r'#(\w+)'

# Extract hashtags from the renderedContent column and concatenate them into a single list
hashtags = []
for text in data['renderedContent']:
    hashtags += re.findall(pattern, text)

# Count the frequency of each hashtag
hashtag_counts = pd.Series(hashtags).value_counts()

# Print the top 10 most common hashtags
print("Ten most common hashtags in the text:")
print(hashtag_counts.head(25))

most_common_hashtag = hashtag_counts.iloc[:25]

Ten most common hashtags in the text:
Ukraine                70580
StandWithUkraine       57572
Russia                 33529
NATO                   17438
Putin                  11091
ukraine                 5500
Russian                 4944
UkraineCrisis           4067
UkraineWar              4060
UkraineRussiaWar        3815
USA                     3421
Biden                   3308
US                      3218
RussiaUkraine           3175
russia                  3055
UkraineConflict         2956
StopPutin               2678
UkraineInvasion         2633
standwithukraine        2628
Europe                  2580
war                     2554
EU                      2380
Kyiv                    2310
Belarus                 2280
RussiaUkraineCrisis     2084
dtype: int64


In [24]:
# Define a regular expression pattern to match hashtags
pattern = r'@(\w+)'

# Extract hashtags from the renderedContent column and concatenate them into a single list
mentions = []
for text in data['renderedContent']:
    mentions += re.findall(pattern, text)

# Count the frequency of each mention
mention_counts = pd.Series(mentions).value_counts()

# Print the top 10 most common mentions
print("Ten most common mentions in the text:")
print(mention_counts.head(10))
most_common_mentions = mention_counts.iloc[:10]

Ten most common mentions in the text:
NATO               14630
POTUS              14234
ZelenskyyUa         5897
McFaul              4902
KremlinRussia_E     4831
Ukraine             4776
Reuters             4475
SecBlinken          4359
UkrWarReport        4285
mfa_russia          4270
dtype: int64


In [25]:
def remove_unnecessary(text):
    text = text.replace("\n", " ")
    text = text.replace("&amp;", " ")
    for hashtag in most_common_hashtag.keys():
        text = text.replace(f"#{hashtag}", " ".join(re.findall('[A-Z][^A-Z]*', hashtag)))
    for mention in most_common_mentions.keys():
        text = text.replace(f'@{mention}', mention)
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.NUMBER, p.OPT.EMOJI, p.OPT.SMILEY)
    result = p.clean(text)
    return result

data["cleanedTweet"] = data["renderedContent"].progress_map(remove_unnecessary)

100%|██████████| 800419/800419 [01:54<00:00, 6993.19it/s]


In [26]:
prev_size = len(data)
dupe_mask = data['cleanedTweet'].duplicated(keep=False)
data = data[~dupe_mask]
change = prev_size - len(data)
print(f"Dropped {change} duplicated rows")
print(f"{len(data)} tweets remain in the dataset")

Dropped 54478 duplicated rows
745941 tweets remain in the dataset


In [28]:
reduced_data = data[['date', 'cleanedTweet']]
reduced_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 745941 entries, 18 to 166609
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype              
---  ------        --------------   -----              
 0   date          745941 non-null  datetime64[ns, UTC]
 1   cleanedTweet  745941 non-null  object             
dtypes: datetime64[ns, UTC](1), object(1)
memory usage: 17.1+ MB


In [29]:
reduced_data.to_csv("/vol/bitbucket/es1519/detecting-hidden-purpose-in-nlp-models/detoxify/war_data/cleaned_data.csv", index=False)