In [1]:
import pandas as pd

from glob import glob

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from tqdm.auto import tqdm
tqdm.pandas()

## Reading the data

In [2]:
frames = []

for file in tqdm(glob('UkraineTweets/*')):
    print(f"Reading {file}")
    df = pd.read_csv(file, usecols=['tweetid', 'text', 'hashtags', 'language'])  # Filtering columns
    df = df.loc[df['language'] == 'en'].reset_index(drop=True)  # Filtering language
    df['date'] = re.findall(r"[A-Z]{3}[0-9]{2}",file)[0]
    frames.append(df)

combined_df = pd.concat(frames, axis=0, ignore_index=True)

  0%|          | 0/13 [00:00<?, ?it/s]

Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR03.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR08.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_FEB27.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR06.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_FEB28_part1.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_FEB28_part2.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR02.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR10.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR09.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR01.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR07.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR04.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR05.csv


In [3]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3735462 entries, 0 to 3735461
Data columns (total 5 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   tweetid   int64 
 1   text      object
 2   hashtags  object
 3   language  object
 4   date      object
dtypes: int64(1), object(4)
memory usage: 142.5+ MB


In [4]:
combined_df['hashtags'] = combined_df.hashtags.progress_map(lambda x: [i['text'] for i in eval(x)])  # Keeping only hashtags

  0%|          | 0/3735462 [00:00<?, ?it/s]

In [5]:
combined_df.head()

Unnamed: 0,tweetid,text,hashtags,language,date
0,1499174584720969730,Map situation in #Ukraine after the seventh da...,"[Ukraine, RussiaUkraineConflict]",en,MAR03
1,1499174584976826368,#Ukraine: Let's just say it's not just the TB-...,[Ukraine],en,MAR03
2,1499174585073242116,⚡️The SWIFT company confirmed that it will dis...,"[EU, Russian]",en,MAR03
3,1499174585987600384,#Ukraine: Ukrainian forces recovered a Eniks E...,[Ukraine],en,MAR03
4,1499174586159665155,Volunteers needed for a rapid-response #DH #Di...,"[DH, DigitalHumanities, CulturalHeritage]",en,MAR03


## Preprocessing the data 

Creating stop word list

In [6]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/kavish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/kavish/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kavish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
stop_words = stopwords.words('english')
stop_words += list(string.punctuation) # Adding punctuations to stop words
stop_words += ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']  # Adding integers to stop words

Removing URLs

In [8]:
combined_df['text'] = combined_df['text'].str.replace(r"http\S+", "")

  combined_df['text'] = combined_df['text'].str.replace(r"http\S+", "")


Tokenizing, removing stop words and converting to lowercase

In [9]:
def tokenize_lowercase(text):
    tokens = word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    return stopwords_removed
  
combined_df['text'] = combined_df['text'].progress_apply(tokenize_lowercase)

  0%|          | 0/3735462 [00:00<?, ?it/s]

Removing non alphabetical tokens

In [10]:
def remove_nums(text_object):
    no_nums = list(filter(lambda x: x.isalpha(), text_object))
    return no_nums

combined_df['text'] = combined_df['text'].progress_apply(remove_nums)

  0%|          | 0/3735462 [00:00<?, ?it/s]

Stemming the text

In [12]:
stemmer = PorterStemmer()

def stemmatize_text(df_text):
    stemmed = [stemmer.stem(word) for word in df_text]
    return stemmed

combined_df['text'] = combined_df['text'].progress_apply(stemmatize_text)

  0%|          | 0/3735462 [00:00<?, ?it/s]

## Saving the cleaned data

In [13]:
combined_df.to_csv('Cleaned Tweets - Topic Modelling.csv', index=False)