In [1]:
import pandas as pd
import numpy as np

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

from glob import glob

import re

from textblob import TextBlob

from tqdm.auto import tqdm
tqdm.pandas()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Reading the data

In [2]:
frames = []

for file in tqdm(glob('UkraineTweets/*')):
    print(f"Reading {file}")
    df = pd.read_csv(file, usecols=['tweetid', 'text', 'hashtags', 'language'])  # Filtering columns
    df = df.loc[df['language'] == 'en'].reset_index(drop=True)  # Filtering language
    df['date'] = re.findall(r"[A-Z]{3}[0-9]{2}",file)[0]
    frames.append(df)

combined_df = pd.concat(frames, axis=0, ignore_index=True)

  0%|          | 0/13 [00:00<?, ?it/s]

Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR03.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR08.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_FEB27.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR06.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_FEB28_part1.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_FEB28_part2.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR02.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR10.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR09.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR01.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR07.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR04.csv
Reading UkraineTweets/UkraineCombinedTweetsDeduped_MAR05.csv


In [3]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3735462 entries, 0 to 3735461
Data columns (total 5 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   tweetid   int64 
 1   text      object
 2   hashtags  object
 3   language  object
 4   date      object
dtypes: int64(1), object(4)
memory usage: 142.5+ MB


In [4]:
combined_df['hashtags'] = combined_df.hashtags.parallel_map(lambda x: [i['text'] for i in eval(x)])  # Keeping only hashtags

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=466933), Label(value='0 / 466933')…

In [5]:
combined_df.head()

Unnamed: 0,tweetid,text,hashtags,language,date
0,1499174584720969730,Map situation in #Ukraine after the seventh da...,"[Ukraine, RussiaUkraineConflict]",en,MAR03
1,1499174584976826368,#Ukraine: Let's just say it's not just the TB-...,[Ukraine],en,MAR03
2,1499174585073242116,⚡️The SWIFT company confirmed that it will dis...,"[EU, Russian]",en,MAR03
3,1499174585987600384,#Ukraine: Ukrainian forces recovered a Eniks E...,[Ukraine],en,MAR03
4,1499174586159665155,Volunteers needed for a rapid-response #DH #Di...,"[DH, DigitalHumanities, CulturalHeritage]",en,MAR03


## Preprocessing the data 

In [6]:
def preprocess(text):
    new_text = []
 
 
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

combined_df['text'] = combined_df['text'].progress_apply(preprocess)

  0%|          | 0/3735462 [00:00<?, ?it/s]

## Sentiment Analysis

In [7]:
def get_sentiment(tweet):
    sentiment = TextBlob(tweet).sentiment
    return sentiment.polarity, sentiment.subjectivity

combined_df['sentiment'] = combined_df['text'].parallel_apply(get_sentiment)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=466933), Label(value='0 / 466933')…

In [8]:
combined_df['polarity'] = combined_df['sentiment'].progress_apply(lambda x:x[0])
combined_df['subjectivity'] = combined_df['sentiment'].progress_apply(lambda x:x[0])

  0%|          | 0/3735462 [00:00<?, ?it/s]

  0%|          | 0/3735462 [00:00<?, ?it/s]

In [12]:
combined_df.drop('sentiment', axis=1, inplace=True)

In [15]:
combined_df.to_csv("Tweets - Sentiment Analysis (BOW).csv", index=False)