# Auxiliary Functions

In this section the auxiliary functions used in this project are implemented.

For this part, two Python helper modules were also implemented under the jupyter/modules folder:

    - TextProcessor: Which has the methods to process the text using the approaches used in this project.
    - JsonHelper: It was used to convert dictionaries into JSON format.
    
The reason to separate these modules from the Jupyter notebooks was to keep the code organised and to follow the best programming practices with regards to reuse and code modularization.

In [67]:
# adding modules folder to the path
import sys
sys.path.append('./modules')

# importing constants and modules created for this project
from constants import *
from text_processor import *

# importing modules specific to this notebook
import logging
import pandas as pd
#import altair as alt
#import seaborn as sns
import concurrent.futures
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from dotenv import dotenv_values
from nltk.probability import FreqDist
from tqdm import tqdm

# use a named logger ('ca2-jupyter') to help debugging this notebook
logger = logging.getLogger('ca2-jupyter')

# set logger level: records below ERROR severity are ignored by this logger
logger.setLevel(logging.ERROR)

def get_freq(df, colname):
    '''
    Auxiliary function to calculate the frequency distribution of the words in a column.

    Every value of the column is tokenized with TextProcessor.get_tokens and
    all the resulting tokens are counted together.

    Parameters:
        df (DataFrame): Data frame to be processed.
        colname  (str): Column name to get its frequency calculated.
    Returns:
        fdist (FreqDist): Frequency distribution of the tokens.
    '''
    tokens_lst = []
    # iterate over the column values directly instead of using df.loc[i]:
    # label-based lookups with 0..n-1 only work on a default index and break
    # (KeyError / duplicated rows) after filtering, sorting or concatenating
    for text in df[colname]:
        tokens_lst.extend(TextProcessor.get_tokens(text))

    fdist = FreqDist(tokens_lst)

    return fdist

def process_tweets(tweets_list):
    """
    Auxiliary method to process the tweets from the input list.

    It processes the tweets, extracting their sentiment during the process.

    It generates a table with the following columns:

        'tweet_raw', 'sent_raw', 'tweet_str', 'sent_str', 'tweet_clr', 'sent_clr',
        'tweet_st', 'sent_st', 'tweet_lm', 'sent_lm'

    Each 'tweet_*' column holds one version of the tweet text and the matching
    'sent_*' column holds the value returned by TextProcessor.get_sentiment
    for that version.

    Parameters:
        tweets_list (lst): List (or any iterable) of tweets to be processed.
    Returns:
        tweets_df (DataFrame): Data frame with the processed data, one row per tweet.
    """
    columns = ['tweet_raw', 'sent_raw', 'tweet_str', 'sent_str', 'tweet_clr', 'sent_clr',
               'tweet_st', 'sent_st', 'tweet_lm', 'sent_lm']

    proc_tweets = []
    # iterate over the values themselves: positional indexing (tweets_list[i])
    # fails on inputs such as a pandas Series with a non-default index
    for tweet_raw in tweets_list:
        # raw tweet
        sentiment_raw = TextProcessor.get_sentiment(tweet_raw)

        # stripped tweet
        tweet_str = TextProcessor.clean_text(tweet_raw)
        sentiment_str = TextProcessor.get_sentiment(tweet_str)

        # cleaned tweet
        tweet_clr = TextProcessor.process_text(tweet_raw)
        sentiment_clr = TextProcessor.get_sentiment(tweet_clr)

        # stemmed tweet
        tweet_st = TextProcessor.process_text(tweet_raw, use_stemmer=True)
        sentiment_st = TextProcessor.get_sentiment(tweet_st)

        # lemmatized tweet
        tweet_lm = TextProcessor.process_text(tweet_raw, use_lemmatizer=True)
        sentiment_lm = TextProcessor.get_sentiment(tweet_lm)

        proc_tweets.append((tweet_raw, sentiment_raw, tweet_str, sentiment_str, tweet_clr, sentiment_clr,
                            tweet_st, sentiment_st, tweet_lm, sentiment_lm))

    tweets_df = pd.DataFrame(proc_tweets, columns=columns)

    return tweets_df


def process_tweets_chunk(chunk):
    """
    Auxiliary method to process one chunk of tweets.

    For every tweet in the chunk it builds the raw, stripped, cleaned, stemmed
    and lemmatized versions of the text (through TextProcessor) and extracts
    the sentiment of each version.

    Parameters:
        chunk (lst): Iterable with the tweets to be processed.
    Returns:
        rows (lst): One tuple per tweet, laid out as
                    (tweet_raw, sent_raw, tweet_str, sent_str, tweet_clr, sent_clr,
                     tweet_st, sent_st, tweet_lm, sent_lm).
    """
    sentiment_of = TextProcessor.get_sentiment

    rows = []
    for raw_text in chunk:
        # raw tweet
        row = [raw_text, sentiment_of(raw_text)]

        # stripped tweet
        stripped = TextProcessor.clean_text(raw_text)
        row += [stripped, sentiment_of(stripped)]

        # cleaned tweet
        cleaned = TextProcessor.process_text(raw_text)
        row += [cleaned, sentiment_of(cleaned)]

        # stemmed tweet
        stemmed = TextProcessor.process_text(raw_text, use_stemmer=True)
        row += [stemmed, sentiment_of(stemmed)]

        # lemmatized tweet
        lemmatized = TextProcessor.process_text(raw_text, use_lemmatizer=True)
        row += [lemmatized, sentiment_of(lemmatized)]

        rows.append(tuple(row))

    return rows

def process_tweets_async(tweets_list, chunk_size=1000):
    """
    Auxiliary method to process the tweets from the input list using a thread pool.

    The input is split into chunks of at most `chunk_size` tweets, each chunk
    is submitted to process_tweets_chunk on a ThreadPoolExecutor and a tqdm
    progress bar is advanced as chunks finish.

    The rows of the returned data frame follow the order of `tweets_list`,
    regardless of the order in which the chunks finish, so the result can be
    aligned row by row with the source data.

    Parameters:
        tweets_list (lst): Sliceable sequence of tweets to be processed.
        chunk_size  (int): Maximum number of tweets per submitted chunk.
    Returns:
        tweets_df (DataFrame): Data frame with the processed data and the columns
            'tweet_raw', 'sent_raw', 'tweet_str', 'sent_str', 'tweet_clr', 'sent_clr',
            'tweet_st', 'sent_st', 'tweet_lm', 'sent_lm'.
    """
    columns = ['tweet_raw', 'sent_raw', 'tweet_str', 'sent_str', 'tweet_clr', 'sent_clr',
               'tweet_st', 'sent_st', 'tweet_lm', 'sent_lm']

    num_chunks = (len(tweets_list) + chunk_size - 1) // chunk_size

    # one slot per chunk, filled by submission position
    chunk_results = [None] * max(num_chunks, 0)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # remember which position every future belongs to
        future_to_pos = {}
        for i in range(num_chunks):
            chunk = tweets_list[i*chunk_size:(i+1)*chunk_size]
            future = executor.submit(process_tweets_chunk, chunk)
            future_to_pos[future] = i

        # as_completed yields futures in completion order (good for the
        # progress bar); storing by position restores the input order
        for future in tqdm(concurrent.futures.as_completed(future_to_pos), total=num_chunks, desc='Processing'):
            chunk_results[future_to_pos[future]] = future.result()

    proc_tweets = [row for rows in chunk_results for row in rows]

    tweets_df = pd.DataFrame(proc_tweets, columns=columns)

    return tweets_df

'''
def process_tweet(tweet):
    tweet_raw = tweet
    sentiment_raw = TextProcessor.get_sentiment(tweet_raw)

    tweet_str = TextProcessor.clean_text(tweet_raw)
    sentiment_str = TextProcessor.get_sentiment(tweet_str)

    tweet_clr = TextProcessor.process_text(tweet_raw)
    sentiment_clr = TextProcessor.get_sentiment(tweet_clr)

    tweet_st = TextProcessor.process_text(tweet_raw, use_stemmer=True)
    sentiment_st = TextProcessor.get_sentiment(tweet_st)

    tweet_lm = TextProcessor.process_text(tweet_raw, use_lemmatizer=True)
    sentiment_lm = TextProcessor.get_sentiment(tweet_lm)

    return (tweet_raw, sentiment_raw, tweet_str, sentiment_str, tweet_clr, sentiment_clr, tweet_st, sentiment_st, tweet_lm, sentiment_lm)

def process_tweets_async(tweets_list):
    proc_tweets = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        #results = executor.map(process_tweet, tweets_list)
        results = list(tqdm(executor.map(process_tweet, tweets_list), total=len(tweets_list), desc='Processing'))
        for result in results:
            proc_tweets.append(result)

    tweets_df = pd.DataFrame(proc_tweets, columns=['tweet_raw', 'sent_raw', 'tweet_str', 'sent_str', 'tweet_clr', 'sent_clr', 'tweet_st', 'sent_st', 'tweet_lm', 'sent_lm'])

    return tweets_df
'''

"\ndef process_tweet(tweet):\n    tweet_raw = tweet\n    sentiment_raw = TextProcessor.get_sentiment(tweet_raw)\n\n    tweet_str = TextProcessor.clean_text(tweet_raw)\n    sentiment_str = TextProcessor.get_sentiment(tweet_str)\n\n    tweet_clr = TextProcessor.process_text(tweet_raw)\n    sentiment_clr = TextProcessor.get_sentiment(tweet_clr)\n\n    tweet_st = TextProcessor.process_text(tweet_raw, use_stemmer=True)\n    sentiment_st = TextProcessor.get_sentiment(tweet_st)\n\n    tweet_lm = TextProcessor.process_text(tweet_raw, use_lemmatizer=True)\n    sentiment_lm = TextProcessor.get_sentiment(tweet_lm)\n\n    return (tweet_raw, sentiment_raw, tweet_str, sentiment_str, tweet_clr, sentiment_clr, tweet_st, sentiment_st, tweet_lm, sentiment_lm)\n\ndef process_tweets_async(tweets_list):\n    proc_tweets = []\n    with concurrent.futures.ThreadPoolExecutor() as executor:\n        #results = executor.map(process_tweet, tweets_list)\n        results = list(tqdm(executor.map(process_tweet, twe

# Sentiment Analysis

In this section a sentiment analysis is performed over the tweets dataset, using time-series to forecast the sentiment of tweets over a period of time.

## Data pre-processing

Since this dataset was used for sentiment analysis, the following operations were performed on the tweets text as part of EDA:

1. Removal of special characters, links and images (tweet_str)
2. Application of step 1 + Removal of stop words (tweet_clr)
3. Application of step 2 + Lemmatizer technique (tweet_lm)
4. Application of step 2 + Stemmer technique (tweet_st)
5. Extract the sentiment from each version of the tweets

Each tweet version had its own sentiment calculated, because it was observed that the sentiment algorithm provides different results for each of them.

In [48]:
# getting tweets from the archive
dataset = pd.read_csv(TWEETS_DS_RAW, encoding='utf-8')

# dataset column names
tweets_cols = ['username', 'location', 'tweetid', 'text', 'hashtags', 'language', 'extractedts']

# storing an independent copy of the selected columns for processing;
# without .copy() the assignment below would target a selection of `dataset`
# (chained assignment) and pandas would emit a SettingWithCopyWarning
tweets_df = dataset[tweets_cols].copy()

# converting date column into proper date type
tweets_df['extractedts'] = pd.to_datetime(tweets_df['extractedts'])

In [49]:
# showing the date coverage in this dataset
tweets_df['extractedts'].min(), tweets_df['extractedts'].max()

(Timestamp('2022-04-01 00:01:44.294934'),
 Timestamp('2022-04-02 00:46:57.116538'))

In [50]:
# describing the data
tweets_df.describe(include='all')

Unnamed: 0,username,location,tweetid,text,hashtags,language,extractedts
count,364875,212933,364875.0,364875,364875,364875,364875
unique,166400,42667,,104548,85962,61,364875
top,FuckPutinBot,Ukraine,,⚡The Ukrainian Air Force would like to address...,[],en,2022-04-01 00:44:20.097867
freq,789,3285,,7297,72345,254626,1
first,,,,,,,2022-04-01 00:01:44.294934
last,,,,,,,2022-04-02 00:46:57.116538
mean,,,1.509875e+18,,,,
std,,,98290620000000.0,,,,
min,,,1.509682e+18,,,,
25%,,,1.509795e+18,,,,


In [68]:
# getting the tweets list from the dataframe
tweets_list = tweets_df['text'][:2000].values
len(tweets_list)

2000

In [None]:
# getting the tweets list from the dataframe
tweets_list = tweets_df['text'].values

# processing retrieved data using text processing
tweets_proc = process_tweets_async(tweets_list)

tweets_proc.head()

Processing:  72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                               | 264/365 [39:26<05:15,  3.12s/it]

In [57]:
len(tweets_proc)

1000

In [None]:
# getting the tweets list from the dataframe
tweets_list = tweets_df['text'].values

# processing retrieved data using text processing
tweets_proc = process_tweets(tweets_list)

# concat result with existing dataframe, side by side (axis=1): the processed
# columns are added to each tweet row instead of being stacked below the
# original rows; the index is reset so both frames align position by position
tweets_df = pd.concat([tweets_df.reset_index(drop=True), tweets_proc], axis=1)

# storing the processed data in the datasets/tweets-online folder
tweets_df.to_csv(get_collected_twitter_file_path())

# show the tweets collected
tweets_df.head()

## Tweets Dataset Visualization

In order to visualize this dataset, a frequency graph was prepared to show the most common words.

Frequency graphs and word clouds are common ways to visualize text datasets, so it is possible to have an idea of its main content.

For this purpose, a tokenizer algorithm was used to generate tags from all the tweets after the cleaning stage.

The frequency was the same if Stemmer or Lemmatizer techniques were applied, so the following graph only shows the frequency of the cleaned tweet data.

In [42]:
# display the frequency distribution
fdist = get_freq(tweets_df, 'tweet_clr')
g_plot = fdist.plot(30, cumulative=False, title='Word Frequency of the Tweets Dataset')

# saving graph as an image
fig = g_plot.get_figure()
fig.savefig(join(IMAGES_FOLDER, GRAPH_WORD_FREQ), format='png')

# showing the image
g_plot

KeyError: 'tweet_clr'

In the next step the tweets dataset is pre-processed for the sentiment analysis and time-series.

In [None]:
# converting date column into proper date type
dataset['extractedts'] = pd.to_datetime(dataset['extractedts'])

In [30]:
# showing the date coverage in this dataset
dataset['extractedts'].min(), dataset['extractedts'].max()

('2022-04-01 00:01:44.294934', '2022-04-02 00:46:57.116538')

In [32]:
# describing the data
dataset.describe(include='all')

Unnamed: 0,username,location,tweetid,text,hashtags,language,extractedts
count,364875,212933,364875.0,364875,364875,364875,364875
unique,166400,42667,,104548,85962,61,364875
top,FuckPutinBot,Ukraine,,⚡The Ukrainian Air Force would like to address...,[],en,2022-04-01 00:44:20.097867
freq,789,3285,,7297,72345,254626,1
mean,,,1.509875e+18,,,,
std,,,98290620000000.0,,,,
min,,,1.509682e+18,,,,
25%,,,1.509795e+18,,,,
50%,,,1.50988e+18,,,,
75%,,,1.509958e+18,,,,


In [None]:
import csv
from datetime import datetime

file_path = './datasets/ukraine-dataset.csv'
date_column_index = 17  # Index of the column containing dates (adjust if necessary)

dates = []

# newline='' is required by the csv module so that line breaks embedded in
# quoted fields are parsed as part of the field and not as new records
with open(file_path, "r", encoding="utf-8", newline="") as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip header row if present (no error on an empty file)
    for row in reader:
        # rows with fewer columns than expected (e.g. blank lines) have no date to read
        if len(row) <= date_column_index:
            print(f"Missing date column at line {reader.line_num}")
            continue

        date_str = row[date_column_index]
        try:
            date = datetime.strptime(date_str, "%Y-%m-%d")  # Adjust date format if necessary
        except ValueError:
            print(f"Invalid date format: {date_str}")
        else:
            dates.append(date)

if dates:
    min_date = min(dates)
    max_date = max(dates)
    print("Date Range:")
    print("Min Date:", min_date.date())
    print("Max Date:", max_date.date())
else:
    print("No dates found in the CSV file.")
