In [9]:
%pip install kaggle


Note: you may need to restart the kernel to use updated packages.


In [10]:
%pip install pyspark


Note: you may need to restart the kernel to use updated packages.


In [11]:
%pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [12]:
%pip install pandas


Note: you may need to restart the kernel to use updated packages.


In [13]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json


cp: cannot stat 'kaggle.json': No such file or directory


In [14]:
!kaggle datasets download -d bwandowando/ukraine-russian-crisis-twitter-dataset-1-2-m-rows


ukraine-russian-crisis-twitter-dataset-1-2-m-rows.zip: Skipping, found more recently modified local copy (use --force to force download)


# Preprocessing
We're going to preprocess the original dataset from Kaggle to reduce its size and only work on meaningful data for our analysis: 
1. unzip the provided file to work on individual CSV files;
2. filter only English written tweets to build a coherent language base;
3. remove useless columns such as the account description or the number of retweets;
4. possibly remove some stop-words.

In [15]:
import time

start_time = time.time()


## Code imports and globals

In [16]:
import zipfile
import os
import pyspark
import regex
import shutil
import gzip
import nltk
from nltk.tokenize import word_tokenize
import string
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.types import Row
from pyspark.sql.functions import concat_ws
import multiprocessing
import pandas
from typing import List, Dict
from datetime import datetime
from nltk.corpus import stopwords
import json


TOP_HASHTAGS_INDEX = dict()
TOP_HASHTAGS_REVERSE_INDEX = dict()


## Configuration

In [17]:
DO_PREPROCESS = True
FIND_MOST_COMMON = True
TOP_HASHTAGS_NUMBER = 100

TOP_HASHTAGS_INDEX_FILENAME = "top_hashtags_index.json"
TOP_HASHTAGS_REVERSE_INDEX_FILENAME = "top_hashtags_reverse_index.json"

KAGGLE_DATASET = "ukraine-russian-crisis-twitter-dataset-1-2-m-rows.zip"
KAGGLE_DATASET_DIRECTORY = os.path.join(
    "out", os.path.splitext(KAGGLE_DATASET)[0])
WORKERS_CORES = multiprocessing.cpu_count()
FILTER_LANGUAGE = "en"


## Stopwords list retrieval

In [18]:
def update_nltk() -> None:
    nltk.download('stopwords')
    nltk.download('punkt')


## Dataset extraction
Unzip the kaggle dataset. If an extracted version of that dataset is already present, only extract new files. After that, extract .gzip files into .csv

In [19]:
def dataset_extraction(archive: str, output_directory: str) -> List[str]:
    new_files = list()
    with zipfile.ZipFile(archive, "r") as zip_ref:

        return [os.path.join(output_directory, csv_name) for csv_name, _ in [os.path.splitext(g_name) for g_name in zip_ref.namelist()]]

        if not os.path.isdir(output_directory):
            zip_ref.extractall(output_directory)
            #new_files = zip_ref.namelist()
        # else:
        for gzip_name in zip_ref.namelist():
            csv_name, extension = os.path.splitext(gzip_name)
            if os.path.isfile(os.path.join(output_directory, csv_name)):
                continue
            if not os.path.isfile(os.path.join(output_directory, gzip_name)):
                zip_ref.extract(gzip_name, path=output_directory)
            csv_path = os.path.join(output_directory, csv_name)
            with gzip.open(os.path.join(output_directory, gzip_name), 'r') as gzip_file, open(csv_path, 'wb') as csv_file:
                shutil.copyfileobj(gzip_file, csv_file)
            new_files.append(csv_path)
    return new_files


## Spark initialization

In [20]:
def init_spark() -> pyspark.sql.SparkSession:
    print(f"Available CPU cores/workers: {WORKERS_CORES}")
    print("Initializing spark...", end=' ', flush=True)
    spark = (
        pyspark.sql.SparkSession.builder
        .master(f"local[{WORKERS_CORES}]")
        .appName("Sparkiodi")
        .getOrCreate()
    )
    spark.sparkContext.setLogLevel("OFF")
    print("Spark initialized")
    return spark


## Dataset reduction
Read all the CSVs in the specified path. "path" can be a list of files, a folder containig multiple files, or a regex matching multiple files.

In [21]:
def read_dataframe(path: List[str], spark, header: bool = True, language: str = None) -> pyspark.sql.DataFrame:
    if header:
        starting_df = (
            spark
            .read
            .option("header", True)
            .option("multiLine", True)
            .csv(path)
        )
    else:
        starting_df = (
            spark
            .read
            .option("header", False)
            .option("multiLine", True)
            .csv(path)
        )

    if language:
        broadcast_language = spark.sparkContext.broadcast(FILTER_LANGUAGE)
        starting_df = starting_df.where(
            starting_df.language == broadcast_language.value)

    return starting_df.select("text")


### Entities removal and hashtags extraction

#### Regex
Match hashtags, URLs, user-mentions, and punctuation, to eventually remove them from the tweet's text

In [22]:
hashtag_regex_str = r"(?:\#+)([\w_]+)"  # hashtags
regex_str = [
    (r'(?:@[\w_]+)', ''),  # @-mentions
    # URLs
    (r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', ''),
    (r'[^\w\s]', ' '),  # punctuation
    (r'\s+', ' ')  # whitespaces
]

hashtag_regex = regex.compile(hashtag_regex_str)
regex = [(hashtag_regex, '')] + [(regex.compile(compiled[0]), compiled[1])
                                 for compiled in regex_str]   # Keep hashtag_regex as the first applied regex


#### Text cleaning
Use the the regexes to filter out from the text anything that's not a meaningful word

In [23]:
def clean_text(text: str) -> List[str]:
    text = text.lower()

    for reg in regex:
        text = reg[0].sub(reg[1], text)

    text_list = set([
        word.rstrip() for word in word_tokenize(text)
        if word not in string.punctuation
    ])

    return list(text_list)


In [24]:
def padding_hashtags(hashtags: List[str], indexes: Dict[str, int], compress: bool = False):
    """ Utility function to create an indicator vector for the 'hashtags' list over the 'indexes' parameter

    Args:
        hashtags (List[str]): list of hashtags to be mapped into an indicator vector
        indexes (Dict[str, int]): mapping from an hashtag to its index position
        compress (bool): decides wether to store the whole indicator vector (False) or only the non-zero indices (True)

    Returns:
        List[int]: a list of integer with the most common hashtags for the initial list
    """
    if compress:
        pads = list()
        for tag in hashtags:
            if tag in indexes:
                pads.append(indexes[tag])
        pads.sort()
    else:
        pads = [0 for _ in range(len(indexes))]
        for tag in hashtags:
            if tag in indexes:
                pads[indexes[tag]] = 1
    return pads


def update_top_hashtags(top_hashtags: List[str]) -> None:
    """ Utility function to update the global list of most common hashtags

    Args:
        top_hashtags (List[str]): list of most common hashtagsices (True)
    """
    global TOP_HASHTAGS_INDEX, TOP_HASHTAGS_REVERSE_INDEX
    zipped_hashtags = list(zip(top_hashtags, range(len(top_hashtags))))
    TOP_HASHTAGS_INDEX = {key: value for key, value in zipped_hashtags}
    TOP_HASHTAGS_REVERSE_INDEX = {value: key for key, value in zipped_hashtags}


def clean_dataframe(df: pyspark.sql.DataFrame, spark: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:

    # StopWordsRemover is a pyspark utility to remove words from a dataset's column
    remover = StopWordsRemover(stopWords=stopwords.words('english'))
    remover.setInputCol("tweet")
    remover.setOutputCol("filtered_tweet")

    # From every row extract the "cleaned" text and a list of its hashtags
    tweets_hashtags_rdd = (
        df.rdd
        .map(lambda row: (clean_text(row.text), [ht.lower() for ht in hashtag_regex.findall(row.text)]))
    )

    if FIND_MOST_COMMON:
        top_hashtags = (
            tweets_hashtags_rdd
            .flatMap(lambda row: row[1])
            .map(lambda row: (row, 1))
            .reduceByKey(lambda x, y: x + y)
            .map(lambda row: (row[1], row[0]))
            .sortByKey(ascending=False)
            .map(lambda row: row[1])
            .take(TOP_HASHTAGS_NUMBER)
        )
        update_top_hashtags(top_hashtags)

    broadcast_top_hashtags_index = spark.sparkContext.broadcast(
        TOP_HASHTAGS_INDEX)

    # Map the second column (containing the hashtags) into its indicator vector
    df = (
        tweets_hashtags_rdd
        .map(lambda row: (row[0], padding_hashtags(row[1], broadcast_top_hashtags_index.value, True)))
        .toDF(["tweet", "hashtags"])
    )
    # Remove the stop words and concatenate each column into a whitespace separated list
    df = (
        remover
        .transform(df)
        .select("filtered_tweet", "hashtags")
        .withColumn("filtered_tweet", concat_ws(" ", "filtered_tweet"))
        .withColumn("hashtags", concat_ws(" ", "hashtags"))
    )
    # Filter out eventual tweets without text
    df = df.where((df.filtered_tweet != ""))

    return df


## Preprocessed dataset storing

In [25]:
def write_preprocessed(df: pyspark.sql.DataFrame, output_directory: str) -> str:
    print("Writing top hashtags...")
    with open(TOP_HASHTAGS_INDEX_FILENAME, "w") as fp:
        json.dump(TOP_HASHTAGS_INDEX, fp)
    with open(TOP_HASHTAGS_REVERSE_INDEX_FILENAME, "w") as fp:
        json.dump(TOP_HASHTAGS_REVERSE_INDEX, fp)
    print("Writing all the CSVs...")
    (df
        # .coalesce(WORKERS_CORES)
        .write
        .mode("overwrite")
        .option("header", False)
        .csv(output_directory)
     )

    # spark writes one file per task along with its CRC, so we delete all the non-CSV files we don't require
    for dirpath, dirnames, filenames in os.walk(output_directory):
        for filename in filenames:
            name, extension = os.path.splitext(filename)
            if extension == ".csv":
                continue
            filepath = os.path.join(dirpath, filename)
            try:
                os.remove(filepath)
            except:
                pass


In [26]:
def compress(path: str) -> str:
    print("Creating archive directory...")
    if os.path.isfile(path):
        name, ext = os.path.splitext(path)
        output_archive = f"{name}.zip"
        with zipfile.ZipFile(output_archive, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            zf.write(path, os.path.basename(path))
    else:
        output_archive = shutil.make_archive(path, 'zip', path)
    return output_archive


In [27]:
def create_single_csv(folders: List[str], overwrite: bool = False) -> str:
    single_filename = 'out/dataset.csv'
    filemode = 'wb' if overwrite else 'ab'
    with open(single_filename, filemode) as output_file:
        for directory in folders:
            for dirpath, dirnames, filenames in os.walk(directory):
                for filename in filenames:
                    name, extension = os.path.splitext(filename)
                    if extension != ".csv":
                        continue
                    filepath = os.path.join(dirpath, filename)
                    with open(filepath, 'rb') as csv:
                        shutil.copyfileobj(csv, output_file)
    return single_filename


## Start preprocessing

In [28]:
def preprocess() -> None:
    if not FIND_MOST_COMMON:
        if not os.path.isfile(TOP_HASHTAGS_INDEX_FILENAME):
            print(f"The flag \"FIND_MOST_COMMON\" is not set but no file at \"{TOP_HASHTAGS_INDEX_FILENAME}\" could be found.")
            print("Please either provide a file with the most common hashtags to be used or set \"FIND_MOST_COMMON\" to \"True\" to automatically compute them (note that this may take considerable time")
            return
        else:
            global TOP_HASHTAGS_INDEX, TOP_HASHTAGS_REVERSE_INDEX
            with open(TOP_HASHTAGS_INDEX_FILENAME, "r") as input_file:
                TOP_HASHTAGS_INDEX = json.load(input_file)
                TOP_HASHTAGS_REVERSE_INDEX = {TOP_HASHTAGS_INDEX[item]: item for item in TOP_HASHTAGS_INDEX}
                
    new_files = dataset_extraction(KAGGLE_DATASET, KAGGLE_DATASET_DIRECTORY)
    if not len(new_files):
        print("No new files to process")
        return

    update_nltk()
    if not ('spark' in locals() or 'spark' in globals()):
        spark = init_spark()
    df = read_dataframe(new_files, spark, language=FILTER_LANGUAGE)
    df = clean_dataframe(df, spark)
    output_directory = f"out/preprocessed_{datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}"
    write_preprocessed(df, output_directory)
    csv = create_single_csv([output_directory])
    compress(csv)


if DO_PREPROCESS:
    preprocess()


[nltk_data] Downloading package stopwords to /home/lion/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Available CPU cores/workers: 4
Initializing spark... 

[nltk_data] Downloading package punkt to /home/lion/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


22/07/05 16:31:19 WARN Utils: Your hostname, elros resolves to a loopback address: 127.0.1.1; using 192.168.1.29 instead (on interface wlan0)
22/07/05 16:31:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/07/05 16:31:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Spark initialized


                                                                                

Writing top hashtags...
Writing all the CSVs...


                                                                                

Creating archive directory...


In [29]:
end_time = time.time()
print(f"Start time: {start_time}")
print(f"End time: {end_time}")
print(
    f"Elapsed time: {time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))}")


Start time: 1657030413.5441806
End time: 1657041867.3013732
Elapsed time: 03:10:53


In [30]:
!head out/dataset.csv


surgery cardiology practical russian health viktor scientific center marina died cardiac lyashko region kalabina shelling reported medical anesthesiologist pediatric minister troops,0 10
supplies defense city points thought mps fled helping fight together fun team expect resistance patrol day town spent stay organize people checking part,0 10
lab international could verified crimes indiscriminate imagery law amount evidence satellite analyzed violations photos including attacks crisis amnesty war digital videos,0
tuesday financial invasion front amid sanctions caused west residents queues stood imposed long uncertainty,0 1 62
10 bulletproof antiwar ideas,32 73
snoring next bunker ok sleeping seriously bloke underground,6 10
better want hundreds choice make wise made die already,0 1 3
invasion ukraine worries connected world reputational biggest russian brands financial,0
flights ukrainian hundreds following invasion archipelago closed airspace look indian stranded ocean civilian touris