# Data cleaner for twitter dataset
Disclaimer: to run this notebook, launch pyspark (command "pyspark --master local[*number of cores*]") from the folder containing the notebook.

In [None]:
from pyspark.sql import SparkSession
from utility_functions import *
import os

In [None]:
# Project root: parent of the current working directory, with Windows
# backslashes normalised to forward slashes
directory = os.path.dirname(os.getcwd()).replace("\\", "/")

# Raw input files (one tweet per line)
path_neg_tweets = f"{directory}/data/twitter-data/train_neg_full.txt"
path_pos_tweets = f"{directory}/data/twitter-data/train_pos_full.txt"

# Intermediate and final outputs
path_tweets = f"{directory}/data/datasets/twitter-dataset-cleaned.json"
path_all_occurrences = f"{directory}/data/sentiment-knowledge/twitter-all-words-with-occurrences.csv"
path_tweets_no_unknown_directory = f"{directory}/data/datasets/twitter-dataset"
path_tweets_no_unknown = f"{directory}/data/datasets/twitter-dataset.json"

In [None]:
# Create the Spark session, or reuse the one that is already running (which is
# the case when the notebook is launched from the `pyspark` shell).
spark = (
    SparkSession.builder
    .config("spark.executor.memory", "32g")
    .config("spark.driver.memory", "32g")
    .config("spark.network.timeout", "1200s")
    .config("spark.executor.memoryOverhead", "12g")
    # The heartbeat interval has to stay well below spark.network.timeout;
    # it used to be equal to it, which Spark rejects/treats as lost executors.
    .config("spark.executor.heartbeatInterval", "120s")
    # The maximum heap (-Xmx) must not be passed through extraJavaOptions;
    # spark.executor.memory above already controls it. Only the initial heap
    # size is kept here.
    .config("spark.executor.extraJavaOptions", "-Xms12g")
    .getOrCreate()
)

In [None]:
def _read_labelled_tweets(path, label):
    """Read the text file at *path*, drop duplicate lines and tag each line with *label*."""
    unique_rows = spark.read.text(path).rdd.distinct()
    return unique_rows.map(lambda row: {"label": label, "text": row.value})


# Negative tweets get label 0, positive tweets get label 1
dataset_neg = _read_labelled_tweets(path_neg_tweets, 0)
dataset_pos = _read_labelled_tweets(path_pos_tweets, 1)

# One RDD holding both classes
dataset = dataset_neg.union(dataset_pos)

# Randomly partition the data into ten equally weighted RDDs
split_rdds = dataset.randomSplit([0.1] * 10)

In [None]:
# Run the tweet-cleaning routine over the text of every record, slice by slice;
# the label is carried over unchanged
rdds = [
    chunk.map(
        lambda record: {
            "label": record["label"],
            "text": cleaning_function_twitter_dataset(record["text"]),
        }
    )
    for chunk in split_rdds
]

In [None]:
import json

# Write the cleaned tweets as JSON Lines (one object per line) so that
# spark.read.json can load them back.
# - json.dumps is used instead of str(item).replace("'", "\""): the latter
#   yields invalid JSON as soon as a tweet contains an apostrophe, a double
#   quote or a backslash.
# - The file is opened in "w" mode so that re-running this cell does not append
#   a second copy of the whole dataset to an existing file.
with open(path_tweets, "w", encoding="utf-8") as f:
    for rdd in rdds:
        # Each slice is collected on the driver and written in one batch
        f.writelines(json.dumps(item) + "\n" for item in rdd.collect())

In [None]:
# Read the cleaned tweets back from disk as a DataFrame
dataset_cleaned = spark.read.json(path_tweets)

# Randomly partition it into ten equally weighted slices
# (despite the name, each slice is a DataFrame; its .rdd is used later on)
split_rdds_cleaned = dataset_cleaned.randomSplit([0.1] * 10)

In [None]:
def _word_count_pairs(obj):
    """Return a (token, 1) pair for every token produced from the record's text."""
    prepared_text = remove_symbols_before_tokenization(obj["text"], True)
    return [(token, 1) for token in tokenize_with_sequences(prepared_text, True)]


# For every slice: tokenize each tweet, then sum the counts per token
words_occurrences_rdds = [
    df.rdd.flatMap(_word_count_pairs).reduceByKey(lambda left, right: left + right)
    for df in split_rdds_cleaned
]

In [None]:
# Chain the per-slice (word, count) RDDs into a single RDD ...
merged_rdd_occurrences = words_occurrences_rdds[0].union(words_occurrences_rdds[1])
for remaining_rdd in words_occurrences_rdds[2:]:
    merged_rdd_occurrences = merged_rdd_occurrences.union(remaining_rdd)

# ... then add up the partial counts of each word and bring the final
# vocabulary with its number of occurrences back to the driver
summed_occurrences = merged_rdd_occurrences.reduceByKey(lambda left, right: left + right)
words_with_occurrences = summed_occurrences.collect()

In [None]:
import wordninja
import re

def replace_numbers_in_list(list_of_strings):
    """Return a new list in which every run of digits is removed from each string."""
    digit_run = re.compile(r'\d+')  # one or more consecutive digits
    return [digit_run.sub("", text) for text in list_of_strings]

def contains_numbers(word):
    """Tell whether *word* contains at least one digit."""
    first_digit = re.search(r'\d', word)
    return first_digit is not None

# Keyword processor that will hold the replacement registered for each word
kp = KeywordProcessor()

# Bookkeeping variables (not updated by the loop below)
count_unknown = 0
count_frequent = 0
frequent_words = []
unknown_words = []

# Walk the vocabulary from the rarest to the most frequent word
for token, occurrences in sorted(words_with_occurrences, key=lambda pair: pair[1]):
    if occurrences == 1:
        # Words seen exactly once are replaced by a blank
        kp.add_keyword(token, " ")
        continue

    if occurrences > 15 or contains_numbers(token):
        # Frequent words, and rare words containing digits, are left untouched.
        # NOTE: an earlier, disabled experiment also split frequent words that
        # contain digits and stripped the digits from the pieces.
        continue

    # Rare word without digits: replace it by its wordninja split
    kp.add_keyword(token, " ".join(wordninja.split(token)))

In [None]:
def _strip_unknown_words(obj):
    """Apply the replacements registered in kp to the text and collapse whitespace."""
    replaced_text = kp.replace_keywords(obj["text"])
    return {"label": obj["label"], "text": " ".join(replaced_text.split())}


# One RDD per slice, holding the cleaned text without unknown words
cleaned_no_unknown_rdds = [df.rdd.map(_strip_unknown_words) for df in split_rdds_cleaned]

In [None]:
# Save every cleaned slice (unknown words already replaced) under its own
# "cleaned<index>" path inside the output directory
for index, cleaned_rdd in enumerate(cleaned_no_unknown_rdds):
    target_path = f"{path_tweets_no_unknown_directory}/cleaned{index}"
    save_rdd_to_json_file(target_path, cleaned_rdd, ["label", "text"])

# Build the final dataset file from the files written above
merge_files(path_tweets_no_unknown_directory, path_tweets_no_unknown)