# Data cleaner for twitter test dataset
Disclaimer: To run this notebook, launch pyspark (command "pyspark --master local[*number of cores*]") from the folder containing the notebook.

In [None]:
from pyspark.sql import SparkSession
from utility_functions import *
import os

In [None]:
# Parent folder of the current working directory, normalised to forward slashes
directory = os.path.dirname(os.getcwd()).replace("\\", "/")

# Input file (raw test tweets) and output file (cleaned dataset)
path_test_tweets = f"{directory}/data/twitter-data/test_data.txt"
path_tweets = f"{directory}/data/datasets/twitter-test.json"

In [None]:
# Create (or reuse) the Spark session used by every cell below.
# NOTE(review): if a session already exists (e.g. the notebook was started from
# a running pyspark shell), getOrCreate() returns that session -- confirm which
# of these settings actually take effect in that setup.
spark = (
    SparkSession.builder
    .config("spark.executor.memory", "32g")
    .config("spark.driver.memory", "32g")
    .config("spark.network.timeout", "1200s")
    .config("spark.executor.memoryOverhead", "12g")
    # The heartbeat interval must stay well below spark.network.timeout;
    # setting both to the same value makes Spark reject the configuration.
    .config("spark.executor.heartbeatInterval", "120s")
    # The maximum heap size (-Xmx) may not be passed through extraJavaOptions;
    # it is already defined by spark.executor.memory above.
    .config("spark.executor.extraJavaOptions", "-Xms12g")
    .getOrCreate()
)

### Clean

In [None]:
def remove_index_and_comma(string):
    """Drop the leading index field from a raw line.

    Everything up to and including the first comma is removed; any later
    commas belong to the text and are kept.

    Args:
        string: A raw line, expected to look like ``"<index>,<text>"``.

    Returns:
        The part of the line after the first comma. A line that contains no
        comma at all (for example an empty line) is returned unchanged
        instead of raising ``ValueError``, so one malformed line cannot
        abort the whole job.
    """
    _, separator, remainder = string.partition(',')
    # An empty separator means no comma was found
    return remainder if separator else string

# Load the raw test file line by line and strip the leading "<index>," prefix
dataset_test = (
    spark.read.text(path_test_tweets)
    .rdd
    .map(lambda row: {"text": remove_index_and_comma(row.value)})
)

# Run the Twitter cleaning function over every tweet
dataset_test_cleaned = dataset_test.map(
    lambda record: {"text": new_cleaning_function_twitter_dataset(record["text"])}
)

### Split words with at most 15 occurrences

In [None]:
# Count how often each token appears in the cleaned tweets and collect the
# resulting (token, count) pairs
words_with_occurrences = (
    dataset_test_cleaned
    .flatMap(
        lambda record: [
            (token, 1)
            for token in tokenize_with_sequences(
                remove_symbols_before_tokenization(record["text"], True), True
            )
        ]
    )
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

In [None]:
# Split rare words into their component words
import wordninja

# Keyword processor mapping each rare word to its space-separated split form
kp = KeywordProcessor()

# Walk through the (word, count) pairs in ascending order of count
for token, count in sorted(words_with_occurrences, key=lambda pair: pair[1]):
    # Skip words seen more than 15 times and words for which
    # contains_numbers() is true
    if count > 15 or contains_numbers(token):
        continue
    kp.add_keyword(token, " ".join(wordninja.split(token)))

In [None]:
# Keyword processor that rewrites the <user> and <url> placeholders
kp_special_tokens = KeywordProcessor()
for special_token, replacement in (("<user>", "@USER"), ("<url>", "HTTPURL")):
    kp_special_tokens.add_keyword(special_token, replacement)

In [None]:
# Apply the rare-word splits (collapsing runs of whitespace afterwards), then
# rewrite the special placeholder tokens
dataset_test_cleaned_with_split_text = (
    dataset_test_cleaned
    .map(lambda record: {"text": " ".join(kp.replace_keywords(record["text"]).split())})
    .map(lambda record: {"text": kp_special_tokens.replace_keywords(record["text"])})
)

### Remove ids, i.e. words with both numbers and characters, and save dataset

In [None]:
# Count the tokens again on the split text and keep only the (token, count)
# pairs whose count is at most 10
words_to_analyze = (
    dataset_test_cleaned_with_split_text
    .flatMap(
        lambda record: [
            (token, 1) for token in tokenize_with_sequences(record["text"], True)
        ]
    )
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda pair: pair[1] <= 10)
    .collect()
)

In [None]:
# Keyword processor that replaces selected rare tokens with a single space
kp_words_to_remove = KeywordProcessor()

for token, count in words_to_analyze:
    # Selected tokens: at most 10 occurrences, contains_numbers() is true and
    # contains_numbers_and_x() is false
    if count <= 10 and contains_numbers(token) and not contains_numbers_and_x(token):
        kp_words_to_remove.add_keyword(token, " ")

In [None]:
# Final dataset: drop the selected tokens and collapse the leftover whitespace
dataset_test_cleaned_final = dataset_test_cleaned_with_split_text.map(
    lambda record: {
        "text": " ".join(kp_words_to_remove.replace_keywords(record["text"]).split())
    }
)

In [None]:
# Write dataset for test to file, one JSON object per line
import json

# NOTE: the file is opened in append mode, so re-running this cell adds the
# same records to the file again.
with open(path_tweets, "a") as f:
    for item in dataset_test_cleaned_final.collect():
        # json.dumps escapes quotes and backslashes inside the text; swapping
        # ' for " on the dict's string form produced invalid JSON (and altered
        # the tweet) whenever the text contained an apostrophe or double quote.
        f.write(json.dumps(item) + "\n")