# Data cleaner for twitter training dataset
Disclaimer: To run this notebook, launch pyspark (command "pyspark --master local[*number of cores*]") from the folder containing the notebook.

In [None]:
from pyspark.sql import SparkSession
from utility_functions import *
import os

In [None]:
# Parent of the current working directory, with Windows separators
# normalised to forward slashes
directory = os.path.dirname(os.getcwd()).replace("\\", "/")

# Input files read by the first cleaning pass
path_neg_tweets = "{}/data/twitter-data/train_neg_full.txt".format(directory)
path_pos_tweets = "{}/data/twitter-data/train_pos_full.txt".format(directory)

# Intermediate file holding the cleaned tweets
path_tweets = "{}/data/datasets/twitter-cleaned.json".format(directory)

# Vocabulary with the number of occurrences of each word
path_all_occurrences = "{}/data/sentiment-knowledge/twitter-all-words-with-occurrences.csv".format(directory)

# Final dataset: directory of part files and the single merged file
path_tweets_no_unknown_directory = "{}/data/datasets/twitter".format(directory)
path_tweets_no_unknown = "{}/data/datasets/twitter.json".format(directory)

In [None]:
# Build the Spark session used by every cell below (or reuse the one that
# already exists in this process).
spark = (
    SparkSession.builder
    .config("spark.executor.memory", "32g")
    .config("spark.driver.memory", "32g")
    .config("spark.network.timeout", "1200s")
    .config("spark.executor.memoryOverhead", "12g")
    # The heartbeat interval has to be significantly smaller than
    # spark.network.timeout; with both at 1200s a single heartbeat only
    # arrives when the timeout is already expiring.
    .config("spark.executor.heartbeatInterval", "120s")
    # The maximum heap size (-Xmx) must not be passed through
    # extraJavaOptions; it is controlled by spark.executor.memory above.
    .config("spark.executor.extraJavaOptions", "-Xms12g")
    .getOrCreate()
)

### Clean tweets

In [None]:
# Load each raw file, drop duplicated lines within that file and attach
# the label (0 for the negative file, 1 for the positive file)
dataset_neg = (
    spark.read.text(path_neg_tweets)
    .rdd.distinct()
    .map(lambda row: {"label": 0, "text": row.value})
)
dataset_pos = (
    spark.read.text(path_pos_tweets)
    .rdd.distinct()
    .map(lambda row: {"label": 1, "text": row.value})
)

# Concatenate both labelled datasets
dataset = dataset_neg.union(dataset_pos)

# Randomly split the merged rdd into ten equally weighted rdds
split_rdds = dataset.randomSplit([0.1] * 10)

In [None]:
# Apply the tweet cleaning function to the text of every record,
# one split at a time; labels are carried over unchanged
rdds = [
    chunk.map(
        lambda tweet: {
            "label": tweet["label"],
            "text": new_cleaning_function_twitter_dataset(tweet["text"]),
        }
    )
    for chunk in split_rdds
]

In [None]:
# Write every cleaned tweet as one JSON object per line.
# json.dumps is used instead of swapping quote characters in str(dict):
# the latter yields invalid JSON as soon as the text itself contains a
# quote, an apostrophe or a backslash.
import json

# NOTE(review): the file is opened in append mode, so re-running this cell
# adds the whole dataset to the file a second time.
# The encoding is explicit so the output does not depend on the platform
# default when it is read back as JSON.
with open(path_tweets, "a", encoding="utf-8") as f:
    # Collect one split at a time to bound what is held on the driver
    for rdd in rdds:
        f.writelines(json.dumps(item) + "\n" for item in rdd.collect())

### Split words with at most 15 occurrences

In [None]:
# Read the cleaned tweets back from the JSON-lines file
dataset_cleaned = spark.read.json(path_tweets)

# Randomly split the dataset into ten equally weighted parts
split_rdds_cleaned = dataset_cleaned.randomSplit([0.1] * 10)

In [None]:
# For every split, tokenize each text into (word, 1) pairs and sum the
# pairs per word to get that split's occurrence counts
words_occurrences_rdds = [
    part.rdd
    .flatMap(
        lambda row: [
            (token, 1)
            for token in tokenize_with_sequences(
                remove_symbols_before_tokenization(row["text"], True), True
            )
        ]
    )
    .reduceByKey(lambda left, right: left + right)
    for part in split_rdds_cleaned
]

In [None]:
# Merge the per-split rdds, then reduce by key to obtain the final
# vocabulary with the number of occurrences of each word.
# Start from the first rdd and fold in the remaining ones, so the merge
# also works when there is only a single partial rdd.
merged_rdd_occurrences = words_occurrences_rdds[0]

for partial_occurrences in words_occurrences_rdds[1:]:
    merged_rdd_occurrences = merged_rdd_occurrences.union(partial_occurrences)

# Sum the partial counts of each word and bring the result to the driver
words_with_occurrences = merged_rdd_occurrences.reduceByKey(lambda x, y: x + y).collect()

In [None]:
# Split rare words into their component words
import wordninja

# Keyword processor mapping each rare word to its split form
kp = KeywordProcessor()

# Visit the vocabulary in ascending order of occurrences
for token, count in sorted(words_with_occurrences, key=lambda pair: pair[1]):
    # Skip frequent words and words containing numbers
    if count > 15 or contains_numbers(token):
        continue
    kp.add_keyword(token, " ".join(wordninja.split(token)))

In [None]:
# Keyword processor that rewrites the <user> and <url> placeholders
kp_special_tokens = KeywordProcessor()
for placeholder, replacement in (("<user>", "@USER"), ("<url>", "HTTPURL")):
    kp_special_tokens.add_keyword(placeholder, replacement)

In [None]:
# Replace rare words by their split form, collapse runs of whitespace,
# then rewrite the <user>/<url> placeholders; labels are kept as they are
cleaned_rdd_with_split_text = dataset_cleaned.rdd.map(
    lambda row: {
        "label": row["label"],
        "text": kp_special_tokens.replace_keywords(
            " ".join(kp.replace_keywords(row["text"]).split())
        ),
    }
)

### Remove ids, i.e. words with both numbers and characters, and save dataset

In [None]:
# Count the occurrences of every word and keep those with at most 10
words_to_analyze = (
    cleaned_rdd_with_split_text
    .flatMap(lambda row: [(token, 1) for token in tokenize_with_sequences(row["text"], True)])
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda pair: pair[1] <= 10)
    .collect()
)

In [None]:
# Keyword processor that blanks out words with at most 10 occurrences which
# contain numbers, unless contains_numbers_and_x accepts them
kp_words_to_remove = KeywordProcessor()

for token, count in words_to_analyze:
    is_rare = count <= 10
    if is_rare and contains_numbers(token) and not contains_numbers_and_x(token):
        kp_words_to_remove.add_keyword(token, " ")

In [None]:
# Drop the selected words from every text and collapse the whitespace
# left behind; labels are kept as they are
cleaned_rdd_final = cleaned_rdd_with_split_text.map(
    lambda row: {
        "label": row["label"],
        "text": " ".join(kp_words_to_remove.replace_keywords(row["text"]).split()),
    }
)

In [None]:
# Write the final rdd to the output directory with the "label" and
# "text" fields
save_rdd_to_json_file(
    path_tweets_no_unknown_directory,
    cleaned_rdd_final,
    ["label", "text"],
)

# Merge the files of that directory into the single dataset file
merge_files(
    path_tweets_no_unknown_directory,
    path_tweets_no_unknown,
)

### Save occurrences of words 

In [None]:
# Count the occurrences of every word of the final dataset
words = (
    cleaned_rdd_final
    .flatMap(lambda row: [(token, 1) for token in tokenize_with_sequences(row["text"], True)])
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

In [None]:
# Save the vocabulary sorted by ascending number of occurrences.
# The destination is the path_all_occurrences constant defined with the
# other paths, instead of a second copy of the same literal.
save_list_to_csv(sorted(words, key=lambda x: x[1]), path_all_occurrences, ["word", "occurrences"])