# Data cleaner for twitter dataset
Disclaimer: To run this notebook, launch pyspark (command "pyspark --master local[*number of cores*]") from the folder containing the notebook.

In [1]:
from pyspark.sql import SparkSession
from utility_functions import *
import os

In [2]:
# Project root = parent of the current working directory, normalised to
# forward slashes so the same string concatenation works on Windows too.
directory = os.path.split(os.getcwd())[0].replace("\\", "/")

# Input files (raw negative / positive tweets) and the output dataset file
path_neg_tweets = f"{directory}/data/twitter-data/train_neg_full.txt"
path_pos_tweets = f"{directory}/data/twitter-data/train_pos_full.txt"
path_tweets = f"{directory}/data/datasets/twitter-dataset.json"

In [3]:
# Build (or reuse) the Spark session used by the rest of the notebook.
# NOTE(review): when the notebook is started through the `pyspark` launcher a
# session may already exist and getOrCreate() will return it; JVM-level
# settings such as spark.driver.memory then presumably need to be passed on
# the launcher command line (e.g. --driver-memory) to take effect -- confirm.
spark = (
    SparkSession.builder
    .config("spark.executor.memory", "32g")
    .config("spark.driver.memory", "32g")
    .config("spark.network.timeout", "1200s")
    .config("spark.executor.memoryOverhead", "12g")
    # The heartbeat interval has to stay well below spark.network.timeout,
    # otherwise executors cannot report in before being declared lost
    # (it was previously equal to the timeout).
    .config("spark.executor.heartbeatInterval", "120s")
    # Max heap is controlled by spark.executor.memory; passing -Xmx through
    # extraJavaOptions is not allowed by Spark, so only the initial heap
    # size is set here.
    .config("spark.executor.extraJavaOptions", "-Xms12g")
    .getOrCreate()
)

In [4]:
# Read datasets
def _load_labelled(path, label):
    """Read one tweet file, drop duplicate rows and tag every row with *label*."""
    unique_rows = spark.read.text(path).rdd.distinct()
    return unique_rows.map(lambda row: {"label": label, "text": row.value})


dataset_neg = _load_labelled(path_neg_tweets, 0)
dataset_pos = _load_labelled(path_pos_tweets, 1)

# Merge negative and positive records into a single RDD
dataset = dataset_neg.union(dataset_pos)

# Split the merged RDD into ten RDDs using ten equal weights.
# NOTE(review): no seed is passed to randomSplit, so which record lands in
# which split is not reproducible between runs.
split_rdds = dataset.randomSplit([0.1] * 10)

In [5]:
# Process dataset
def _clean_record(record):
    """Return a new record with the same label and the cleaned tweet text."""
    return {"label": record["label"], "text": cleaning_function_twitter_dataset(record["text"])}


# One cleaned RDD per split, in the same order as split_rdds
rdds = [chunk.map(_clean_record) for chunk in split_rdds]

In [6]:
import json

# Write every cleaned record as one JSON object per line.
# json.dumps handles quoting and escaping; the previous approach (swapping the
# quote characters of the dict repr) altered any text containing ' or " and
# produced lines that are not valid JSON.
# NOTE(review): the file is opened in append mode, so re-running this cell
# adds the records again after the existing content -- confirm this is wanted.
with open(path_tweets, "a", encoding="utf-8") as f:
    for rdd in rdds:
        # collect() is called once per split, so only one split's records are
        # pulled to the driver per iteration.
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in rdd.collect()
        )