In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, udf
from pyspark.sql.types import StringType
import re

# Create the Spark session for this cleaning job, or reuse the one that is already running
spark = (
    SparkSession.builder
    .appName("DataCleaning")
    .getOrCreate()
)

In [0]:
# Load the two training shards; the schema of each is inferred from its JSON.
df1 = spark.read.json('dbfs:/FileStore/train_0.json')
df2 = spark.read.json('dbfs:/FileStore/train_1.json')

# Combine the DataFrames by column name. union() pairs columns purely by
# position, so any difference between the two inferred schemas would silently
# put values under the wrong header; unionByName() raises instead.
df = df1.unionByName(df2)

# Preview the first rows of the combined data.
df.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|            document|          ext_labels|                  id|           rg_labels|             summary|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|hi i im getting a...|                 [1]|TLDR_RS_2021-04-c...|               [1.0]|of my post is ask...|
|it 's possible th...|        [0, 1, 0, 0]|TLDR_RS_2021-02-c...|[0.16403576510432...|if you play heart...|
|i will be the fir...|     [0, 1, 0, 0, 0]|TLDR_RS_2021-04-c...|[0.18650505091030...|i 'm tired of my ...|
|hello , my friend...|[0, 0, 1, 0, 0, 0...|TLDR_RS_2021-03-c...|[0.13981801700034...|: my friend has s...|
|background : my (...|[0, 0, 0, 0, 1, 0...|TLDR_RS_2021-02-c...|[0.18652821177119...|my ex is very err...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [0]:
# Define UDF for removing special characters
def remove_special_characters(text):
    """Strip e-mail addresses, URLs, HTML tags, handles and parentheses from text.

    Args:
        text: Raw string to clean, or None (Spark passes null column values
            to Python UDFs as None).

    Returns:
        The cleaned string with every whitespace run collapsed to one space
        and both ends trimmed, or None when ``text`` is None.
    """
    if text is None:
        # Propagate nulls instead of letting re.sub raise TypeError.
        return None
    # E-mail addresses must go first: the '@\S+' alternative below would
    # otherwise remove only the '@domain' part and leave the local part behind.
    text = re.sub(r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+', '', text)
    # URLs, remaining '@handle' tokens and HTML tags. Case-insensitive because
    # this function is applied before the text is lowercased. This also covers
    # the bare '@username' case, so no separate '@\w+' pass is needed.
    text = re.sub(r'http\S+|www\S+|@\S+|<.*?>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\bu/\w+\b', '', text)                 # Remove Reddit usernames
    text = re.sub(r'\s*\(\s*', ' ', text)                 # Replace '(' and surrounding spaces with one space
    text = re.sub(r'\s*\)\s*', ' ', text)                 # Same for ')'
    text = re.sub(r'\s+', ' ', text).strip()              # Replace multiple spaces with one
    return text

# Wrap the cleaner as a Spark UDF that returns a string column
remove_special_characters_udf = udf(f=remove_special_characters, returnType=StringType())

In [0]:
# Run the special-character cleaner over both free-text columns, replacing each in place
for text_column in ("document", "summary"):
    df = df.withColumn(text_column, remove_special_characters_udf(col(text_column)))

In [0]:
# Lowercase both free-text columns in a single chained transformation
df = (
    df
    .withColumn("document", lower(col("document")))
    .withColumn("summary", lower(col("summary")))
)

In [0]:
# Comprehensive custom slang dictionary: lowercase slang token -> replacement text.
# NOTE(review): several replacements contain a capital "I" ("I am", "I don't know")
# although they are substituted after the text has been lowercased, so the output
# ends up mixed-case (the displayed sample shows "hi i I am getting").
# NOTE(review): some keys are also ordinary words ("extra", "bet", "lit", "shade",
# "flex") and are rewritten wherever they appear; the displayed sample shows
# "over the top ram", presumably from "extra ram". Confirm this is intended.
# NOTE(review): "cap/no cap" only matches that exact literal string, not "cap"
# or "no cap" on their own.
slang_dictionary = {
    "u": "you",
    "r": "are",
    "cuz": "because",
    "dont": "do not",
    "wont": "will not",
    "im": "I am",
    "yall": "you all",
    "gonna": "going to",
    "gotta": "got to",
    "hafta": "have to",
    "lemme": "let me",
    "kinda": "kind of",
    "sorta": "sort of",
    "lol": "laughing out loud",
    "lmao": "laughing my ass off",
    "btw": "by the way",
    "fyi": "for your information",
    "smh": "shaking my head",
    "idk": "I don't know",
    "ftw": "for the win",
    "brb": "be right back",
    "tbh": "to be honest",
    "wyd": "what you doing",
    "salty": "bitter or upset",
    "simp": "someone who shows excessive sympathy",
    "sus": "suspicious",
    "vibe check": "assessing someone's energy or mood",
    "lit": "exciting or excellent",
    "yeet": "to throw something with force",
    "ghosting": "sudden cut-off communication",
    "shook": "shocked or surprised",
    "extra": "over the top",
    "b4": "before",
    "gtg": "got to go",
    "omg": "oh my god",
    "imo": "in my opinion",
    "tldr": "too long; didn't read",
    "ikr": "I know right",
    "rofl": "rolling on the floor laughing",
    "yolo": "you only live once",
    "ama": "ask me anything",
    "asap": "as soon as possible",
    "nsfw": "not safe for work",
    "afaik": "as far as I know",
    "wtf": "what the f***",
    "irl": "in real life",
    "afk": "away from keyboard",
    "np": "no problem",
    "fr": "for real",
    "srsly": "seriously",
    "fam": "family",
    "flex": "show off",
    "shade": "disrespect",
    "clout": "influence or power",
    "cap/no cap": "lie/no lie",
    "stan": "an obsessive fan",
    "thirsty": "desperate for attention",
    "fomo": "fear of missing out",
    "bussin": "really good",
    "bet": "agreement or approval",
    "cheugy": "out of touch or trying too hard",
}

# Compile the slang alternation once so it is not rebuilt for every row.
# Longer keys are listed first so that a key which is a prefix of another key
# can never shadow the longer one. The result is still accepted anywhere a
# pattern string is (e.g. re.sub(slang_pattern, ...)).
slang_pattern = re.compile(
    r'\b('
    + '|'.join(re.escape(slang) for slang in sorted(slang_dictionary, key=len, reverse=True))
    + r')\b'
)

# Define UDF for replacing slangs
def replace_slangs(text):
    """Replace every whole-word slang token in ``text`` with its dictionary expansion.

    Matching is case-sensitive and the dictionary keys are lowercase, so the
    input is expected to be lowercased already.

    Args:
        text: String to process, or None (Spark passes null column values to
            Python UDFs as None).

    Returns:
        The text with each matched key replaced by ``slang_dictionary[key]``,
        or None when ``text`` is None.
    """
    if text is None:
        # Propagate nulls instead of letting the regex call raise TypeError.
        return None
    return slang_pattern.sub(lambda match: slang_dictionary[match.group(0)], text)

# Wrap the slang expander as a Spark UDF that returns a string column
replace_slangs_udf = udf(f=replace_slangs, returnType=StringType())

In [0]:
# Expand slang in both free-text columns
document_expanded = replace_slangs_udf(col("document"))
summary_expanded = replace_slangs_udf(col("summary"))
df = df.withColumn("document", document_expanded)
df = df.withColumn("summary", summary_expanded)

# Preview the first five processed rows
display(df.limit(5))

document,ext_labels,id,rg_labels,summary
hi i I am getting a new laptop i do nt know what the specs are but i know that they are possibly be low due to this I am planing to attempt to replace certain parts like increase ram or get a better cpu but I am stuck on the integrated graphics want to use for light game but I am not sure if the integrated graphics would be enough so I am wondering if i disable it and buy over the top ram or should i buy a more powerful cpu to compensate the,List(1),TLDR_RS_2021-04-cm-17760.json,List(1.0),"of my post is asking what can i do compensate for a weak graphics card so that my laptop could play video games like laughing out loud , cs or valerant any advice would be appreciated"
"it 's possible that many of you already do this , but in case there are some mac players out there like me , i just wanted to share . i decided to download bluestacks to play other android games on my mac , and decided to give hearthstone a go , not expecting any miracle . however , it runs much better than the mac version and the deck tracker arcane tracker is miles better than the hs tracker for mac . honestly , i do n't see myself playing on the mac client anytime soon .","List(0, 1, 0, 0)",TLDR_RS_2021-02-cm-3714.json,"List(0.16403576510432796, 0.359673708101243, 0.2387214876778162, 0.23756903911661298)","if you play hearthstone on mac and it runs poorly , download bluestacks and play there . :"
"i will be the first to admit , i 've got a weight problem , but , i do n't think that justifies him bringing it up to me almost on a daily basis . to his credit though , i suppose , he tries to be nice and , what he considers subtle , other times , if he 's tired or irritated about something else , he 'll snap at me and bring up my weight , for which he apologizes later and we try and work on the communication aspect . i just feel like we go in circles , he brings it up , we talk , we forget , he brings it up again . he does n't feel at all that it 's an off - limits topic as he feels that we should be able to discuss everything as a couple , even though i 've told him that it is kind of rude how he chooses to bring it up , sometimes joking about it . how do we get this on a better path ?","List(0, 1, 0, 0, 0)",TLDR_RS_2021-04-cm-1732.json,"List(0.18650505091030062, 0.38959293491070857, 0.2526842625236331, 0.1712177516553579, 0.0)",i 'm tired of my boyfriend bringing up my weight .
"hello , my friend 22m broke up a few months ago . he 's been doing really bad and i 19m was really trying to cheer him up , be there for him , listen to whatever he had to say . everything seemed normal , until he suddenly stopped talking to me . he would literally answer my texts 30 - 48 hours after i sent them , hang up whenever i called to check up on him . i faced him about it , he said that he does n't feel like talking and he needs some time alone . i was really understanding and gave him some space . he also insisted on the fact that i did n't do anything wrong and told me not to take this personally a few days later , he 's active again on social media , he 's going out quite a lot with other friends , but still , i get no texts or calls from him . this has been going on for 2 weeks now , and i really do n't know what to do or say . should i face him again ? should i remain silent and just move on ? i 'm so lost .","List(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0)",TLDR_RS_2021-03-cm-34926.json,"List(0.13981801700034813, 0.05483748749730812, 0.42323291632537813, 0.06046184518933972, 0.06373005303741214, 0.04716023924768498, 0.034174086411365934, 0.12746010607482428, 0.0, 0.04912524921633852, 0.0)",: my friend has suddenly stopped talking to me and not really sure what the reason is
"background : my 18f ex boyfriend 18m really wanted me to abort our son 3 months when i was pregnant and resented me for not doing so and fell out of love with me over it . throughout my pregnancy he threatened to kill himself multiple times and frequently self harmed and said its because id made him very depressed and anxious and pushed him to his limits by keeping the baby - i had told him he was free to walk away from me and the baby and i wouldnt chase him up over it or money but he chose to stay and said he wants to be in his sons life . over the past year he has started antidepressants and changes dosage / pill type very regularly obviously prescribed but has been known to skip pills or take several in a day to double up etc. . exbf cut ties with me two days before my due date for another woman he wanted to give things a go with but still talks about ending his life and mentions his super low mood to me when he sees the baby and is frequently pretty nasty to me . he sees our son 3 months typically once a week , sometimes twice up to him , i do nt limit it in my family home and we do nt leave him with the baby alone because he isnt too capable . for a long time he struggled with the practicals eg feeding and changing but hes never been around many babies and is slowly improving - so this isnt much about that . he spends most of the time he has the baby holding him but watching tv or on his phone , he pays little attention to him really . when i pop to the corner shop or chemist etc or go shower i ask my sister or mum to sit with them . our son is still young but some family members have been suggesting soon he will be old enough to spend time with my ex alone eg him take him for a few hours in the day or just even alone in the house but I am not sure i can fully trust my ex . 
i do nt think he would actively seek to hurt the baby / want to but I am not sure if the background above justifys my choice or if I am overreacting obviously a very brief summary of my reasoning . can anyone shed some light on this ?","List(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0)",TLDR_RS_2021-02-cm-18586.json,"List(0.18652821177119006, 0.1492886907256267, 0.03802305855335797, 0.08853130051229618, 0.13875081015962207, 0.08238329353227562, 0.0, 0.09644873389144462, 0.11219868023011693, 0.1078472206240699, 0.0)","my ex is very erratic and depressed and never wanted my son , should i leave them alone together ?"


In [0]:
from pyspark.sql.functions import col, concat_ws

# Join each label array into a single comma-separated string column
ext_labels_joined = concat_ws(",", col("ext_labels"))
rg_labels_joined = concat_ws(",", col("rg_labels"))
df = df.withColumn("ext_labels", ext_labels_joined)
df = df.withColumn("rg_labels", rg_labels_joined)

In [0]:
# Output path for the CSV data (Spark writes a directory of part files at this location)
output_path = "dbfs:/FileStore/cleaned_data.csv"

# Write the DataFrame as CSV with a header row.
# mode("overwrite") keeps the notebook re-runnable: the default save mode
# raises an error as soon as the path exists from a previous run.
df.write.mode("overwrite").option("header", "true").csv(output_path)