# Data Exploration of the Bronze-Layer Imbalanced Tweets Data

## Environment Setup

In [0]:
from pyspark.sql.functions import col, when, lit
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import CountVectorizer
nltk.download('stopwords')

In [0]:
# Shared NLP resources used by the text-cleaning step: an English Snowball
# stemmer and the NLTK English stop-word list held as a set.
stemmer = SnowballStemmer(language="english")
stopword = set(stopwords.words("english"))

In [0]:
# Load the imbalanced tweets table from the bronze layer and preview it.
df_tweets_imbalanced = spark.read.table(
    "nlp_analysis.bronze_layer.tweets_imbalanced_data"
)
display(df_tweets_imbalanced)

In [0]:
# Load the raw tweets table from the bronze layer and preview it.
df_tweets_raw = spark.read.table(
    "nlp_analysis.bronze_layer.tweets_raw_data"
)
display(df_tweets_raw)

## Data Transformation and Aggregation

In [0]:
# Discard rows in which every column is null; rows with at least one
# non-null value are kept.
df_tweets_imbalanced = df_tweets_imbalanced.na.drop(how="all")
df_tweets_raw = df_tweets_raw.na.drop(how="all")

In [0]:
# Drop the columns that are not carried forward from the imbalanced table.
df_tweets_imbalanced_transformed = df_tweets_imbalanced.drop(
    "id",
    "_rescued_data",
)

In [0]:
# Keep only the raw rows whose `class` value is "1" or "2"; every other
# class (and null) is filtered out.
df_tweets_raw_transformed = df_tweets_raw.where(
    (col("class") == "1") | (col("class") == "2")
)

In [0]:
# Overwrite `class` with the constant string "1" for every remaining row,
# then list the distinct values as a sanity check.
df_tweets_raw_transformed = df_tweets_raw_transformed.withColumn("class", lit("1"))
df_tweets_raw_transformed.select("class").distinct().display()

In [0]:
# Drop the raw-table columns that are not carried into the combined dataset.
df_tweets_raw_transformed = df_tweets_raw_transformed.drop(
    "count",
    "hate_speech",
    "offensive_language",
    "neither",
    "_rescued_data",
)

In [0]:
# Rename `class` to `label` ahead of the union with the imbalanced frame.
df_tweets_raw_transformed = df_tweets_raw_transformed.withColumnRenamed(
    "class",
    "label",
)

In [0]:
# Stack the two prepared frames into one dataset.
# NOTE(review): DataFrame.union pairs columns by position, not by name —
# this assumes both frames have the same column order; confirm against the
# table schemas.
df_tweets_balanced = (
    df_tweets_imbalanced_transformed
    .union(df_tweets_raw_transformed)
)

In [0]:
# List the distinct label values present after the union.
df_tweets_balanced.select("label").distinct().display()

In [0]:
# Render summary statistics for the combined DataFrame using the Databricks
# data utility.
dbutils.data.summarize(df_tweets_balanced)

## NLP Preprocessing

In [0]:
def data_cleaning(text):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    The text is lower-cased and then stripped, in order, of: square-bracketed
    spans, URLs, angle-bracketed tags, punctuation (replaced by a space),
    newlines (replaced by a space), any word containing a digit, and any
    remaining character that is not ``a-z`` or whitespace. Surviving tokens
    are filtered against the module-level ``stopword`` collection and stemmed
    with the module-level ``stemmer``.

    Args:
        text: The raw tweet text; may be ``None`` or empty.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when *text* is
        falsy.
    """
    if not text:
        return ""

    # Order matters: bracketed spans, URLs and tags are removed before
    # punctuation is blanked out, because their patterns rely on the
    # delimiters ("[", "://", "<") still being present.
    substitutions = (
        (r'\[.*?\]', ''),
        (r'https?://\S+|www\.\S+', ''),
        (r'<.*?>+', ''),
        (r'[%s]' % re.escape(string.punctuation), ' '),
        (r'\n', ' '),
        (r'\w*\d\w*', ''),
        (r'[^a-z\s]', ''),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Drop stop words first, then stem whatever is left.
    kept = (token for token in cleaned.split() if token not in stopword)
    return " ".join(stemmer.stem(token) for token in kept)

# Expose the cleaner as a Spark UDF that returns a string column. The
# function is passed directly rather than through a forwarding lambda: this
# removes one Python call per row and lets the UDF take the function's own
# name instead of "<lambda>".
data_cleaning_udf = udf(data_cleaning, StringType())

In [0]:
# Replace the `tweet` column with its cleaned form and preview the result.
df_tweets_balanced_cleaned = df_tweets_balanced.withColumn(
    "tweet",
    data_cleaning_udf(col("tweet")),
)
df_tweets_balanced_cleaned.display()

## Save Table in Silver Layer

In [0]:
# Persist the cleaned dataset as a silver-layer table, replacing any
# existing contents of that table.
(
    df_tweets_balanced_cleaned.write
    .mode("overwrite")
    .saveAsTable("nlp_analysis.silver_layer.labeled_tweets")
)