## 1. Install Libraries

In [None]:
"""
STILL NEED TO ADD VERSIONS (AFTER FILE IS CONFIRMED)
"""
# Notebook magics: install the third-party packages this notebook relies on.
# spacy, textblob and textstat are imported in the "Import Libraries" cell.
# NOTE(review): pyarrow is not imported directly anywhere in this notebook --
# presumably required for Spark/pandas interop; confirm it is still needed.
%pip install spacy
%pip install pyarrow
%pip install textblob
%pip install textstat
# Download the small English spaCy pipeline; "en_core_web_sm" is the default
# model name used by POSFeatures.
!python -m spacy download en_core_web_sm

## 2. Import Libraries

In [None]:
# OS environment
import os

# Import SparkConf class into program
from pyspark import SparkConf

# Import SparkContext and SparkSession classes
from pyspark import SparkContext # Spark
from pyspark.sql import SparkSession # Spark SQL

# PySpark Data Operations
from pyspark.sql.functions import col, size, split, udf
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# Regex
import re

# Numeric operations
import numpy as np

# Define custom schema (data types) for PySpark Dataframes
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

# spaCy model for natural language processing
import spacy

# Pandas
import pandas as pd

# Psycholinguistics
from textblob import TextBlob

# Readability features
import textstat



## 3. Functions and Classes

### 3.1. clean_text function

In [None]:
# Define text cleaning function
def clean_text(text):
    """
    Clean the input text string by removing unwanted elements while keeping useful punctuation.

    Steps performed, in order:
    - Replace curly quotes/apostrophes with their ASCII equivalents
    - Remove URLs (e.g. http://..., www...)
    - Remove numeric HTML entities (e.g. &#39;, &#x27;)
    - Remove Twitter-style mentions (@username) and hashtags (#hashtag)
    - Remove named HTML entities (e.g. &nbsp;)
    - Remove emojis and any other non-ASCII characters
    - Normalize whitespace (convert multiple spaces/tabs/newlines into a single space)
    - Trim leading and trailing spaces

    Args:
        text (str or None): The input text to clean.

    Returns:
        str: A cleaned version of the input text. If input is None, returns an empty string.
    """
    if text is None:
        return ""

    # Map curly quotes/apostrophes to ASCII in one pass; this must happen
    # before the non-ASCII strip below, otherwise the quotes would be deleted.
    curly_to_ascii = str.maketrans({
        "\u201c": '"', "\u201d": '"',
        "\u2018": "'", "\u2019": "'",
    })
    text = text.translate(curly_to_ascii)

    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove numeric HTML entities BEFORE hashtags: the hashtag pattern would
    # otherwise eat the "#39" out of "&#39;" and leave a stray "&;" behind.
    text = re.sub(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+);', '', text)

    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove named HTML entities
    text = re.sub(r'&\w+;', '', text)

    # Remove emojis and other non-ASCII symbols
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

### 3.2. FeaturesSpark Class

In [None]:
class FeaturesSpark:
    """
    Quantity features computed with native PySpark column expressions (no Python UDFs).

    ``transform`` appends these columns, in this order: num_characters,
    num_capital_letters, num_words, num_sentences, words_per_sentence,
    num_short_sentences, num_long_sentences, num_special_characters.
    """
    VOWELS = "aeiouyAEIOUY"  # not referenced by transform(); kept so existing references still work

    def __init__(self):
        pass

    def transform(self, df, text_col):
        """
        Append the quantity feature columns derived from ``text_col``.

        Args:
            df: Spark DataFrame containing ``text_col``.
            text_col (str): Name of the string column to analyse. Null values
                are treated as empty text, so every feature is a number.

        Returns:
            The input DataFrame with the feature columns added.
        """
        txt = F.coalesce(F.col(text_col), F.lit(""))
        # Text without leading/trailing whitespace, so edge whitespace does not
        # produce an empty leading/trailing "word" when splitting.
        stripped = F.regexp_replace(txt, r"^\s+|\s+$", "")

        # Spark's split() keeps empty pieces, so text ending in "." would yield
        # a phantom empty sentence. Drop blank fragments, and use the same
        # null-safe text as the other features.
        sentences_sql = (
            f"filter(split(coalesce({text_col}, ''), '[.!?]+'), "
            "s -> length(trim(s)) > 0)"
        )

        def sentences_matching(word_count_condition):
            # Count sentences whose word count satisfies the SQL condition.
            # Fragments are trimmed first: every sentence after the first
            # starts with a space, which would otherwise add an empty word.
            return F.size(F.expr(
                f"filter({sentences_sql}, "
                f"x -> size(split(trim(x), ' +')) {word_count_condition})"
            ))

        # Character count
        df = df.withColumn("num_characters", F.length(txt))

        # Capital letters: delete everything that is not A-Z and measure what is left
        df = df.withColumn("num_capital_letters", F.length(F.regexp_replace(txt, r"[^A-Z]", "")))

        # Word count (splitting an empty string yields one empty element, so guard it)
        df = df.withColumn(
            "num_words",
            F.when(F.length(stripped) == 0, F.lit(0))
             .otherwise(F.size(F.split(stripped, r"\s+")))
        )

        # Sentence count
        df = df.withColumn("num_sentences", F.size(F.expr(sentences_sql)))

        # Words per sentence (0 when the text has no sentences)
        df = df.withColumn(
            "words_per_sentence",
            F.when(F.col("num_sentences") > 0, F.col("num_words") / F.col("num_sentences"))
             .otherwise(F.lit(0.0))
        )

        # Short sentences (<10 words)
        df = df.withColumn("num_short_sentences", sentences_matching("< 10"))

        # Long sentences (>=20 words)
        df = df.withColumn("num_long_sentences", sentences_matching(">= 20"))

        # Special characters: everything that is not a letter, digit or whitespace
        df = df.withColumn("num_special_characters", F.length(F.regexp_replace(txt, r"[a-zA-Z0-9\s]", "")))

        return df

### 3.3. POSFeatures Class

In [None]:
class POSFeatures:
    """
    Part-of-speech count features computed with a spaCy pipeline inside PySpark UDFs.

    ``transform`` appends one integer column per entry of ``_POS_COLUMNS``.
    """

    # Output column name -> spaCy coarse-grained POS tag (compared against token.pos_)
    _POS_COLUMNS = {
        "num_nouns": "NOUN",
        "num_verbs": "VERB",
        "num_adjectives": "ADJ",
        "num_adverbs": "ADV",
        "num_determiners": "DET",
    }

    def __init__(self, model="en_core_web_sm"):
        """
        Args:
            model (str): Name of the spaCy pipeline to load lazily on first use.
        """
        self.model = model

    def __getstate__(self):
        # The UDF closures capture ``self``. Never ship an already-loaded spaCy
        # pipeline along with them; each deserialized copy reloads it lazily.
        state = self.__dict__.copy()
        state.pop("_nlp", None)
        return state

    def _count_pos(self, text, pos_tag):
        """Return how many tokens of ``text`` carry ``pos_tag``; 0 for null/empty text."""
        # spaCy rejects None, and empty text has no tokens: skip the model entirely
        if not text:
            return 0
        # Load the model lazily and keep it on this instance for later calls
        if not hasattr(self, "_nlp"):
            self._nlp = spacy.load(self.model, disable=["ner", "parser"])
        doc = self._nlp(text)
        return sum(1 for token in doc if token.pos_ == pos_tag)

    def _make_udf(self, pos_tag):
        """Build an integer UDF counting tokens tagged ``pos_tag``."""
        return udf(lambda text: self._count_pos(text, pos_tag), IntegerType())

    def register_udfs(self, spark):
        """
        Build the POS-count UDFs.

        Args:
            spark: Accepted for backward compatibility; not used.

        Returns:
            dict: Output column name -> UDF, in output-column order.
        """
        return {
            col_name: self._make_udf(pos_tag)
            for col_name, pos_tag in self._POS_COLUMNS.items()
        }

    def transform(self, df, text_col):
        """
        Append the POS count columns computed from ``text_col``.

        Args:
            df: Spark DataFrame containing ``text_col``.
            text_col (str): Name of the text column to tag.

        Returns:
            The input DataFrame with one added column per POS feature.
        """
        # Prefer DataFrame.sparkSession; fall back to the deprecated sql_ctx
        # accessor on PySpark versions that do not provide it.
        spark = getattr(df, "sparkSession", None)
        if spark is None:
            spark = df.sql_ctx.sparkSession
        udfs = self.register_udfs(spark)
        for col_name, func in udfs.items():
            df = df.withColumn(col_name, func(text_col))
        return df


### 3.4. ReadabilityIndices Class

In [None]:
class ReadabilityIndices:
    """Readability features computed per row with textstat through PySpark UDFs."""

    @staticmethod
    def extract_features(df, text_col):
        """
        Append the gunning_fog, smog, ari and num_syllables float columns.

        Each value is the matching textstat metric of ``text_col`` cast to
        float; null or empty text produces null.
        """
        # (output column, textstat function name) in output-column order
        metrics = (
            ("gunning_fog", "gunning_fog"),
            ("smog", "smog_index"),
            ("ari", "automated_readability_index"),
            ("num_syllables", "syllable_count"),
        )

        def build_udf(metric_name):
            # Resolve the textstat function at call time, as a plain
            # attribute access on the module would.
            def score(t):
                if not t:
                    return None
                return float(getattr(textstat, metric_name)(t))
            return F.udf(score, FloatType())

        source = F.col(text_col)
        result = df
        for column_name, metric_name in metrics:
            result = result.withColumn(column_name, build_udf(metric_name)(source))
        return result


### 3.5. Psycholinguistics Class

In [None]:
class Psycholinguistics:
    """Sentiment and title/text overlap features computed through PySpark UDFs."""

    @staticmethod
    def _polarity(text):
        """Return the TextBlob sentiment polarity of ``text``, or None for null/empty text."""
        if not text:
            return None
        return float(TextBlob(text).sentiment.polarity)

    @staticmethod
    def _subjectivity(text):
        """Return the TextBlob sentiment subjectivity of ``text``, or None for null/empty text."""
        if not text:
            return None
        return float(TextBlob(text).sentiment.subjectivity)

    @staticmethod
    def _title_similarity(text, title):
        """
        Return the Jaccard similarity between the lower-cased word sets of
        ``text`` and ``title``.

        Returns None when either input is null/empty, and 0.0 when either
        input contains no words (whitespace only).
        """
        if not text or not title:
            return None
        text_words = set(text.lower().split())
        title_words = set(title.lower().split())
        if not text_words or not title_words:
            # Must be a float: a FloatType UDF that returns a Python int
            # produces null in Spark instead of 0.
            return 0.0
        return float(len(text_words & title_words) / len(text_words | title_words))

    @staticmethod
    def extract_features(df, text_col, title_col=None):
        """
        Append polarity, subjectivity and title_similarity float columns.

        Args:
            df: Spark DataFrame containing ``text_col`` (and ``title_col`` if given).
            text_col (str): Name of the article text column.
            title_col (str or None): Name of the title column. When omitted,
                ``title_similarity`` is added as an all-null float column.

        Returns:
            The input DataFrame with the three feature columns added.
        """
        df = df.withColumn(
            "polarity", F.udf(Psycholinguistics._polarity, FloatType())(F.col(text_col))
        )
        df = df.withColumn(
            "subjectivity", F.udf(Psycholinguistics._subjectivity, FloatType())(F.col(text_col))
        )
        if title_col:
            df = df.withColumn(
                "title_similarity",
                F.udf(Psycholinguistics._title_similarity, FloatType())(
                    F.col(text_col), F.col(title_col)
                ),
            )
        else:
            df = df.withColumn("title_similarity", F.lit(None).cast(FloatType()))
        return df

## 4. Configure Spark Environment
Using the code snippets from tutorial 1 and 2, set up the Spark environment and configure the Spark Application using SparkConf.

In [None]:
import importlib.util

# Report whatever SPARK_HOME the environment provided before it is overridden
spark_home = os.environ.get("SPARK_HOME")

if spark_home:
    print(f"SPARK_HOME: {spark_home}")
else:
    print("SPARK_HOME environment variable is not set.")

# Point SPARK_HOME at the installed PySpark package. The location is looked up
# rather than hard-coded so it also works outside a Python 3.10
# /usr/local install; the previous fixed path is kept as a fallback.
_pyspark_spec = importlib.util.find_spec("pyspark")
if _pyspark_spec is not None and _pyspark_spec.origin:
    os.environ["SPARK_HOME"] = os.path.dirname(_pyspark_spec.origin)
else:
    os.environ["SPARK_HOME"] = "/usr/local/lib/python3.10/dist-packages/pyspark"

print (f"SPARK_HOME is now set to: {os.environ.get('SPARK_HOME')}")

In [None]:
# Name displayed for this application on the Spark cluster UI page
app_name = "WELFake Exploratory Data Anlaysis (EDA)"
# "local[*]" runs Spark locally with one worker thread per logical core;
# use "local[k]" to run with exactly k worker threads instead.
master = "local[*]"

# Collect the application settings in a SparkConf
spark_conf = SparkConf()
spark_conf.setMaster(master)
spark_conf.setAppName(app_name)

# Create (or reuse) the session, then keep only ERROR-level log output
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

## 5. Load datasets


In [None]:
# Read the WELFake CSV into a Spark DataFrame.
# multiLine is required because title/text values contain line breaks, and
# both the quote and the escape character are the double quote.
welfake_df = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        quote='"',
        multiLine=True,
        escape='"',
    )
    .csv("data/WELFake_Dataset.csv")
)

# Show the first three rows as a sanity check
welfake_df.show(3)

In [None]:
# The first CSV column has no header and is read as "_c0"; call it "index"
welfake_df = welfake_df.withColumnRenamed("_c0", "index")

# Dataset dimensions
num_rows, num_cols = welfake_df.count(), len(welfake_df.columns)

print(f"Rows: {num_rows}")
print(f"Columns: {num_cols}")

# Column names and inferred types
welfake_df.printSchema()

## 6. Remove duplicates


In [None]:
# Row count before de-duplication
original_count = welfake_df.count()

# Keep a single row for every distinct (title, text) pair
welfake_df_dedup = welfake_df.drop_duplicates(subset=["title", "text"])

# Row count afterwards, and how many rows were dropped
deduped_count = welfake_df_dedup.count()
duplicates_removed = original_count - deduped_count

print(f"Original rows: {original_count}")
print(f"Duplicates removed: {duplicates_removed}")
print(f"After dataset size: {deduped_count} rows")

## 7. Clean title and article texts

In [None]:
# Wrap clean_text as a Spark UDF that returns a string column
clean_text_udf = F.udf(clean_text, returnType=StringType())

In [None]:
# Add cleaned copies of the title and text columns
welfake_df_clean = (
    welfake_df_dedup
    .withColumn("cleaned_title", clean_text_udf(F.col("title")))
    .withColumn("cleaned_text", clean_text_udf(F.col("text")))
)

# Compare raw and cleaned values side by side (cells truncated to 80 characters)
welfake_df_clean.select(
    ["title", "cleaned_title", "text", "cleaned_text"]
).show(5, truncate=80)

## 8. Remove null and empty string values

In [None]:
# Keep only rows that have a non-empty cleaned text, a non-empty cleaned
# title and a label
welfake_df_processed = (
    welfake_df_clean
    .where(col("cleaned_text").isNotNull() & (col("cleaned_text") != ""))
    .where(col("cleaned_title").isNotNull() & (col("cleaned_title") != ""))
    .where(col("label").isNotNull())
)

# Row counts before and after, to report how many rows the filter removed
clean_count = welfake_df_clean.count()
processed_count = welfake_df_processed.count()
removed_empty = clean_count - processed_count

print(f"Removed empty text rows: {removed_empty}")
print(f"After dataset size: {processed_count} rows")

## 9. Remove outliers based on text word count

### 9.1. Calculate article text word count

In [None]:
# Word count of the cleaned text: number of pieces after splitting on whitespace
text_word_count = size(split(col("cleaned_text"), "\\s+"))
welfake_df_wc = welfake_df_processed.withColumn("text_wc", text_word_count)

welfake_df_wc.select(["cleaned_text", "text_wc"]).show(3)

### 9.2. Remove outliers based on percentile values

In [None]:
# Probabilities to inspect in each tail of the text word-count distribution
percentiles_upper_tail = [0.96, 0.97, 0.98, 0.99]
percentiles_lower_tail = [0.01, 0.02, 0.03, 0.04]

# Approximate quantiles of text_wc (relative error 0.01) for both tails
upper_tail_quantiles = welfake_df_wc.approxQuantile(
    col="text_wc", probabilities=percentiles_upper_tail, relativeError=0.01
)
lower_tail_quantiles = welfake_df_wc.approxQuantile(
    col="text_wc", probabilities=percentiles_lower_tail, relativeError=0.01
)

# Print the values so cut-off points can be chosen
print(f"Upper tail (96% to 99%): {upper_tail_quantiles}")
print(f"Lower tail (1% to 4%): {lower_tail_quantiles}")

In [None]:
# Fixed word-count cut-offs. These are hard-coded, not computed here; they are
# intended to be the 2nd and 98th percentile values read from the quantile
# output printed in the previous cell.
lower_bound = 26
upper_bound = 1683

print(f"Filter out text_wc < {lower_bound} or > {upper_bound}\n")

# Keep rows whose word count lies within [lower_bound, upper_bound].
# between() is inclusive on both ends, matching the message above: only values
# strictly below the lower or strictly above the upper bound are dropped.
welfake_df_filtered = welfake_df_wc.filter(
    F.col("text_wc").between(lower_bound, upper_bound)
)

# Count the rows that remain and how many outliers were removed
outlier_count = welfake_df_filtered.count()
removed_outlier = processed_count - outlier_count

print(f"Removed outlier text rows: {removed_outlier}")
print(f"After dataset size: {outlier_count} rows")

## 10. Feature Engineering

### 10.1. Create quantity feature columns using FeaturesSpark()

In [None]:
# Quantity feature extractor
feat_spark = FeaturesSpark()

# Append the quantity feature columns computed from the cleaned text
welfake_df_feat_spark = feat_spark.transform(welfake_df_filtered, "cleaned_text")

In [None]:
# Preview the quantity feature columns next to the cleaned inputs and the label
welfake_df_feat_spark.select([
    "cleaned_title",
    "cleaned_text",
    "num_characters",
    "num_special_characters",
    "num_capital_letters",
    "num_words",
    "num_sentences",
    "words_per_sentence",
    "num_short_sentences",
    "num_long_sentences",
    "label",
]).show(5)

### 10.2. Create POS feature columns using POSFeatures()

In [None]:
# Part-of-speech feature extractor (default spaCy model)
pos_features = POSFeatures()

# Append the POS count columns computed from the cleaned text
welfake_df_pos_feat = pos_features.transform(welfake_df_feat_spark, "cleaned_text")

In [None]:
# Preview the POS count columns next to the cleaned inputs and the label
welfake_df_pos_feat.select([
    "cleaned_title",
    "cleaned_text",
    "num_nouns",
    "num_verbs",
    "num_adjectives",
    "num_adverbs",
    "num_determiners",
    "label",
]).show(5)

### 10.3. Create Readability features using ReadabilityIndices()

In [None]:
# Readability feature extractor
readability = ReadabilityIndices()

# Append the readability columns computed from the cleaned text
welfake_df_readability = readability.extract_features(
    df=welfake_df_pos_feat,
    text_col="cleaned_text",
)

In [None]:
# Preview the readability columns next to the cleaned inputs and the label
welfake_df_readability.select([
    "cleaned_title",
    "cleaned_text",
    "gunning_fog",
    "smog",
    "ari",
    "num_syllables",
    "label",
]).show(5)


### 10.4. Create psycholinguistics features using Psycholinguistics()

In [None]:
# Append polarity, subjectivity and title/text similarity columns
welfake_df_psycho = Psycholinguistics.extract_features(
    welfake_df_readability, "cleaned_text", "cleaned_title"
)

In [None]:
# Preview the psycholinguistic columns next to the cleaned text and the label
welfake_df_psycho.select([
    "cleaned_text",
    "polarity",
    "subjectivity",
    "title_similarity",
    "label",
]).show(5)


### 10.5. Extract engineered features

In [None]:
# Keep the cleaned text, every engineered feature and the label for modelling
welfake_df_preprocessed = welfake_df_psycho.select([
    "cleaned_text",
    # quantity features
    "num_characters",
    "num_special_characters",
    "num_capital_letters",
    "num_words",
    "num_sentences",
    "words_per_sentence",
    "num_short_sentences",
    "num_long_sentences",
    # part-of-speech counts
    "num_nouns",
    "num_verbs",
    "num_adjectives",
    "num_adverbs",
    "num_determiners",
    # readability indices
    "gunning_fog",
    "smog",
    "ari",
    "num_syllables",
    # psycholinguistic features
    "polarity",
    "subjectivity",
    "title_similarity",
    "label",
])

# Mark the result for caching so later actions can reuse it
welfake_df_preprocessed.cache()

In [None]:
# Show the first five rows of the final feature table
welfake_df_preprocessed.show(n=5)