## 1. Install Libraries

In [None]:
"""
STILL NEED TO ADD VERSIONS (AFTER FILE IS CONFIRMED)
"""
%pip install spacy
%pip install pyarrow
%pip install textblob
%pip install textstat
!python -m spacy download en_core_web_sm

## 2. Import Libraries

In [1]:
# OS environment
import os

# Import SparkConf class into program
from pyspark import SparkConf

# Import SparkContext and SparkSession classes
from pyspark import SparkContext # Spark
from pyspark.sql import SparkSession # Spark SQL

# PySpark Data Operations
from pyspark.sql.functions import col, size, split, udf
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# Regex
import re

# Numeric operations
import numpy as np

# Define custom schema (data types) for PySpark Dataframes
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

# spaCy model for natural language processing
import spacy

# Pandas
import pandas as pd

# Pscholinguistics
from textblob import TextBlob

# Readability features
import textstat



## 3. Function and Classes

### 3.1. clean_text function

In [2]:
# Define text cleaning function
def clean_text(text):
    
    """
    Clean the input text string by removing unwanted elements while keeping useful punctuation.

    Steps performed:
    - Convert non-ASCII quotes/aprostrophes with ASCII equivalents
    - Remove URLs (e.g. http://..., www...)
    - Remove Twitter-style mentions (@username) and hashtags (#hashtag)
    - Remove HTML entities (e.g. &nbsp;)
    - Remove emojis and non-ASCII characters
    - Normalize whitespace (convert multiple spaces/tabs/newlines into a single space)
    - Trim leading and trailing spaces

    Args:
        text (str or None): The input text to clean.

    Returns:
        str: A cleaned version of the input text. If input is None, returns an empty string.
    """
    
    if text is None:
        return ""
    
    # Replace curly quotes/apostrophes with ASCII equivalents
    replacements = {
        '“': '"', '”': '"',
        '‘': "'", '’': "'"
    }
    for curly, straight in replacements.items():
        text = text.replace(curly, straight)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)
    
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    
    # Remove HTMLs
    text = re.sub(r'&\w+;', '', text)
    
    # Remove emojis and other non-ASCII symbols
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

### 3.2. FeaturesSpark Class

In [3]:
class FeaturesSpark:
    """
    Features that can be computed efficiently using PySpark.
    """
    VOWELS = "aeiouyAEIOUY"

    def __init__(self):
        pass

    def transform(self, df, text_col):
        txt = F.coalesce(F.col(text_col), F.lit(""))

        # Character count
        df = df.withColumn("num_characters", F.length(txt))

        # Capital letters
        df = df.withColumn("num_capital_letters", F.length(F.regexp_replace(txt, r"[^A-Z]", "")))

        # Word count
        df = df.withColumn("num_words", F.size(F.split(txt, r"\s+")))

        # Sentence count
        df = df.withColumn("num_sentences", F.size(F.split(txt, r"[.!?]+")))

        # Words per sentence
        df = df.withColumn(
            "words_per_sentence", 
            F.when(F.col("num_sentences") > 0, F.col("num_words") / F.col("num_sentences"))
             .otherwise(F.lit(0))
        )

        # Short sentences (<10 words)
        df = df.withColumn(
            "num_short_sentences", 
            F.size(F.expr(f"filter(split({text_col}, '[.!?]+'), x -> size(split(x, ' ')) < 10)"))
        )

        # Long sentences (>=20 words)
        df = df.withColumn(
            "num_long_sentences", 
            F.size(F.expr(f"filter(split({text_col}, '[.!?]+'), x -> size(split(x, ' ')) >= 20)"))
        )

        # Special characters
        df = df.withColumn("num_special_characters", F.length(F.regexp_replace(txt, r"[a-zA-Z0-9\s]", "")))

        return df

### 3.3. POSFeatures Class

In [4]:
class POSFeatures:
    def __init__(self, model="en_core_web_sm"):
        self.model = model

    def _count_pos(self, text, pos_tag):
        # load model lazily (cached per worker)
        if not hasattr(self, "_nlp"):
            self._nlp = spacy.load(self.model, disable=["ner", "parser"])
        doc = self._nlp(text)
        return sum(1 for token in doc if token.pos_ == pos_tag)

    def register_udfs(self, spark):
        return {
            "num_nouns": udf(lambda text: self._count_pos(text, "NOUN"), IntegerType()),
            "num_verbs": udf(lambda text: self._count_pos(text, "VERB"), IntegerType()),
            "num_adjectives": udf(lambda text: self._count_pos(text, "ADJ"), IntegerType()),
            "num_adverbs": udf(lambda text: self._count_pos(text, "ADV"), IntegerType()),
            "num_determiners": udf(lambda text: self._count_pos(text, "DET"), IntegerType()),
        }

    def transform(self, df, text_col):
        spark = df.sql_ctx.sparkSession
        udfs = self.register_udfs(spark)
        for col_name, func in udfs.items():
            df = df.withColumn(col_name, func(text_col))
        return df


### 3.4. ReadabilityIndices Class

In [5]:
class ReadabilityIndices:
    @staticmethod
    def extract_features(df, text_col):
        # Regular UDFs
        gf_udf = F.udf(lambda t: float(textstat.gunning_fog(t)) if t else None, FloatType())
        smog_udf = F.udf(lambda t: float(textstat.smog_index(t)) if t else None, FloatType())
        ari_udf = F.udf(lambda t: float(textstat.automated_readability_index(t)) if t else None, FloatType())
        syllables_udf = F.udf(lambda t: float(textstat.syllable_count(t)) if t else None, FloatType())

        return (df
            .withColumn("gunning_fog", gf_udf(F.col(text_col)))
            .withColumn("smog", smog_udf(F.col(text_col)))
            .withColumn("ari", ari_udf(F.col(text_col)))
            .withColumn("num_syllables", syllables_udf(F.col(text_col)))
        )


### 3.5. Psycholinguistics Class

In [6]:
class Psycholinguistics:
    @staticmethod
    def extract_features(df, text_col, title_col=None):
        # polarity
        def polarity_udf(text):
            if not text:
                return None
            return float(TextBlob(text).sentiment.polarity)
        
        # subjectivity
        def subjectivity_udf(text):
            if not text:
                return None
            return float(TextBlob(text).sentiment.subjectivity)
        
        # title similarity
        def title_similarity_udf(text, title):
            if not text or not title:
                return None
            text_words = set(text.lower().split())
            title_words = set(title.lower().split())
            if not text_words or not title_words:
                return 0
            return float(len(text_words & title_words) / len(text_words | title_words))

        df = df.withColumn("polarity", F.udf(polarity_udf, FloatType())(F.col(text_col)))
        df = df.withColumn("subjectivity", F.udf(subjectivity_udf, FloatType())(F.col(text_col)))
        if title_col:
            df = df.withColumn("title_similarity", F.udf(title_similarity_udf, FloatType())(
                F.col(text_col), F.col(title_col)
            ))
        else:
            df = df.withColumn("title_similarity", F.lit(None).cast(FloatType()))
        return df

## 4. Configure Spark Environment
Using the code snippets from tutorial 1 and 2, set up the Spark environment and configure the Spark Application using SparkConf.

In [7]:
# spark_home = os.environ.get("SPARK_HOME")

# if spark_home:
#     print(f"SPARK_HOME: {spark_home}")
# else:
#     print("SPARK_HOME environement variable is not set.")

# os.environ["SPARK_HOME"]= "/usr/local/lib/python3.10/dist-packages/pyspark"

# print (f"SPARK_HOME is now set to: {os.environ.get('SPARK_HOME')}")

In [8]:
# # local[*]: run Spark in local mode with as many working processors as logical cores on your machine
# # If we want Spark to run locally with 'k' worker threads, we can specify as "local[k]".
# master = "local[*]"
# # The `appName` field is a name to be shown on the Spark cluster UI page
# app_name = "WELFake Exploratory Data Anlaysis (EDA)"
# # Setup configuration parameters for Spark
# spark_conf = SparkConf().setMaster(master).setAppName(app_name)

# # Setup SparkSession
# spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
# sc = spark.sparkContext
# sc.setLogLevel('ERROR')

In [9]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Setup configuration parameters for Spark
spark_conf = (
    SparkConf()
    .setMaster("local[12]")               # safer: 8 threads instead of 16
    .setAppName("Fake News Detection")
    .set("spark.driver.memory", "32g")
    .set("spark.executor.memory", "32g")
    .set("spark.executor.cores", "8")
    .set("spark.sql.shuffle.partitions", "256")
    .set("spark.memory.fraction", "0.8")
    .set("spark.memory.offHeap.enabled", "true")
    .set("spark.memory.offHeap.size", "4g")
)

# Setup SparkSession
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

print("Spark Session started with settings:")
for k, v in sc.getConf().getAll():
    if "memory" in k or "cores" in k:
        print(f"{k} = {v}")

25/08/19 22:25:55 WARN Utils: Your hostname, nel-X600-ITX resolves to a loopback address: 127.0.1.1; using 192.168.0.23 instead (on interface wlp4s0)
25/08/19 22:25:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/08/19 22:25:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/19 22:25:56 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/08/19 22:25:56 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


Spark Session started with settings:
spark.memory.offHeap.size = 4g
spark.driver.memory = 32g
spark.memory.fraction = 0.8
spark.memory.offHeap.enabled = true
spark.executor.memory = 32g
spark.executor.cores = 8


## 5. Load datasets


In [11]:
# Load dataset into Spark dataframe
welfake_df = spark.read.csv(
    "WELFake_Dataset.csv",
    header=True,
    inferSchema=True,
    quote='"', 
    multiLine=True, #multilines in text and title data
    escape='"'
)

# Display sample rows
welfake_df.show(3)

                                                                                

+---+--------------------+--------------------+-----+
|_c0|               title|                text|label|
+---+--------------------+--------------------+-----+
|  0|LAW ENFORCEMENT O...|No comment is exp...|    1|
|  1|                null|Did they post the...|    1|
|  2|UNBELIEVABLE! OBA...| Now, most of the...|    1|
+---+--------------------+--------------------+-----+
only showing top 3 rows



In [12]:
# Rename first column as index
welfake_df = welfake_df.withColumnRenamed("_c0", "index")

# Show dataframe dimensions
num_rows = welfake_df.count()
num_cols = len(welfake_df.columns)

print(f"Rows: {num_rows}")
print(f"Columns: {num_cols}")

#Print the Schema
welfake_df.printSchema()

Rows: 72134
Columns: 4
root
 |-- index: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)



                                                                                

## 6. Remove duplicate


In [13]:
# Count original dataset rows
original_count = welfake_df.count()

# Remove duplicate news articles
welfake_df_dedup = welfake_df.dropDuplicates(["title", "text"])

deduped_count = welfake_df_dedup.count()
duplicates_removed = original_count - deduped_count

print(f"Original rows: {original_count}")
print(f"Duplicates removed: {duplicates_removed}")
print(f"After dataset size: {deduped_count} rows")

[Stage 7:>                                                          (0 + 1) / 1]

Original rows: 72134
Duplicates removed: 8456
After dataset size: 63678 rows


                                                                                

## 7. Clean title and article texts

In [14]:
# Register udf to pyspark
clean_text_udf = udf(clean_text, StringType())

In [None]:
# Apply cleaning to title and text
welfake_df_clean = welfake_df_dedup.withColumn("cleaned_title", clean_text_udf("title")) \
                       .withColumn("cleaned_text", clean_text_udf("text"))

# Preview results
welfake_df_clean.select("title", "cleaned_title", "text", "cleaned_text").show(5)

[Stage 10:>                                                         (0 + 1) / 1]

+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+
|                                                                           title|                                                                   cleaned_title|                                                                            text|                                                                    cleaned_text|
+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+
|                     

                                                                                

## 8. Remove null and empty string values

In [16]:
# Remove null or empty string values
welfake_df_processed = welfake_df_clean.filter(
    (col("cleaned_text").isNotNull()) & 
    (col("cleaned_text") != "") &
    (col("cleaned_title").isNotNull()) & 
    (col("cleaned_title") != "") &
    (col("label").isNotNull()) 
)

# Count the number of rows with empty values removed
clean_count = welfake_df_clean.count()
processed_count = welfake_df_processed.count()
removed_empty = clean_count - processed_count

print(f"Removed empty text rows: {removed_empty}")
print(f"After dataset size: {processed_count} rows")



Removed empty text rows: 1186
After dataset size: 62492 rows


                                                                                

## 9. Remove outlier based on text word count

### 9.1. Calculate article text word count

In [17]:
# Calculate text word count
welfake_df_wc = welfake_df_processed.withColumn("text_wc", size(split(col("cleaned_text"), "\\s+")))

welfake_df_wc.select("cleaned_text", "text_wc").show(3)

[Stage 18:>                                                         (0 + 1) / 1]

+--------------------+-------+
|        cleaned_text|text_wc|
+--------------------+-------+
|Next Prev Swipe l...|     49|
|Wikileaks, which ...|    523|
|The American job ...|    257|
+--------------------+-------+
only showing top 3 rows



Traceback (most recent call last):
  File "/home/nel/iti5202_big_data/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/home/nel/iti5202_big_data/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/home/nel/iti5202_big_data/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 642, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/home/nel/iti5202_big_data/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 595, in read_int
    raise EOFError
EOFError
                                                                                

### 9.2. Remove outlier based on percentile values

In [18]:
# Calculate key percentiles for text word count
percentiles_upper_tail = [0.96, 0.97, 0.98, 0.99]
percentiles_lower_tail = [0.01, 0.02, 0.03, 0.04]

# Compute percentiles
upper_tail_quantiles = welfake_df_wc.approxQuantile("text_wc", percentiles_upper_tail, 0.01)
lower_tail_quantiles = welfake_df_wc.approxQuantile("text_wc", percentiles_lower_tail, 0.01)

# Show quantile values for analysis
print(f"Upper tail (96% to 99%): {upper_tail_quantiles}")
print(f"Lower tail (1% to 4%): {lower_tail_quantiles}")



Upper tail (96% to 99%): [1409.0, 1517.0, 1688.0, 24173.0]
Lower tail (1% to 4%): [1.0, 26.0, 40.0, 54.0]


                                                                                

In [19]:
# Calculate 2nd and 98th percentiles
lower_bound = 26
upper_bound = 1688

print(f"Filter out text_wc < {lower_bound} or > {upper_bound}\n")

# Filter out values below the 2nd and above the 98th percentiles
welfake_df_filtered = welfake_df_wc.filter(
    (F.col("text_wc") > lower_bound) & (F.col("text_wc") < upper_bound)
)

# Count the number of rows with empty values removed
outlier_count = welfake_df_filtered.count()
removed_outlier = processed_count - outlier_count

print(f"Removed outlier text rows: {removed_outlier}")
print(f"After dataset size: {outlier_count} rows")

Filter out text_wc < 26 or > 1688





Removed outlier text rows: 2832
After dataset size: 59660 rows


                                                                                

## 10. Feature Engineering

### 10.1. Create quantity feature columns using FeaturesSpark()

In [20]:
# Initialise FeaturesSpark
feat_spark = FeaturesSpark()

# Create quantity feature columns
welfake_df_feat_spark = feat_spark.transform(df=welfake_df_filtered, text_col="cleaned_text")

In [22]:
# Preview quanitty feature columns
welfake_df_feat_spark.select(
    "cleaned_title",
    "cleaned_text",
    "num_characters",
    "num_special_characters",
    "num_capital_letters",
    "num_words",
    "num_sentences",
    "words_per_sentence",
    "num_short_sentences",
    "num_long_sentences",
    "label"
).show(5)

[Stage 31:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------+----------------------+-------------------+---------+-------------+------------------+-------------------+------------------+-----+
|       cleaned_title|        cleaned_text|num_characters|num_special_characters|num_capital_letters|num_words|num_sentences|words_per_sentence|num_short_sentences|num_long_sentences|label|
+--------------------+--------------------+--------------+----------------------+-------------------+---------+-------------+------------------+-------------------+------------------+-----+
|Wingsuit flyer vs...|Next Prev Swipe l...|           286|                     9|                  9|       49|            5|               9.8|                  4|                 1|    1|
|Wikileaks Admits ...|Wikileaks, which ...|          3116|                    71|                 63|      523|           27| 19.37037037037037|                  7|                11|    1|
|SOLID: 211,000 Jo...|The American job ...|       

                                                                                

### 10.2. Create POS feature columns using POSFeatures()

In [23]:
# Initialise POSFeatures
pos_features = POSFeatures()

# Create POS feature columns
welfake_df_pos_feat = pos_features.transform(df=welfake_df_feat_spark, text_col="cleaned_text")

In [25]:
# Preview POS feature columns
welfake_df_pos_feat.select(
    "cleaned_title",
    "cleaned_text",
    "num_nouns",
    "num_verbs",
    "num_adjectives",
    "num_adverbs",
    "num_determiners",
    "label"
).show(5)

[Stage 36:>                                                         (0 + 1) / 1]

+--------------------+--------------------+---------+---------+--------------+-----------+---------------+-----+
|       cleaned_title|        cleaned_text|num_nouns|num_verbs|num_adjectives|num_adverbs|num_determiners|label|
+--------------------+--------------------+---------+---------+--------------+-----------+---------------+-----+
|Wingsuit flyer vs...|Next Prev Swipe l...|       13|        7|             3|          5|              8|    1|
|Wikileaks Admits ...|Wikileaks, which ...|       98|       86|            26|         39|             51|    1|
|SOLID: 211,000 Jo...|The American job ...|       66|       25|            16|         17|             25|    0|
|IT JUST GOT REAL!...|One of the big pl...|       65|       61|            31|         19|             54|    1|
|British PM May di...|LONDON (Reuters) ...|       39|       25|            19|          5|             23|    0|
+--------------------+--------------------+---------+---------+--------------+-----------+------

                                                                                

### 10.3. Create Readability features using ReadabilityIndices()

In [26]:
# Initialise ReadabilityIndices
readability = ReadabilityIndices()

# Create readability feature columns
welfake_df_readability = readability.extract_features(welfake_df_pos_feat, "cleaned_text")

In [27]:
# Preview readability columns
welfake_df_readability.select(
    "cleaned_title",
    "cleaned_text",
    "gunning_fog",
    "smog",
    "ari",
    "num_syllables",
    "label"
).show(5)


[Stage 38:>                                                         (0 + 1) / 1]

+--------------------+--------------------+-----------+---------+---------+-------------+-----+
|       cleaned_title|        cleaned_text|gunning_fog|     smog|      ari|num_syllables|label|
+--------------------+--------------------+-----------+---------+---------+-------------+-----+
|Wingsuit flyer vs...|Next Prev Swipe l...|   9.797959|10.125756| 7.572143|         72.0|    1|
|Wikileaks Admits ...|Wikileaks, which ...|  14.012308|13.412044|12.330879|        823.0|    1|
|SOLID: 211,000 Jo...|The American job ...|   9.757804|10.504224|  8.03099|        393.0|    0|
|IT JUST GOT REAL!...|One of the big pl...|  13.243241|13.445107|11.795769|        757.0|    1|
|British PM May di...|LONDON (Reuters) ...|  15.636364|14.937675| 13.38427|        374.0|    0|
+--------------------+--------------------+-----------+---------+---------+-------------+-----+
only showing top 5 rows



                                                                                

### 10.4. Create psycholinguistics features using Psycholinguistics()

In [28]:
# Create psycholinguistics feature columns
welfake_df_psycho = Psycholinguistics.extract_features(
    df=welfake_df_readability,
    text_col="cleaned_text",
    title_col="cleaned_title"
)

In [29]:
# Preview psycholinguistics columns
welfake_df_psycho.select(
    "cleaned_text",
    "polarity",
    "subjectivity",
    "title_similarity",
    "label"
).show(5)


[Stage 40:>                                                         (0 + 1) / 1]

+--------------------+-------------+------------+----------------+-----+
|        cleaned_text|     polarity|subjectivity|title_similarity|label|
+--------------------+-------------+------------+----------------+-----+
|Next Prev Swipe l...|        0.225|  0.49166667|       0.0952381|    1|
|Wikileaks, which ...| -0.036196146|  0.49321994|     0.026578072|    1|
|The American job ...| 0.0031045752|  0.37385622|     0.029940119|    0|
|One of the big pl...|-0.0028036176|  0.43458655|      0.04262295|    1|
|LONDON (Reuters) ...|  -0.03263889|  0.50208336|        0.078125|    0|
+--------------------+-------------+------------+----------------+-----+
only showing top 5 rows



                                                                                

### 10.5. Extract engineered features

In [30]:
# Extract feature columns for machine learning
welfake_df_preprocessed = welfake_df_psycho.select(
    "cleaned_text",
    "num_characters",
    "num_special_characters",
    "num_capital_letters",
    "num_words",
    "num_sentences",
    "words_per_sentence",
    "num_short_sentences",
    "num_long_sentences",
    "num_nouns",
    "num_verbs",
    "num_adjectives",
    "num_adverbs",
    "num_determiners",
    "gunning_fog",
    "smog",
    "ari",
    "num_syllables",
    "polarity",
    "subjectivity",
    "title_similarity",
    "label"
)

# Cache results
welfake_df_preprocessed.cache()

DataFrame[cleaned_text: string, num_characters: int, num_special_characters: int, num_capital_letters: int, num_words: int, num_sentences: int, words_per_sentence: double, num_short_sentences: int, num_long_sentences: int, num_nouns: int, num_verbs: int, num_adjectives: int, num_adverbs: int, num_determiners: int, gunning_fog: float, smog: float, ari: float, num_syllables: float, polarity: float, subjectivity: float, title_similarity: float, label: int]

In [31]:
# Preview preprocessed data
welfake_df_preprocessed.show(5)

[Stage 42:>                                                         (0 + 1) / 1]

+--------------------+--------------+----------------------+-------------------+---------+-------------+------------------+-------------------+------------------+---------+---------+--------------+-----------+---------------+-----------+---------+---------+-------------+-------------+------------+----------------+-----+
|        cleaned_text|num_characters|num_special_characters|num_capital_letters|num_words|num_sentences|words_per_sentence|num_short_sentences|num_long_sentences|num_nouns|num_verbs|num_adjectives|num_adverbs|num_determiners|gunning_fog|     smog|      ari|num_syllables|     polarity|subjectivity|title_similarity|label|
+--------------------+--------------+----------------------+-------------------+---------+-------------+------------------+-------------------+------------------+---------+---------+--------------+-----------+---------------+-----------+---------+---------+-------------+-------------+------------+----------------+-----+
|Next Prev Swipe l...|           2

                                                                                

# 11. Modelling

## Classic ML

In [34]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import (
    LogisticRegression, DecisionTreeClassifier, RandomForestClassifier,
    GBTClassifier, NaiveBayes
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark import StorageLevel

# Define features
feature_cols = [
 'num_characters',
 'num_special_characters',
 'num_capital_letters',
 'num_words',
 'num_sentences',
 'words_per_sentence',
 'num_short_sentences',
 'num_long_sentences',
 'num_nouns',
 'num_verbs',
 'num_adjectives',
 'num_adverbs',
 'num_determiners',
 'gunning_fog',
 'smog',
 'ari',
 'num_syllables',
 'polarity',
 'subjectivity',
 'title_similarity',
]

# Pre-process data once and persist
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_vec", handleInvalid="skip")
scaler = StandardScaler(inputCol="features_vec", outputCol="features", withStd=True, withMean=True)

# Create preprocessing pipeline and apply once
preprocessing_pipeline = Pipeline(stages=[assembler, scaler])
preprocessing_model = preprocessing_pipeline.fit(welfake_df_preprocessed)
processed_df = preprocessing_model.transform(welfake_df_preprocessed)

# Train/test split on preprocessed data
train_df, test_df = processed_df.randomSplit([0.8, 0.2], seed=42)

# Persist with default storage level (faster alternative)
train_df.cache()
test_df.cache()

# Force materialization
train_count = train_df.count()
test_count = test_df.count()
print(f"Train samples: {train_count}, Test samples: {test_count}")

# Optimized models with reduced complexity for speed
models = {
    "Logistic Regression": LogisticRegression(
        featuresCol="features", 
        labelCol="label", 
        maxIter=50,  # Reduced from 100
        regParam=0.01,
        elasticNetParam=0.1
    ),
    "Decision Tree": DecisionTreeClassifier(
        featuresCol="features", 
        labelCol="label",
        maxDepth=10,  # Limit depth
        maxBins=32    # Reduce bins for speed
    ),
    "Random Forest": RandomForestClassifier(
        featuresCol="features", 
        labelCol="label", 
        numTrees=50,      # Reduced from 100
        maxDepth=10,      # Limit depth
        subsamplingRate=0.8,
        featureSubsetStrategy="sqrt"
    ),
    "Gradient Boosting": GBTClassifier(
        featuresCol="features", 
        labelCol="label", 
        maxIter=50,       # Reduced from 100
        maxDepth=5,       # Reduced depth
        stepSize=0.1
    ),
    "Naive Bayes": NaiveBayes(
        featuresCol="features", 
        labelCol="label", 
        modelType="gaussian",
        smoothing=1.0
    )
}

# Pre-create evaluators
bin_eval = BinaryClassificationEvaluator(
    labelCol="label", 
    rawPredictionCol="rawPrediction", 
    metricName="areaUnderROC"
)
acc_eval = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="accuracy"
)
f1_eval = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="f1"
)
precision_eval = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="weightedPrecision"
)
recall_eval = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="weightedRecall"
)

results = []

for name, clf in models.items():
    print(f"Training {name}...")
    
    # Single stage pipeline (preprocessing already done)
    pipeline = Pipeline(stages=[clf])
    model = pipeline.fit(train_df)
    
    # Make predictions
    preds = model.transform(test_df)
    
    # Persist predictions for multiple evaluations
    preds.cache()
    
    # Evaluate all metrics on cached predictions
    acc = acc_eval.evaluate(preds)
    f1 = f1_eval.evaluate(preds)
    precision = precision_eval.evaluate(preds)
    recall = recall_eval.evaluate(preds)
    auc = bin_eval.evaluate(preds)
    
    results.append((name, acc, precision, recall, f1, auc))
    print(f"{name} - Accuracy: {acc:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}")
    
    # Unpersist predictions to free memory
    preds.unpersist()

# Unpersist datasets
train_df.unpersist()
test_df.unpersist()

# Display results
print("\n" + "="*80)
print(f"{'Model':<20} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1':<10} {'AUC':<10}")
print("="*80)
for result in results:
    name, acc, precision, recall, f1, auc = result
    print(f"{name:<20} {acc:<10.4f} {precision:<10.4f} {recall:<10.4f} {f1:<10.4f} {auc:<10.4f}")

                                                                                

Train samples: 47605, Test samples: 12055
Training Logistic Regression...
Logistic Regression - Accuracy: 0.7865, F1: 0.7847, AUC: 0.8441
Training Decision Tree...
Decision Tree - Accuracy: 0.7831, F1: 0.7795, AUC: 0.5033
Training Random Forest...


                                                                                

Random Forest - Accuracy: 0.8096, F1: 0.8074, AUC: 0.8882
Training Gradient Boosting...
Gradient Boosting - Accuracy: 0.8158, F1: 0.8143, AUC: 0.8932
Training Naive Bayes...
Naive Bayes - Accuracy: 0.5736, F1: 0.5594, AUC: 0.5984

Model                Accuracy   Precision  Recall     F1         AUC       
Logistic Regression  0.7865     0.7862     0.7865     0.7847     0.8441    
Decision Tree        0.7831     0.7854     0.7831     0.7795     0.5033    
Random Forest        0.8096     0.8110     0.8096     0.8074     0.8882    
Gradient Boosting    0.8158     0.8162     0.8158     0.8143     0.8932    
Naive Bayes          0.5736     0.6313     0.5736     0.5594     0.5984    


## WELFake Paper

In [36]:
# PySpark implementation of your 2-stage WELFake-style system
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer, StopWordsRemover, NGram, CountVectorizer, IDF,
    VectorAssembler, StandardScaler
)
from pyspark.ml.classification import LinearSVC

# --------------------------------------------------------------------------------------
# Assumptions:
# - df (Spark DataFrame) has columns: cleaned_text (string), label (0/1),
#   and all 20 LFS numeric columns you listed (already computed).
# - Binary classification (0=real, 1=fake). LinearSVC works for binary labels.
# --------------------------------------------------------------------------------------

# ===== 0) Basic hygiene ===============================================================
df = welfake_df_preprocessed  # your Spark DataFrame
# Cast label to double and ensure no nulls in text/LFS
df = (df
      .withColumn("label", F.col("label").cast("double"))
      .withColumn("cleaned_text", F.coalesce(F.col("cleaned_text"), F.lit(""))))

# The 20 features total (adjust names to match your frame if needed)
# feature_cols = [
#     'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
#     'gunning_fog', 'smog', 'ari',
#     'polarity', 'title_similarity', 'subjectivity',
#     'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
#     'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
# ]

feature_cols = [
#  'num_characters',
 'num_special_characters',
 'num_capital_letters',
 'num_words',
 'num_sentences',
 'words_per_sentence',
 'num_short_sentences',
 'num_long_sentences',
#  'num_nouns',
 'num_verbs',
 'num_adjectives',
 'num_adverbs',
 'num_determiners',
 'gunning_fog',
 'smog',
 'ari',
 'num_syllables',
 'polarity',
 'subjectivity',
 'title_similarity',
]

for c in feature_cols:
    df = df.withColumn(c, F.col(c).cast("double"))
df = df.fillna(0, subset=feature_cols)

# Your LFS splits (can be tweaked)
LFS1 = ['num_special_characters', 'num_determiners', 'num_capital_letters', 'gunning_fog', 'polarity', 'num_syllables']
# LFS2 = ['num_short_sentences', 'smog', 'title_similarity', 'subjectivity', 'num_words', 'rate_adj_adv']
LFS2 = ['num_short_sentences', 'smog', 'title_similarity', 'subjectivity', 'num_words']
# LFS3 = ['num_long_sentences', 'ari', 'num_articles', 'num_verbs', 'num_sentences', 'words_per_sentence']
LFS3 = ['num_long_sentences', 'ari', 'num_verbs', 'num_sentences', 'words_per_sentence']

In [37]:
# ===== 1) Stratified split (approx) ===================================================
# Keep class balance similar to scikit-learn's stratify
# Create a stable row id to separate train/test
df = df.withColumn("_row_id", F.monotonically_increasing_id())

test_frac = 0.30
label_vals = [r[0] for r in df.select("label").distinct().collect()]
fractions = {float(k): test_frac for k in label_vals}
test_df = df.sampleBy("label", fractions=fractions, seed=42)
train_df = df.join(test_df.select("_row_id"), on="_row_id", how="left_anti")

In [38]:
# ===== 2) Base text pipeline: tokens -> (uni,bigram) -> CV -> TF-IDF ==================
# Tokenize + stopword removal
tok = RegexTokenizer(inputCol="cleaned_text", outputCol="tokens", pattern="\\W+", toLowercase=True)
sw = StopWordsRemover(inputCol="tokens", outputCol="tokens_sw")

# 2-grams
bi = NGram(n=2, inputCol="tokens_sw", outputCol="bigrams")

# CountVectorizer for unigrams and bigrams (vocabSize similar to max_features=5000)
cv_uni = CountVectorizer(inputCol="tokens_sw", outputCol="tf_uni", vocabSize=5000, minDF=2)
cv_bi  = CountVectorizer(inputCol="bigrams",  outputCol="tf_bi",  vocabSize=5000, minDF=2)

# Concatenate uni+bi into "cv_features"
cv_asm = VectorAssembler(inputCols=["tf_uni", "tf_bi"], outputCol="cv_features")

# TF-IDF over the same tf (use IDF on the combined term-freqs)
idf = IDF(inputCol="cv_features", outputCol="tfidf_features", minDocFreq=2)

text_pipe = Pipeline(stages=[tok, sw, bi, cv_uni, cv_bi, cv_asm, idf])
text_model = text_pipe.fit(train_df)
train_tx = text_model.transform(train_df).cache()
test_tx  = text_model.transform(test_df).cache()

                                                                                

In [39]:
# 3) Helper to train one SVM on (CV ⊕ scaled LFS) — make columns unique
from pyspark.ml.classification import LinearSVC
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler

def train_lfs_enabled_svm(lfs_cols, stub):
    # stub e.g. "pred1", "pred2", "pred3"
    lfs_vec   = VectorAssembler(inputCols=lfs_cols, outputCol=f"{stub}_lfs_vec")
    lfs_scale = StandardScaler(inputCol=f"{stub}_lfs_vec", outputCol=f"{stub}_lfs_scaled",
                               withStd=True, withMean=True)
    feat_asm  = VectorAssembler(inputCols=["cv_features", f"{stub}_lfs_scaled"], outputCol=f"{stub}_features")

    svm = LinearSVC(
        featuresCol=f"{stub}_features",
        labelCol="label",
        predictionCol=stub,                  # e.g., pred1
        rawPredictionCol=f"{stub}_raw",      # e.g., pred1_raw  <<< UNIQUE!
        maxIter=100, regParam=0.1
    )
    return Pipeline(stages=[lfs_vec, lfs_scale, feat_asm, svm]).fit(train_tx)

# Train the three LFS-enabled SVMs
svm1_model = train_lfs_enabled_svm(LFS1, "pred1")
svm2_model = train_lfs_enabled_svm(LFS2, "pred2")
svm3_model = train_lfs_enabled_svm(LFS3, "pred3")

# Transform sequentially (now no column name clashes)
preds = svm1_model.transform(test_tx)
preds = svm2_model.transform(preds)
preds = svm3_model.transform(preds)

# Stage-1 vote
from pyspark.sql import functions as F
preds = preds.withColumn(
    "P6", F.when((F.col("pred1") + F.col("pred2") + F.col("pred3")) >= 2.0, 1.0).otherwise(0.0)
)

                                                                                

In [40]:
# 4) CV-only and TF-IDF-only SVMs — also give unique rawPredictionCol
svm_cv = LinearSVC(
    featuresCol="cv_features", labelCol="label",
    predictionCol="pred_cv", rawPredictionCol="raw_cv",
    maxIter=100, regParam=0.1
)
svm_ti = LinearSVC(
    featuresCol="tfidf_features", labelCol="label",
    predictionCol="pred_tfidf", rawPredictionCol="raw_tfidf",
    maxIter=100, regParam=0.1
)

svm_cv_model = svm_cv.fit(train_tx)
svm_ti_model = svm_ti.fit(train_tx)

preds = svm_cv_model.transform(preds)
preds = svm_ti_model.transform(preds)

# Final vote across {P6, pred_cv, pred_tfidf}
preds = preds.withColumn(
    "final_pred",
    F.when((F.col("P6") + F.col("pred_cv") + F.col("pred_tfidf")) >= 2.0, 1.0).otherwise(0.0)
)

# Accuracy
acc = preds.select((F.col("final_pred") == F.col("label")).cast("double").alias("correct")) \
           .agg(F.avg("correct").alias("accuracy")).collect()[0]["accuracy"]
print(f"Final Accuracy: {acc:.4f}")




Final Accuracy: 0.9532


                                                                                