### Feature Extraction

In [1]:
import pandas as pd
import re
import textstat
import nltk
from textblob import TextBlob
from nltk import word_tokenize, pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bachtiarherdianto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bachtiarherdianto/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
def extract_features(text):
    try:
        tokens = word_tokenize(text)
        words = [w for w in tokens if w.isalpha()]
        sentences = re.split(r'[.!?]+', text)
        pos_tags = pos_tag(words)
        
        # Writing pattern
        num_special_chars = len(re.findall(r'[^a-zA-Z0-9\s]', text))
        num_determinants = sum(1 for w in words if w.lower() in ['the', 'a', 'an'])
        num_capital_letters = sum(1 for c in text if c.isupper())
        num_short_sentences = sum(1 for s in sentences if len(s.split()) < 10)
        num_long_sentences = sum(1 for s in sentences if len(s.split()) > 20)

        # Readability indices
        gunning_fog = textstat.gunning_fog(text)
        smog = textstat.smog_index(text)
        ari = textstat.automated_readability_index(text)

        # Psycholinguistics
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity
        subjectivity = blob.sentiment.subjectivity
        title_similarity = 0  # Optional, you can compute this with cosine or Jaccard if you have 'title'

        # Quantity
        num_syllables = textstat.syllable_count(text)
        num_words = len(words)
        num_sentences = len([s for s in sentences if s.strip()])
        num_adjectives = sum(1 for _, tag in pos_tags if tag in ['JJ', 'JJR', 'JJS'])
        num_adverbs = sum(1 for _, tag in pos_tags if tag in ['RB', 'RBR', 'RBS'])
        num_verbs = sum(1 for _, tag in pos_tags if tag.startswith('VB'))
        num_articles = sum(1 for w in words if w.lower() in ['a', 'an', 'the'])

        rate_adj_adv = (num_adjectives + num_adverbs) / num_words if num_words > 0 else 0
        words_per_sentence = num_words / num_sentences if num_sentences > 0 else 0

        return pd.Series([
            num_special_chars, num_determinants, num_capital_letters, num_short_sentences, num_long_sentences,
            gunning_fog, smog, ari,
            polarity, title_similarity, subjectivity,
            num_syllables, num_words, rate_adj_adv, words_per_sentence,
            num_articles, num_verbs, num_sentences, num_adjectives, num_adverbs
        ])
    except:
        return pd.Series([None]*20)

In [6]:
df = pd.read_csv('cleaned_welfake.csv')

# # Sample 50% from each class
# df_sampled = df.groupby('label', group_keys=False).sample(frac=0.5, random_state=42)

# Sample 2 rows per class
df_sampled = df.groupby('label', group_keys=False).sample(n=10000, random_state=42)
df_sampled.head()

Unnamed: 0,index,cleaned_title,cleaned_text,label
21354,17322,Trump's choice for U.S. attorney general says ...,WASHINGTON (Reuters) - U.S. President-elect Do...,0
25012,21871,"Alison Wright, Exiled From 'The Americans' (Pe...",It took Alison Wright 34 years to land her fir...,0
9901,32704,Kurdistan supervisors begin counting votes in ...,"ERBIL, Iraq (Reuters) - Voting stations set up...",0
24177,35894,New Saudi king ascends to the throne as terror...,At 3 a.m. on a cold desert night earlier this ...,0
37552,24368,May shook on gentlemen's agreement on Brexit d...,BRUSSELS (Reuters) - An interim Brexit deal st...,0


In [7]:
# Assuming your dataframe is called df and has a 'text' column
feature_columns = [
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

# # Apply feature extraction and add prefix to the resulting columns
# df_sampled[text_feature_cols := ['text_' + col for col in feature_columns]] = (
#     df_sampled['cleaned_text'].apply(extract_features)
# )

# # Apply feature extraction and add prefix to the resulting columns
# df_sampled[title_feature_cols := ['title_' + col for col in feature_columns]] = (
#     df_sampled['cleaned_title'].apply(extract_features)
# )

df_sampled[feature_columns] = df_sampled['cleaned_text'].apply(extract_features)

df_sampled.info()

Traceback (most recent call last):
  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 467, in main
    split_index = read_int(infile)
  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 593, in read_int
    length = stream.read(4)
KeyboardInterrupt
Traceback (most recent call last):
  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspar

KeyboardInterrupt: 

### Modelling

#### Pandas Implementation

##### Classic ML

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
    AdaBoostClassifier,
)

In [6]:
feature_cols = [
# text_feature_cols + title_feature_cols
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

X = df_sampled[feature_cols]
y = df_sampled['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Bagging': BaggingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    results[name] = {
        'accuracy': acc,
        'report': classification_report(y_test, y_pred, output_dict=True)
    }
    # print(f"✅ {name} Accuracy: {acc:.4f}")
    # print(classification_report(y_test, y_pred))

accuracy_df = pd.DataFrame([
    {'Model': name, 'Accuracy': result['accuracy']}
    for name, result in results.items()
])
print(accuracy_df.sort_values(by='Accuracy', ascending=False))

                 Model  Accuracy
1                  SVM     0.825
9  Logistic Regression     0.825
4              Bagging     0.725
6        Random Forest     0.725
8          Extra Trees     0.725
0                  KNN     0.675
5             AdaBoost     0.675
2          Naive Bayes     0.650
3        Decision Tree     0.650
7    Gradient Boosting     0.650


##### WELFake Approach

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

feature_cols = [
# text_feature_cols + title_feature_cols
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

# Example: Define your LFS feature groups (replace with your actual 20 features split into 3 sets)
LFS1 = ['num_special_chars', 'num_determinants', 'num_capital_letters', 
        'gunning_fog', 'polarity', 'num_syllables']
LFS2 = ['num_short_sentences', 'smog', 'title_similarity',
        'subjectivity', 'num_words', 'rate_adj_adv']
LFS3 = ['num_long_sentences', 'ari', 'num_articles', 
        'num_verbs', 'num_sentences', 'words_per_sentence']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled, df_sampled['label'], test_size=0.3, random_state=42, stratify=df_sampled['label']
)

# Function to apply CV + LFS
def cv_over_lfs(X_train, X_test, lfs_cols):
    """Apply Count Vectorizer to text + concatenate LFS numeric features."""
    cv = CountVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_cv = cv.fit_transform(X_train['cleaned_text'])
    X_test_cv = cv.transform(X_test['cleaned_text'])

    # Scale LFS numeric features
    scaler = StandardScaler()
    X_train_lfs = scaler.fit_transform(X_train[lfs_cols])
    X_test_lfs = scaler.transform(X_test[lfs_cols])

    # Combine sparse CV with dense LFS
    X_train_combined = hstack([X_train_cv, X_train_lfs])
    X_test_combined = hstack([X_test_cv, X_test_lfs])
    
    return X_train_combined, X_test_combined

# Generate embedded sets
Xtr_LFS1, Xte_LFS1 = cv_over_lfs(X_train, X_test, LFS1)
Xtr_LFS2, Xte_LFS2 = cv_over_lfs(X_train, X_test, LFS2)
Xtr_LFS3, Xte_LFS3 = cv_over_lfs(X_train, X_test, LFS3)

# Define base model (SVM as per WELFake best performer)
svm1 = SVC(kernel='linear', probability=True, random_state=42)
svm2 = SVC(kernel='linear', probability=True, random_state=42)
svm3 = SVC(kernel='linear', probability=True, random_state=42)

# Fit each SVM
svm1.fit(Xtr_LFS1, y_train)
svm2.fit(Xtr_LFS2, y_train)
svm3.fit(Xtr_LFS3, y_train)

# Stage 1 voting: combine predictions from LFS1, LFS2, LFS3
stage1_vote = VotingClassifier(
    estimators=[
        ('svm1', svm1),
        ('svm2', svm2),
        ('svm3', svm3)
    ],
    voting='hard'
)
stage1_vote.fit(
    hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]),  # Stack features for VotingClassifier fit
    y_train
)

# Stage 1 predictions
P6_train = stage1_vote.predict(hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]))
P6_test = stage1_vote.predict(hstack([Xte_LFS1, Xte_LFS2, Xte_LFS3]))

# ----- Stage 2: Combine P6 with CV-only and TF-IDF-only -----
from sklearn.feature_extraction.text import TfidfVectorizer

# CV-only on full text
cv_full = CountVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_cv_full = cv_full.fit_transform(X_train['cleaned_text'])
Xte_cv_full = cv_full.transform(X_test['cleaned_text'])

# TF-IDF-only on full text
tfidf_full = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_tfidf_full = tfidf_full.fit_transform(X_train['cleaned_text'])
Xte_tfidf_full = tfidf_full.transform(X_test['cleaned_text'])

# Final stage voting: P6, CV, TF-IDF
final_vote = VotingClassifier(
    estimators=[
        ('cv_svm', SVC(kernel='linear', probability=True).fit(Xtr_cv_full, y_train)),
        ('tfidf_svm', SVC(kernel='linear', probability=True).fit(Xtr_tfidf_full, y_train)),
        ('lfs_vote', stage1_vote)
    ],
    voting='hard'
)

final_vote.fit(Xtr_cv_full, y_train)  # Fit on one set, predictions from others are internal

# Final prediction
final_preds = final_vote.predict(Xte_cv_full)

from sklearn.metrics import accuracy_score
print("Final Accuracy:", accuracy_score(y_test, final_preds))

Final Accuracy: 0.8666666666666667


#### Pyspark Implementation

In [1]:
from pyspark.sql import SparkSession

# Start Spark session (only once)
spark = SparkSession.builder \
    .appName("Fake News Detection") \
    .master("local[*]") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

spark.conf.set("spark.sql.shuffle.partitions", "48")

25/08/19 15:03:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Load dataset
spark_df = spark.read.csv(
    "cleaned_welfake.csv",
    header=True,
    inferSchema=True,
    quote='"',
    escape='"',
    multiLine=True  # needed since your text column contains line breaks
)

# Show the first few rows of the DataFrame
spark_df.show(5)
spark_df.printSchema()

                                                                                

+-----+--------------------+--------------------+-----+
|index|       cleaned_title|        cleaned_text|label|
+-----+--------------------+--------------------+-----+
|   31|Credit Suisse Bos...|When Tidjane Thia...|    0|
|  265|Angry and inspire...|ROCKVILLE, Md. (R...|    0|
|  806|Russian Economy M...|This post was ori...|    1|
| 1106|HOUSE SPEAKER PAU...|HOUSE SPEAKER PAU...|    1|
| 1151|(AUDIO) RACIST BL...|What is it that t...|    1|
+-----+--------------------+--------------------+-----+
only showing top 5 rows

root
 |-- index: integer (nullable = true)
 |-- cleaned_title: string (nullable = true)
 |-- cleaned_text: string (nullable = true)
 |-- label: integer (nullable = true)



In [3]:
import re
from textblob import TextBlob
import textstat
from nltk import word_tokenize, pos_tag
from pyspark.sql.functions import udf, col, rand, row_number
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType
from pyspark.sql.window import Window

# Define schema
feature_schema = StructType([
    StructField("num_special_chars", IntegerType()),
    StructField("num_determinants", IntegerType()),
    StructField("num_capital_letters", IntegerType()),
    StructField("num_short_sentences", IntegerType()),
    StructField("num_long_sentences", IntegerType()),
    StructField("gunning_fog", DoubleType()),
    StructField("smog", DoubleType()),
    StructField("ari", DoubleType()),
    StructField("polarity", DoubleType()),
    StructField("title_similarity", DoubleType()),
    StructField("subjectivity", DoubleType()),
    StructField("num_syllables", IntegerType()),
    StructField("num_words", IntegerType()),
    StructField("rate_adj_adv", DoubleType()),
    StructField("words_per_sentence", DoubleType()),
    StructField("num_articles", IntegerType()),
    StructField("num_verbs", IntegerType()),
    StructField("num_sentences", IntegerType()),
    StructField("num_adjectives", IntegerType()),
    StructField("num_adverbs", IntegerType())
])

# Feature extraction
def extract_features(text):
    # try:
    tokens = word_tokenize(text)
    words = [w for w in tokens if w.isalpha()]
    sentences = re.split(r'[.!?]+', text)
    pos_tags = pos_tag(words)

    num_special_chars = len(re.findall(r'[^a-zA-Z0-9\s]', text))
    num_determinants = sum(1 for w in words if w.lower() in ['the', 'a', 'an'])
    num_capital_letters = sum(1 for c in text if c.isupper())
    num_short_sent = sum(1 for s in sentences if len(s.split()) < 10)
    num_long_sent = sum(1 for s in sentences if len(s.split()) > 20)

    gunning_fog = textstat.gunning_fog(text)
    smog = textstat.smog_index(text)
    ari = textstat.automated_readability_index(text)

    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity
    title_similarity = 0.0

    num_syllables = textstat.syllable_count(text)
    num_words = len(words)
    num_sentences = len([s for s in sentences if s.strip()])
    num_adjectives = sum(1 for _, tag in pos_tags if tag in ['JJ', 'JJR', 'JJS'])
    num_adverbs = sum(1 for _, tag in pos_tags if tag in ['RB', 'RBR', 'RBS'])
    num_verbs = sum(1 for _, tag in pos_tags if tag.startswith('VB'))
    num_articles = sum(1 for w in words if w.lower() in ['a', 'an', 'the'])

    rate_adj_adv = (num_adjectives + num_adverbs) / num_words if num_words > 0 else 0
    words_per_sent = num_words / num_sentences if num_sentences > 0 else 0

    return (
        num_special_chars, num_determinants, num_capital_letters, num_short_sent, num_long_sent,
        gunning_fog, smog, ari,
        polarity, title_similarity, subjectivity,
        num_syllables, num_words, rate_adj_adv, words_per_sent,
        num_articles, num_verbs, num_sentences, num_adjectives, num_adverbs
    )
    # except:
    #     return (None,) * 20

# # --- Stratified sampling with Window (exact N per label) ---
# n = 10000  # number of rows per class
# w = Window.partitionBy("label").orderBy(rand(seed=42))
# df_ranked = spark_df.withColumn("row_num", row_number().over(w))
# df_sampled = df_ranked.filter(col("row_num") <= n).drop("row_num")

# # Verify stratification
# df_sampled.groupBy("label").count().show()
spark_df.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    0|33668|
|    1|25973|
+-----+-----+



                                                                                

In [4]:
# Register UDF
extract_features_udf = udf(extract_features, feature_schema)

# Apply feature extraction
# df_with_features = df_sampled.withColumn("features", extract_features_udf("cleaned_text"))
df_with_features = spark_df.withColumn("features", extract_features_udf("cleaned_text"))

df_with_features.printSchema()

root
 |-- index: integer (nullable = true)
 |-- cleaned_title: string (nullable = true)
 |-- cleaned_text: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- features: struct (nullable = true)
 |    |-- num_special_chars: integer (nullable = true)
 |    |-- num_determinants: integer (nullable = true)
 |    |-- num_capital_letters: integer (nullable = true)
 |    |-- num_short_sentences: integer (nullable = true)
 |    |-- num_long_sentences: integer (nullable = true)
 |    |-- gunning_fog: double (nullable = true)
 |    |-- smog: double (nullable = true)
 |    |-- ari: double (nullable = true)
 |    |-- polarity: double (nullable = true)
 |    |-- title_similarity: double (nullable = true)
 |    |-- subjectivity: double (nullable = true)
 |    |-- num_syllables: integer (nullable = true)
 |    |-- num_words: integer (nullable = true)
 |    |-- rate_adj_adv: double (nullable = true)
 |    |-- words_per_sentence: double (nullable = true)
 |    |-- num_articles: integer (

In [5]:
from pyspark.sql import functions as F

# Flatten struct in one go instead of looping with withColumn
df_with_features = df_with_features.select(
    "*",        # keep all existing columns
    F.col("features.*")  # expand all fields inside "features"
).drop("features").na.drop(how="any")  # drop original struct column

# # Drop rows with null values
# df_with_features = df_with_features.na.drop(how="any")

df_with_features.cache()

DataFrame[index: int, cleaned_title: string, cleaned_text: string, label: int, num_special_chars: int, num_determinants: int, num_capital_letters: int, num_short_sentences: int, num_long_sentences: int, gunning_fog: double, smog: double, ari: double, polarity: double, title_similarity: double, subjectivity: double, num_syllables: int, num_words: int, rate_adj_adv: double, words_per_sentence: double, num_articles: int, num_verbs: int, num_sentences: int, num_adjectives: int, num_adverbs: int]

In [None]:
import shutil
import os
from pyspark.sql import functions as F

# Select, sanitize, and write a single safely-quoted CSV
temp_path = "temp_csv_output"
final_path = "cleaned_welfake_with_features.csv"

# Write as a single part with strict quoting
(
    df_with_features
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .option("quote", '"')        # use " as quote char
    .option("escape", '"')       # escape " as ""
    .option("quoteAll", True)    # quote every field to be extra safe
    .csv(temp_path)
)

# Find the part file inside the temp folder and move it as the final CSV
for file in os.listdir(temp_path):
    if file.startswith("part-") and file.endswith(".csv"):
        # remove existing file if present (optional but safer across OSes)
        if os.path.exists(final_path):
            os.remove(final_path)
        shutil.move(os.path.join(temp_path, file), final_path)
        break

# Clean up
shutil.rmtree(temp_path)


[Stage 11:>                                                         (0 + 1) / 1]

In [7]:
# Apply UDF to smaller sample
sampled = df_with_features.sample(0.001, seed=42)
sampled.show(5)

  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
    process()
  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 597, in process
    serializer.dump_stream(out_iter, outfile)
  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 223, in dump_stream
    self.serializer.dump_stream(self._bat

KeyboardInterrupt: 

In [7]:
df_with_features.sample(0.0000001).show(5)



+-----+-------------+------------+-----+-----------------+----------------+-------------------+-------------------+------------------+-----------+----+---+--------+----------------+------------+-------------+---------+------------+------------------+------------+---------+-------------+--------------+-----------+
|index|cleaned_title|cleaned_text|label|num_special_chars|num_determinants|num_capital_letters|num_short_sentences|num_long_sentences|gunning_fog|smog|ari|polarity|title_similarity|subjectivity|num_syllables|num_words|rate_adj_adv|words_per_sentence|num_articles|num_verbs|num_sentences|num_adjectives|num_adverbs|
+-----+-------------+------------+-----+-----------------+----------------+-------------------+-------------------+------------------+-----------+----+---+--------+----------------+------------+-------------+---------+------------+------------------+------------+---------+-------------+--------------+-----------+
+-----+-------------+------------+-----+---------------

                                                                                

In [8]:
# Get the Spark UI URL
print(f"Spark UI: {spark.sparkContext.uiWebUrl}")
# Usually: http://localhost:4040 or http://driver-ip:4040

Spark UI: http://192.168.0.47:4040


In [None]:
df_with_features.show(3)



In [None]:
df_with_features.sample(0.01).show(5)

+-----+--------------------+--------------------+-----+-----------------+----------------+-------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+----------------+-------------------+-------------+---------+-------------------+------------------+------------+---------+-------------+--------------+-----------+
|index|       cleaned_title|        cleaned_text|label|num_special_chars|num_determinants|num_capital_letters|num_short_sentences|num_long_sentences|       gunning_fog|              smog|               ari|            polarity|title_similarity|       subjectivity|num_syllables|num_words|       rate_adj_adv|words_per_sentence|num_articles|num_verbs|num_sentences|num_adjectives|num_adverbs|
+-----+--------------------+--------------------+-----+-----------------+----------------+-------------------+-------------------+------------------+------------------+------------------+------------------+----------

In [None]:
import pandas as pd

# Convert to Pandas DataFrame for easier viewing
pd.set_option('display.max_columns', None)  # Show all columns
pd.DataFrame(df_with_features.take(5), columns=df_with_features.columns)

  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
    process()
  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 597, in process
    serializer.dump_stream(out_iter, outfile)
  File "/Users/bachtiarherdianto/Documents/Projects/fake-news-detection/.venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 223, in dump_stream
    self.serializer.dump_stream(self._bat

KeyboardInterrupt: 

In [5]:
import pandas as pd

# Flatten struct
for col_name in feature_schema.fieldNames():
    df_with_features = df_with_features.withColumn(col_name, df_with_features["features"][col_name])

df_with_features = df_with_features.drop("features").na.drop(how="any")

# Convert to Pandas DataFrame for easier viewing
pd.set_option('display.max_columns', None)  # Show all columns
pd.DataFrame(df_with_features.take(5), columns=df_with_features.columns)

                                                                                

Unnamed: 0,index,cleaned_title,cleaned_text,label,num_special_chars,num_determinants,num_capital_letters,num_short_sentences,num_long_sentences,gunning_fog,smog,ari,polarity,title_similarity,subjectivity,num_syllables,num_words,rate_adj_adv,words_per_sentence,num_articles,num_verbs,num_sentences,num_adjectives,num_adverbs
0,26903,U.S. gun rules heighten tension between police...,WARSAW (Reuters) - President Barack Obama pled...,0,67,40,73,4,15,14.814928,13.40902,14.159571,0.004114,0.0,0.380217,836,538,0.089219,23.391304,40,97,23,32,16
1,48590,Yemen says Saudi-led coalition to allow commer...,ADEN (Reuters) - The Saudi-led military coalit...,0,31,25,61,2,5,17.26855,15.760457,16.741491,-0.066667,0.0,0.13125,411,226,0.097345,22.6,25,34,10,19,3
2,31781,"Yemen air strike kills eight women, two childr...",DUBAI (Reuters) - Eight women and two children...,0,26,23,28,1,6,17.298246,15.470042,17.657417,0.066667,0.0,0.22151,300,183,0.04918,30.5,23,39,6,6,3
3,8921,Donald Trump says he doesn't need a unified GO...,When Donald Trump told ABC's George Stephanopo...,0,180,64,154,12,19,12.797101,12.709667,10.767404,0.151101,0.0,0.472443,1412,903,0.125138,18.428571,64,166,49,66,47
4,14733,All the Clamor? Trump's Palm Beach Neighbors S...,"PALM BEACH, Fla. There will be no this weekend...",0,188,79,169,32,20,11.510657,11.789604,10.009782,0.047362,0.0,0.414748,1416,890,0.103371,12.535211,79,161,71,65,27


In [6]:
from pyspark.sql import functions as F

# Define features
feature_cols = [
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

# Build condition: check if any feature column is null or NaN
condition = None
for c in feature_cols:
    col_cond = F.col(c).isNull() | F.isnan(c)
    condition = col_cond if condition is None else (condition | col_cond)

# Count rows with at least one null/NaN
null_count = df_with_features.filter(condition).count()
print(f"Rows with at least one null/NaN feature: {null_count}")




Rows with at least one null/NaN feature: 0


                                                                                

##### Classic ML

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import (
    LogisticRegression, DecisionTreeClassifier, RandomForestClassifier,
    GBTClassifier, NaiveBayes
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Define features
feature_cols = [
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

# Assemble features into vector
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_vec", handleInvalid="skip")

# Scale features
scaler = StandardScaler(inputCol="features_vec", outputCol="features", withStd=True, withMean=True)

# Train/test split
train_df, test_df = df_with_features.randomSplit([0.8, 0.2], seed=42)
train_df.cache()
test_df.cache()

# Models available in Spark MLlib
models = {
    "Logistic Regression": LogisticRegression(featuresCol="features", labelCol="label", maxIter=100),
    "Decision Tree": DecisionTreeClassifier(featuresCol="features", labelCol="label"),
    "Random Forest": RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100),
    "Gradient Boosting": GBTClassifier(featuresCol="features", labelCol="label", maxIter=100),
    "Naive Bayes": NaiveBayes(featuresCol="features", labelCol="label", modelType="gaussian")
}

# Evaluators
bin_eval = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
multi_eval = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

results = []

for name, clf in models.items():
    pipeline = Pipeline(stages=[assembler, scaler, clf])
    model = pipeline.fit(train_df)
    preds = model.transform(test_df)

    acc = multi_eval.setMetricName("accuracy").evaluate(preds)
    f1 = multi_eval.setMetricName("f1").evaluate(preds)
    precision = multi_eval.setMetricName("weightedPrecision").evaluate(preds)
    recall = multi_eval.setMetricName("weightedRecall").evaluate(preds)
    auc = bin_eval.evaluate(preds)

    results.append((name, acc, precision, recall, f1, auc))

# Convert results to Spark DataFrame
results_df = spark.createDataFrame(results, ["Model", "Accuracy", "Precision", "Recall", "F1", "AUC"]).toPandas()
results_df.sort_values('Accuracy', ascending=False)

DataFrame[index: int, cleaned_title: string, cleaned_text: string, label: int, num_special_chars: int, num_determinants: int, num_capital_letters: int, num_short_sentences: int, num_long_sentences: int, gunning_fog: double, smog: double, ari: double, polarity: double, title_similarity: double, subjectivity: double, num_syllables: int, num_words: int, rate_adj_adv: double, words_per_sentence: double, num_articles: int, num_verbs: int, num_sentences: int, num_adjectives: int, num_adverbs: int]

##### WELFake Approach