### Feature Extraction

In [1]:
import pandas as pd
import re
import textstat
import nltk
from textblob import TextBlob
from nltk import word_tokenize, pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/nel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/nel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [27]:
def extract_features(text):
    try:
        tokens = word_tokenize(text)
        words = [w for w in tokens if w.isalpha()]
        sentences = re.split(r'[.!?]+', text)
        pos_tags = pos_tag(words)
        
        # Writing pattern
        num_special_chars = len(re.findall(r'[^a-zA-Z0-9\s]', text))
        num_determinants = sum(1 for w in words if w.lower() in ['the', 'a', 'an'])
        num_capital_letters = sum(1 for c in text if c.isupper())
        num_short_sentences = sum(1 for s in sentences if len(s.split()) < 10)
        num_long_sentences = sum(1 for s in sentences if len(s.split()) > 20)

        # Readability indices
        gunning_fog = textstat.gunning_fog(text)
        smog = textstat.smog_index(text)
        ari = textstat.automated_readability_index(text)

        # Psycholinguistics
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity
        subjectivity = blob.sentiment.subjectivity
        title_similarity = 0  # Optional, you can compute this with cosine or Jaccard if you have 'title'

        # Quantity
        num_syllables = textstat.syllable_count(text)
        num_words = len(words)
        num_sentences = len([s for s in sentences if s.strip()])
        num_adjectives = sum(1 for _, tag in pos_tags if tag in ['JJ', 'JJR', 'JJS'])
        num_adverbs = sum(1 for _, tag in pos_tags if tag in ['RB', 'RBR', 'RBS'])
        num_verbs = sum(1 for _, tag in pos_tags if tag.startswith('VB'))
        num_articles = sum(1 for w in words if w.lower() in ['a', 'an', 'the'])

        rate_adj_adv = (num_adjectives + num_adverbs) / num_words if num_words > 0 else 0
        words_per_sentence = num_words / num_sentences if num_sentences > 0 else 0

        return pd.Series([
            num_special_chars, num_determinants, num_capital_letters, num_short_sentences, num_long_sentences,
            gunning_fog, smog, ari,
            polarity, title_similarity, subjectivity,
            num_syllables, num_words, rate_adj_adv, words_per_sentence,
            num_articles, num_verbs, num_sentences, num_adjectives, num_adverbs
        ])
    except:
        return pd.Series([None]*20)

In [47]:
df = pd.read_csv('cleaned_welfake.csv')

# # Sample 50% from each class
# df_sampled = df.groupby('label', group_keys=False).sample(frac=0.5, random_state=42)

# Sample 2 rows per class
df_sampled = df.groupby('label', group_keys=False).sample(n=100, random_state=42)
df_sampled.head()

Unnamed: 0,index,cleaned_title,cleaned_text,label
27473,27540,Catalan pro-independence party PdeCat says wil...,MADRID (Reuters) - Catalan pro-independence pa...,0
52620,4385,Trump says will speak with China's Xi on North...,WASHINGTON (Reuters) - President Donald Trump ...,0
8004,54855,Dan Pfeiffer to leave White House,"Dan Pfeiffer, one of President Obama's closest...",0
19552,31051,"Eurofighter jet crashes in Spain, killing pilot",MADRID (Reuters) - A Eurofighter combat jet pl...,0
15749,13354,2nd Night of Trump Protests Brings 29 Arrests ...,Thousands of demonstrators filled the streets ...,0


In [48]:
df_sampled.to_csv('sampled_welfake_200.csv', index=False)

In [34]:
# Assuming your dataframe is called df and has a 'text' column
feature_columns = [
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

# # Apply feature extraction and add prefix to the resulting columns
# df_sampled[text_feature_cols := ['text_' + col for col in feature_columns]] = (
#     df_sampled['cleaned_text'].apply(extract_features)
# )

# # Apply feature extraction and add prefix to the resulting columns
# df_sampled[title_feature_cols := ['title_' + col for col in feature_columns]] = (
#     df_sampled['cleaned_title'].apply(extract_features)
# )

df_sampled[feature_columns] = df_sampled['cleaned_text'].apply(extract_features)

df_sampled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 27473 to 33392
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                2000 non-null   int64  
 1   cleaned_title        2000 non-null   object 
 2   cleaned_text         2000 non-null   object 
 3   label                2000 non-null   int64  
 4   num_special_chars    2000 non-null   float64
 5   num_determinants     2000 non-null   float64
 6   num_capital_letters  2000 non-null   float64
 7   num_short_sentences  2000 non-null   float64
 8   num_long_sentences   2000 non-null   float64
 9   gunning_fog          2000 non-null   float64
 10  smog                 2000 non-null   float64
 11  ari                  2000 non-null   float64
 12  polarity             2000 non-null   float64
 13  title_similarity     2000 non-null   float64
 14  subjectivity         2000 non-null   float64
 15  num_syllables        2000 non-null   f

### Modelling

#### Pandas Implementation

##### Classic ML

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
    AdaBoostClassifier,
)

In [36]:
feature_cols = [
# text_feature_cols + title_feature_cols
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

X = df_sampled[feature_cols]
y = df_sampled['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [37]:
models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Bagging': BaggingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    results[name] = {
        'accuracy': acc,
        'report': classification_report(y_test, y_pred, output_dict=True)
    }
    # print(f"✅ {name} Accuracy: {acc:.4f}")
    # print(classification_report(y_test, y_pred))

accuracy_df = pd.DataFrame([
    {'Model': name, 'Accuracy': result['accuracy']}
    for name, result in results.items()
])
print(accuracy_df.sort_values(by='Accuracy', ascending=False))

                 Model  Accuracy
8          Extra Trees    0.7975
7    Gradient Boosting    0.7775
6        Random Forest    0.7650
1                  SVM    0.7625
9  Logistic Regression    0.7550
0                  KNN    0.7525
4              Bagging    0.7275
5             AdaBoost    0.6950
3        Decision Tree    0.6375
2          Naive Bayes    0.6175


##### WELFake Approach

In [38]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

feature_cols = [
# text_feature_cols + title_feature_cols
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

# Example: Define your LFS feature groups (replace with your actual 20 features split into 3 sets)
LFS1 = ['num_special_chars', 'num_determinants', 'num_capital_letters', 
        'gunning_fog', 'polarity', 'num_syllables']
LFS2 = ['num_short_sentences', 'smog', 'title_similarity',
        'subjectivity', 'num_words', 'rate_adj_adv']
LFS3 = ['num_long_sentences', 'ari', 'num_articles', 
        'num_verbs', 'num_sentences', 'words_per_sentence']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled, df_sampled['label'], test_size=0.3, random_state=42, stratify=df_sampled['label']
)

# Function to apply CV + LFS
def cv_over_lfs(X_train, X_test, lfs_cols):
    """Apply Count Vectorizer to text + concatenate LFS numeric features."""
    cv = CountVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_cv = cv.fit_transform(X_train['cleaned_text'])
    X_test_cv = cv.transform(X_test['cleaned_text'])

    # Scale LFS numeric features
    scaler = StandardScaler()
    X_train_lfs = scaler.fit_transform(X_train[lfs_cols])
    X_test_lfs = scaler.transform(X_test[lfs_cols])

    # Combine sparse CV with dense LFS
    X_train_combined = hstack([X_train_cv, X_train_lfs])
    X_test_combined = hstack([X_test_cv, X_test_lfs])
    
    return X_train_combined, X_test_combined

# Generate embedded sets
Xtr_LFS1, Xte_LFS1 = cv_over_lfs(X_train, X_test, LFS1)
Xtr_LFS2, Xte_LFS2 = cv_over_lfs(X_train, X_test, LFS2)
Xtr_LFS3, Xte_LFS3 = cv_over_lfs(X_train, X_test, LFS3)

# Define base model (SVM as per WELFake best performer)
svm1 = SVC(kernel='linear', probability=True, random_state=42)
svm2 = SVC(kernel='linear', probability=True, random_state=42)
svm3 = SVC(kernel='linear', probability=True, random_state=42)

# Fit each SVM
svm1.fit(Xtr_LFS1, y_train)
svm2.fit(Xtr_LFS2, y_train)
svm3.fit(Xtr_LFS3, y_train)

# Stage 1 voting: combine predictions from LFS1, LFS2, LFS3
stage1_vote = VotingClassifier(
    estimators=[
        ('svm1', svm1),
        ('svm2', svm2),
        ('svm3', svm3)
    ],
    voting='hard'
)
stage1_vote.fit(
    hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]),  # Stack features for VotingClassifier fit
    y_train
)

# Stage 1 predictions
P6_train = stage1_vote.predict(hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]))
P6_test = stage1_vote.predict(hstack([Xte_LFS1, Xte_LFS2, Xte_LFS3]))

# ----- Stage 2: Combine P6 with CV-only and TF-IDF-only -----
from sklearn.feature_extraction.text import TfidfVectorizer

# CV-only on full text
cv_full = CountVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_cv_full = cv_full.fit_transform(X_train['cleaned_text'])
Xte_cv_full = cv_full.transform(X_test['cleaned_text'])

# TF-IDF-only on full text
tfidf_full = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_tfidf_full = tfidf_full.fit_transform(X_train['cleaned_text'])
Xte_tfidf_full = tfidf_full.transform(X_test['cleaned_text'])

# Final stage voting: P6, CV, TF-IDF
final_vote = VotingClassifier(
    estimators=[
        ('cv_svm', SVC(kernel='linear', probability=True).fit(Xtr_cv_full, y_train)),
        ('tfidf_svm', SVC(kernel='linear', probability=True).fit(Xtr_tfidf_full, y_train)),
        ('lfs_vote', stage1_vote)
    ],
    voting='hard'
)

final_vote.fit(Xtr_cv_full, y_train)  # Fit on one set, predictions from others are internal

# Final prediction
final_preds = final_vote.predict(Xte_cv_full)

from sklearn.metrics import accuracy_score
print("Final Accuracy:", accuracy_score(y_test, final_preds))

Final Accuracy: 0.8916666666666667


#### Pyspark Implementation

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Setup configuration parameters for Spark
spark_conf = (
    SparkConf()
    .setMaster("local[12]")               # safer: 8 threads instead of 16
    .setAppName("Fake News Detection")
    .set("spark.driver.memory", "32g")
    .set("spark.executor.memory", "32g")
    .set("spark.executor.cores", "8")
    .set("spark.sql.shuffle.partitions", "128")
    .set("spark.memory.fraction", "0.8")
    .set("spark.memory.offHeap.enabled", "true")
    .set("spark.memory.offHeap.size", "4g")
)

# Setup SparkSession
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

print("Spark Session started with settings:")
for k, v in sc.getConf().getAll():
    if "memory" in k or "cores" in k:
        print(f"{k} = {v}")

25/08/19 20:36:01 WARN Utils: Your hostname, nel-X600-ITX resolves to a loopback address: 127.0.1.1; using 192.168.0.23 instead (on interface wlp4s0)
25/08/19 20:36:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/08/19 20:36:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Spark Session started with settings:
spark.memory.offHeap.size = 4g
spark.driver.memory = 32g
spark.memory.fraction = 0.8
spark.memory.offHeap.enabled = true
spark.executor.memory = 32g
spark.executor.cores = 8


In [2]:
# Load dataset
spark_df = spark.read.csv(
    "sampled_welfake_200.csv",
    # "cleaned_welfake.csv",
    header=True,
    inferSchema=True,
    quote='"',
    escape='"',
    multiLine=True  # needed since your text column contains line breaks
)

# Show the first few rows of the DataFrame
spark_df.show(5)
spark_df.printSchema()

+-----+--------------------+--------------------+-----+
|index|       cleaned_title|        cleaned_text|label|
+-----+--------------------+--------------------+-----+
|27540|Catalan pro-indep...|MADRID (Reuters) ...|    0|
| 4385|Trump says will s...|WASHINGTON (Reute...|    0|
|54855|Dan Pfeiffer to l...|Dan Pfeiffer, one...|    0|
|31051|Eurofighter jet c...|MADRID (Reuters) ...|    0|
|13354|2nd Night of Trum...|Thousands of demo...|    0|
+-----+--------------------+--------------------+-----+
only showing top 5 rows

root
 |-- index: integer (nullable = true)
 |-- cleaned_title: string (nullable = true)
 |-- cleaned_text: string (nullable = true)
 |-- label: integer (nullable = true)



In [3]:
import re
from textblob import TextBlob
import textstat
from nltk import word_tokenize, pos_tag
from pyspark.sql.functions import udf, col, rand, row_number
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType
from pyspark.sql.window import Window

# Define schema
feature_schema = StructType([
    StructField("num_special_chars", IntegerType()),
    StructField("num_determinants", IntegerType()),
    StructField("num_capital_letters", IntegerType()),
    StructField("num_short_sentences", IntegerType()),
    StructField("num_long_sentences", IntegerType()),
    StructField("gunning_fog", DoubleType()),
    StructField("smog", DoubleType()),
    StructField("ari", DoubleType()),
    StructField("polarity", DoubleType()),
    StructField("title_similarity", DoubleType()),
    StructField("subjectivity", DoubleType()),
    StructField("num_syllables", IntegerType()),
    StructField("num_words", IntegerType()),
    StructField("rate_adj_adv", DoubleType()),
    StructField("words_per_sentence", DoubleType()),
    StructField("num_articles", IntegerType()),
    StructField("num_verbs", IntegerType()),
    StructField("num_sentences", IntegerType()),
    StructField("num_adjectives", IntegerType()),
    StructField("num_adverbs", IntegerType())
])

# Feature extraction
def extract_features(text):
    # try:
    tokens = word_tokenize(text)
    words = [w for w in tokens if w.isalpha()]
    sentences = re.split(r'[.!?]+', text)
    pos_tags = pos_tag(words)

    num_special_chars = len(re.findall(r'[^a-zA-Z0-9\s]', text))
    num_determinants = sum(1 for w in words if w.lower() in ['the', 'a', 'an'])
    num_capital_letters = sum(1 for c in text if c.isupper())
    num_short_sent = sum(1 for s in sentences if len(s.split()) < 10)
    num_long_sent = sum(1 for s in sentences if len(s.split()) > 20)

    gunning_fog = textstat.gunning_fog(text)
    smog = textstat.smog_index(text)
    ari = textstat.automated_readability_index(text)

    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity
    title_similarity = 0.0

    num_syllables = textstat.syllable_count(text)
    num_words = len(words)
    num_sentences = len([s for s in sentences if s.strip()])
    num_adjectives = sum(1 for _, tag in pos_tags if tag in ['JJ', 'JJR', 'JJS'])
    num_adverbs = sum(1 for _, tag in pos_tags if tag in ['RB', 'RBR', 'RBS'])
    num_verbs = sum(1 for _, tag in pos_tags if tag.startswith('VB'))
    num_articles = sum(1 for w in words if w.lower() in ['a', 'an', 'the'])

    rate_adj_adv = (num_adjectives + num_adverbs) / num_words if num_words > 0 else 0
    words_per_sent = num_words / num_sentences if num_sentences > 0 else 0

    return (
        num_special_chars, num_determinants, num_capital_letters, num_short_sent, num_long_sent,
        gunning_fog, smog, ari,
        polarity, title_similarity, subjectivity,
        num_syllables, num_words, rate_adj_adv, words_per_sent,
        num_articles, num_verbs, num_sentences, num_adjectives, num_adverbs
    )
    # except:
    #     return (None,) * 20

# # --- Stratified sampling with Window (exact N per label) ---
# n = 100  # number of rows per class
# w = Window.partitionBy("label").orderBy(rand(seed=42))
# df_ranked = spark_df.withColumn("row_num", row_number().over(w))
# df_sampled = df_ranked.filter(col("row_num") <= n).drop("row_num")

# # Verify stratification
# df_sampled.groupBy("label").count().show()
spark_df.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    0|  100|
|    1|  100|
+-----+-----+



In [4]:
# Register UDF
extract_features_udf = udf(extract_features, feature_schema)

# Apply feature extraction
# df_with_features = df_sampled.withColumn("features", extract_features_udf("cleaned_text"))
df_with_features = spark_df.withColumn("features", extract_features_udf("cleaned_text"))

df_with_features.printSchema()

root
 |-- index: integer (nullable = true)
 |-- cleaned_title: string (nullable = true)
 |-- cleaned_text: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- features: struct (nullable = true)
 |    |-- num_special_chars: integer (nullable = true)
 |    |-- num_determinants: integer (nullable = true)
 |    |-- num_capital_letters: integer (nullable = true)
 |    |-- num_short_sentences: integer (nullable = true)
 |    |-- num_long_sentences: integer (nullable = true)
 |    |-- gunning_fog: double (nullable = true)
 |    |-- smog: double (nullable = true)
 |    |-- ari: double (nullable = true)
 |    |-- polarity: double (nullable = true)
 |    |-- title_similarity: double (nullable = true)
 |    |-- subjectivity: double (nullable = true)
 |    |-- num_syllables: integer (nullable = true)
 |    |-- num_words: integer (nullable = true)
 |    |-- rate_adj_adv: double (nullable = true)
 |    |-- words_per_sentence: double (nullable = true)
 |    |-- num_articles: integer (

In [5]:
from pyspark.sql import functions as F

# Flatten struct in one go instead of looping with withColumn
df_with_features = df_with_features.select(
    "*",        # keep all existing columns
    F.col("features.*")  # expand all fields inside "features"
).drop("features").na.drop(how="any")  # drop original struct column

# # Drop rows with null values
# df_with_features = df_with_features.na.drop(how="any")

df_with_features.cache()

DataFrame[index: int, cleaned_title: string, cleaned_text: string, label: int, num_special_chars: int, num_determinants: int, num_capital_letters: int, num_short_sentences: int, num_long_sentences: int, gunning_fog: double, smog: double, ari: double, polarity: double, title_similarity: double, subjectivity: double, num_syllables: int, num_words: int, rate_adj_adv: double, words_per_sentence: double, num_articles: int, num_verbs: int, num_sentences: int, num_adjectives: int, num_adverbs: int]

In [None]:
# import shutil
# import os
# from pyspark.sql import functions as F

# # Select, sanitize, and write a single safely-quoted CSV
# temp_path = "temp_csv_output"
# final_path = "cleaned_welfake_with_features.csv"

# # Write as a single part with strict quoting
# (
#     df_with_features
#     .coalesce(1)
#     .write
#     .mode("overwrite")
#     .option("header", True)
#     .option("quote", '"')        # use " as quote char
#     .option("escape", '"')       # escape " as ""
#     .option("quoteAll", True)    # quote every field to be extra safe
#     .csv(temp_path)
# )

# # Find the part file inside the temp folder and move it as the final CSV
# for file in os.listdir(temp_path):
#     if file.startswith("part-") and file.endswith(".csv"):
#         # remove existing file if present (optional but safer across OSes)
#         if os.path.exists(final_path):
#             os.remove(final_path)
#         shutil.move(os.path.join(temp_path, file), final_path)
#         break

# # Clean up
# shutil.rmtree(temp_path)


[Stage 11:>                                                         (0 + 1) / 1]

In [8]:
# Get the Spark UI URL
print(f"Spark UI: {spark.sparkContext.uiWebUrl}")
# Usually: http://localhost:4040 or http://driver-ip:4040

Spark UI: http://192.168.0.23:4040


In [6]:
df_with_features.show(3)

[Stage 13:>                                                         (0 + 1) / 1]

+-----+--------------------+--------------------+-----+-----------------+----------------+-------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+----------------+-------------------+-------------+---------+-------------------+------------------+------------+---------+-------------+--------------+-----------+
|index|       cleaned_title|        cleaned_text|label|num_special_chars|num_determinants|num_capital_letters|num_short_sentences|num_long_sentences|       gunning_fog|              smog|               ari|            polarity|title_similarity|       subjectivity|num_syllables|num_words|       rate_adj_adv|words_per_sentence|num_articles|num_verbs|num_sentences|num_adjectives|num_adverbs|
+-----+--------------------+--------------------+-----+-----------------+----------------+-------------------+-------------------+------------------+------------------+------------------+------------------+----------

                                                                                

In [7]:
import pandas as pd

# Convert to Pandas DataFrame for easier viewing
pd.set_option('display.max_columns', None)  # Show all columns
pd.DataFrame(df_with_features.take(5), columns=df_with_features.columns)

Unnamed: 0,index,cleaned_title,cleaned_text,label,num_special_chars,num_determinants,num_capital_letters,num_short_sentences,num_long_sentences,gunning_fog,smog,ari,polarity,title_similarity,subjectivity,num_syllables,num_words,rate_adj_adv,words_per_sentence,num_articles,num_verbs,num_sentences,num_adjectives,num_adverbs
0,27540,Catalan pro-independence party PdeCat says wil...,MADRID (Reuters) - Catalan pro-independence pa...,0,18,6,31,1,2,21.312048,18.51114,17.373214,-0.222222,0.0,0.455556,171,81,0.111111,20.25,6,15,4,7,2
1,4385,Trump says will speak with China's Xi on North...,WASHINGTON (Reuters) - President Donald Trump ...,0,14,2,29,1,1,14.607407,14.554593,12.575818,0.021273,0.0,0.490909,90,54,0.111111,13.5,2,7,4,4,2
2,54855,Dan Pfeiffer to leave White House,"Dan Pfeiffer, one of President Obama's closest...",0,52,31,65,3,7,14.438689,13.526455,12.653349,0.152275,0.0,0.275935,529,342,0.108187,20.117647,31,59,17,26,11
3,31051,"Eurofighter jet crashes in Spain, killing pilot",MADRID (Reuters) - A Eurofighter combat jet pl...,0,12,15,21,1,3,15.433333,14.554593,14.314227,-0.23125,0.0,0.404167,158,95,0.052632,23.75,15,19,4,5,0
4,13354,2nd Night of Trump Protests Brings 29 Arrests ...,Thousands of demonstrators filled the streets ...,0,84,31,65,10,8,11.748434,11.905187,10.869642,0.020588,0.0,0.414951,655,402,0.099502,14.888889,31,79,27,28,12


In [None]:
import pandas as pd

# Flatten struct
# for col_name in feature_schema.fieldNames():
#     df_with_features = df_with_features.withColumn(col_name, df_with_features["features"][col_name])

# df_with_features = df_with_features.drop("features").na.drop(how="any")

# Convert to Pandas DataFrame for easier viewing
pd.set_option('display.max_columns', None)  # Show all columns
pd.DataFrame(df_with_features.take(5), columns=df_with_features.columns)

                                                                                

Unnamed: 0,index,cleaned_title,cleaned_text,label,num_special_chars,num_determinants,num_capital_letters,num_short_sentences,num_long_sentences,gunning_fog,smog,ari,polarity,title_similarity,subjectivity,num_syllables,num_words,rate_adj_adv,words_per_sentence,num_articles,num_verbs,num_sentences,num_adjectives,num_adverbs
0,26903,U.S. gun rules heighten tension between police...,WARSAW (Reuters) - President Barack Obama pled...,0,67,40,73,4,15,14.814928,13.40902,14.159571,0.004114,0.0,0.380217,836,538,0.089219,23.391304,40,97,23,32,16
1,48590,Yemen says Saudi-led coalition to allow commer...,ADEN (Reuters) - The Saudi-led military coalit...,0,31,25,61,2,5,17.26855,15.760457,16.741491,-0.066667,0.0,0.13125,411,226,0.097345,22.6,25,34,10,19,3
2,31781,"Yemen air strike kills eight women, two childr...",DUBAI (Reuters) - Eight women and two children...,0,26,23,28,1,6,17.298246,15.470042,17.657417,0.066667,0.0,0.22151,300,183,0.04918,30.5,23,39,6,6,3
3,8921,Donald Trump says he doesn't need a unified GO...,When Donald Trump told ABC's George Stephanopo...,0,180,64,154,12,19,12.797101,12.709667,10.767404,0.151101,0.0,0.472443,1412,903,0.125138,18.428571,64,166,49,66,47
4,14733,All the Clamor? Trump's Palm Beach Neighbors S...,"PALM BEACH, Fla. There will be no this weekend...",0,188,79,169,32,20,11.510657,11.789604,10.009782,0.047362,0.0,0.414748,1416,890,0.103371,12.535211,79,161,71,65,27


In [6]:
from pyspark.sql import functions as F

# Define features
feature_cols = [
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

# Build condition: check if any feature column is null or NaN
condition = None
for c in feature_cols:
    col_cond = F.col(c).isNull() | F.isnan(c)
    condition = col_cond if condition is None else (condition | col_cond)

# Count rows with at least one null/NaN
null_count = df_with_features.filter(condition).count()
print(f"Rows with at least one null/NaN feature: {null_count}")




Rows with at least one null/NaN feature: 0


                                                                                

##### Classic ML

In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import (
    LogisticRegression, DecisionTreeClassifier, RandomForestClassifier,
    GBTClassifier, NaiveBayes
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark import StorageLevel

# Define features
feature_cols = [
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

# Pre-process data once and persist
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_vec", handleInvalid="skip")
scaler = StandardScaler(inputCol="features_vec", outputCol="features", withStd=True, withMean=True)

# Create preprocessing pipeline and apply once
preprocessing_pipeline = Pipeline(stages=[assembler, scaler])
preprocessing_model = preprocessing_pipeline.fit(df_with_features)
processed_df = preprocessing_model.transform(df_with_features)

# Train/test split on preprocessed data
train_df, test_df = processed_df.randomSplit([0.8, 0.2], seed=42)

# Persist with default storage level (faster alternative)
train_df.cache()
test_df.cache()

# Force materialization
train_count = train_df.count()
test_count = test_df.count()
print(f"Train samples: {train_count}, Test samples: {test_count}")

# Optimized models with reduced complexity for speed
models = {
    "Logistic Regression": LogisticRegression(
        featuresCol="features", 
        labelCol="label", 
        maxIter=50,  # Reduced from 100
        regParam=0.01,
        elasticNetParam=0.1
    ),
    "Decision Tree": DecisionTreeClassifier(
        featuresCol="features", 
        labelCol="label",
        maxDepth=10,  # Limit depth
        maxBins=32    # Reduce bins for speed
    ),
    "Random Forest": RandomForestClassifier(
        featuresCol="features", 
        labelCol="label", 
        numTrees=50,      # Reduced from 100
        maxDepth=10,      # Limit depth
        subsamplingRate=0.8,
        featureSubsetStrategy="sqrt"
    ),
    "Gradient Boosting": GBTClassifier(
        featuresCol="features", 
        labelCol="label", 
        maxIter=50,       # Reduced from 100
        maxDepth=5,       # Reduced depth
        stepSize=0.1
    ),
    "Naive Bayes": NaiveBayes(
        featuresCol="features", 
        labelCol="label", 
        modelType="gaussian",
        smoothing=1.0
    )
}

# Pre-create evaluators
bin_eval = BinaryClassificationEvaluator(
    labelCol="label", 
    rawPredictionCol="rawPrediction", 
    metricName="areaUnderROC"
)
acc_eval = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="accuracy"
)
f1_eval = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="f1"
)
precision_eval = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="weightedPrecision"
)
recall_eval = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="weightedRecall"
)

results = []

for name, clf in models.items():
    print(f"Training {name}...")
    
    # Single stage pipeline (preprocessing already done)
    pipeline = Pipeline(stages=[clf])
    model = pipeline.fit(train_df)
    
    # Make predictions
    preds = model.transform(test_df)
    
    # Persist predictions for multiple evaluations
    preds.cache()
    
    # Evaluate all metrics on cached predictions
    acc = acc_eval.evaluate(preds)
    f1 = f1_eval.evaluate(preds)
    precision = precision_eval.evaluate(preds)
    recall = recall_eval.evaluate(preds)
    auc = bin_eval.evaluate(preds)
    
    results.append((name, acc, precision, recall, f1, auc))
    print(f"{name} - Accuracy: {acc:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}")
    
    # Unpersist predictions to free memory
    preds.unpersist()

# Unpersist datasets
train_df.unpersist()
test_df.unpersist()

# Display results
print("\n" + "="*80)
print(f"{'Model':<20} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1':<10} {'AUC':<10}")
print("="*80)
for result in results:
    name, acc, precision, recall, f1, auc = result
    print(f"{name:<20} {acc:<10.4f} {precision:<10.4f} {recall:<10.4f} {f1:<10.4f} {auc:<10.4f}")

Train samples: 169, Test samples: 31
Training Logistic Regression...
Logistic Regression - Accuracy: 0.7419, F1: 0.7371, AUC: 0.8361
Training Decision Tree...
Decision Tree - Accuracy: 0.7419, F1: 0.7371, AUC: 0.7542
Training Random Forest...
Random Forest - Accuracy: 0.7097, F1: 0.7066, AUC: 0.8824
Training Gradient Boosting...
Gradient Boosting - Accuracy: 0.7097, F1: 0.7066, AUC: 0.7815
Training Naive Bayes...
Naive Bayes - Accuracy: 0.7097, F1: 0.6713, AUC: 0.7143

Model                Accuracy   Precision  Recall     F1         AUC       
Logistic Regression  0.7419     0.7921     0.7419     0.7371     0.8361    
Decision Tree        0.7419     0.7921     0.7419     0.7371     0.7542    
Random Forest        0.7097     0.7422     0.7097     0.7066     0.8824    
Gradient Boosting    0.7097     0.7422     0.7097     0.7066     0.7815    
Naive Bayes          0.7097     0.8102     0.7097     0.6713     0.7143    


In [9]:
# Convert results to Spark DataFrame
results_df = spark.createDataFrame(results, ["Model", "Accuracy", "Precision", "Recall", "F1", "AUC"]).toPandas()
results_df.sort_values('Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,AUC
0,Logistic Regression,0.741935,0.792082,0.741935,0.737056,0.836134
1,Decision Tree,0.741935,0.792082,0.741935,0.737056,0.754202
2,Random Forest,0.709677,0.742218,0.709677,0.706644,0.882353
3,Gradient Boosting,0.709677,0.742218,0.709677,0.706644,0.781513
4,Naive Bayes,0.709677,0.810174,0.709677,0.671299,0.714286


##### WELFake Approach

In [10]:
# PySpark implementation of your 2-stage WELFake-style system
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer, StopWordsRemover, NGram, CountVectorizer, IDF,
    VectorAssembler, StandardScaler
)
from pyspark.ml.classification import LinearSVC

# --------------------------------------------------------------------------------------
# Assumptions:
# - df (Spark DataFrame) has columns: cleaned_text (string), label (0/1),
#   and all 20 LFS numeric columns you listed (already computed).
# - Binary classification (0=real, 1=fake). LinearSVC works for binary labels.
# --------------------------------------------------------------------------------------

# ===== 0) Basic hygiene ===============================================================
df = df_with_features  # your Spark DataFrame
# Cast label to double and ensure no nulls in text/LFS
df = (df
      .withColumn("label", F.col("label").cast("double"))
      .withColumn("cleaned_text", F.coalesce(F.col("cleaned_text"), F.lit(""))))

# The 20 features total (adjust names to match your frame if needed)
feature_cols = [
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]
for c in feature_cols:
    df = df.withColumn(c, F.col(c).cast("double"))
df = df.fillna(0, subset=feature_cols)

# Your LFS splits (can be tweaked)
LFS1 = ['num_special_chars', 'num_determinants', 'num_capital_letters', 'gunning_fog', 'polarity', 'num_syllables']
LFS2 = ['num_short_sentences', 'smog', 'title_similarity', 'subjectivity', 'num_words', 'rate_adj_adv']
LFS3 = ['num_long_sentences', 'ari', 'num_articles', 'num_verbs', 'num_sentences', 'words_per_sentence']

In [11]:
# ===== 1) Stratified split (approx) ===================================================
# Keep class balance similar to scikit-learn's stratify
# Create a stable row id to separate train/test
df = df.withColumn("_row_id", F.monotonically_increasing_id())

test_frac = 0.30
label_vals = [r[0] for r in df.select("label").distinct().collect()]
fractions = {float(k): test_frac for k in label_vals}
test_df = df.sampleBy("label", fractions=fractions, seed=42)
train_df = df.join(test_df.select("_row_id"), on="_row_id", how="left_anti")

In [12]:
# ===== 2) Base text pipeline: tokens -> (uni,bigram) -> CV -> TF-IDF ==================
# Tokenize + stopword removal
tok = RegexTokenizer(inputCol="cleaned_text", outputCol="tokens", pattern="\\W+", toLowercase=True)
sw = StopWordsRemover(inputCol="tokens", outputCol="tokens_sw")

# 2-grams
bi = NGram(n=2, inputCol="tokens_sw", outputCol="bigrams")

# CountVectorizer for unigrams and bigrams (vocabSize similar to max_features=5000)
cv_uni = CountVectorizer(inputCol="tokens_sw", outputCol="tf_uni", vocabSize=5000, minDF=2)
cv_bi  = CountVectorizer(inputCol="bigrams",  outputCol="tf_bi",  vocabSize=5000, minDF=2)

# Concatenate uni+bi into "cv_features"
cv_asm = VectorAssembler(inputCols=["tf_uni", "tf_bi"], outputCol="cv_features")

# TF-IDF over the same tf (use IDF on the combined term-freqs)
idf = IDF(inputCol="cv_features", outputCol="tfidf_features", minDocFreq=2)

text_pipe = Pipeline(stages=[tok, sw, bi, cv_uni, cv_bi, cv_asm, idf])
text_model = text_pipe.fit(train_df)
train_tx = text_model.transform(train_df).cache()
test_tx  = text_model.transform(test_df).cache()

In [13]:
# ===== 3) Helper to train one SVM on (CV ⊕ scaled LFS) ================================
def train_lfs_enabled_svm(lfs_cols, pred_col):
    lfs_vec   = VectorAssembler(inputCols=lfs_cols, outputCol=f"{pred_col}_lfs_vec")
    lfs_scale = StandardScaler(inputCol=f"{pred_col}_lfs_vec", outputCol=f"{pred_col}_lfs_scaled",
                               withStd=True, withMean=True)  # dense numeric -> mean OK
    feat_asm  = VectorAssembler(inputCols=["cv_features", f"{pred_col}_lfs_scaled"], outputCol=f"{pred_col}_features")
    svm       = LinearSVC(featuresCol=f"{pred_col}_features", labelCol="label",
                          predictionCol=pred_col, maxIter=100, regParam=0.1)

    pipe = Pipeline(stages=[lfs_vec, lfs_scale, feat_asm, svm])
    model = pipe.fit(train_tx)
    return model

svm1_model = train_lfs_enabled_svm(LFS1, "pred1")
svm2_model = train_lfs_enabled_svm(LFS2, "pred2")
svm3_model = train_lfs_enabled_svm(LFS3, "pred3")

# Add pred1, pred2, pred3 to test set
preds = svm1_model.transform(test_tx)
preds = svm2_model.transform(preds)
preds = svm3_model.transform(preds)

# Stage-1 hard vote: majority(pred1, pred2, pred3) → P6
preds = preds.withColumn(
    "P6",
    F.when((F.col("pred1") + F.col("pred2") + F.col("pred3")) >= 2, F.lit(1.0)).otherwise(F.lit(0.0))
)

IllegalArgumentException: requirement failed: Column rawPrediction already exists.

In [None]:
# ===== 4) CV-only SVM and TF-IDF-only SVM ============================================
svm_cv = LinearSVC(featuresCol="cv_features",    labelCol="label", predictionCol="pred_cv",    maxIter=100, regParam=0.1)
svm_ti = LinearSVC(featuresCol="tfidf_features", labelCol="label", predictionCol="pred_tfidf", maxIter=100, regParam=0.1)

svm_cv_model = svm_cv.fit(train_tx)
svm_ti_model = svm_ti.fit(train_tx)

preds = svm_cv_model.transform(preds)
preds = svm_ti_model.transform(preds)

# ===== 5) Final hard vote across {P6, pred_cv, pred_tfidf} ============================
preds = preds.withColumn(
    "final_pred",
    F.when((F.col("P6") + F.col("pred_cv") + F.col("pred_tfidf")) >= 2, F.lit(1.0)).otherwise(F.lit(0.0))
)

# ===== 6) Accuracy ===================================================================
acc = preds.select((F.col("final_pred") == F.col("label")).cast("double").alias("correct")) \
           .agg(F.avg("correct").alias("accuracy")) \
           .collect()[0]["accuracy"]

print(f"Final Accuracy: {acc:.4f}")

In [14]:
# 3) Helper to train one SVM on (CV ⊕ scaled LFS) — make columns unique
from pyspark.ml.classification import LinearSVC
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler

def train_lfs_enabled_svm(lfs_cols, stub):
    # stub e.g. "pred1", "pred2", "pred3"
    lfs_vec   = VectorAssembler(inputCols=lfs_cols, outputCol=f"{stub}_lfs_vec")
    lfs_scale = StandardScaler(inputCol=f"{stub}_lfs_vec", outputCol=f"{stub}_lfs_scaled",
                               withStd=True, withMean=True)
    feat_asm  = VectorAssembler(inputCols=["cv_features", f"{stub}_lfs_scaled"], outputCol=f"{stub}_features")

    svm = LinearSVC(
        featuresCol=f"{stub}_features",
        labelCol="label",
        predictionCol=stub,                  # e.g., pred1
        rawPredictionCol=f"{stub}_raw",      # e.g., pred1_raw  <<< UNIQUE!
        maxIter=100, regParam=0.1
    )
    return Pipeline(stages=[lfs_vec, lfs_scale, feat_asm, svm]).fit(train_tx)

# Train the three LFS-enabled SVMs
svm1_model = train_lfs_enabled_svm(LFS1, "pred1")
svm2_model = train_lfs_enabled_svm(LFS2, "pred2")
svm3_model = train_lfs_enabled_svm(LFS3, "pred3")

# Transform sequentially (now no column name clashes)
preds = svm1_model.transform(test_tx)
preds = svm2_model.transform(preds)
preds = svm3_model.transform(preds)

# Stage-1 vote
from pyspark.sql import functions as F
preds = preds.withColumn(
    "P6", F.when((F.col("pred1") + F.col("pred2") + F.col("pred3")) >= 2.0, 1.0).otherwise(0.0)
)

In [15]:
# 4) CV-only and TF-IDF-only SVMs — also give unique rawPredictionCol
svm_cv = LinearSVC(
    featuresCol="cv_features", labelCol="label",
    predictionCol="pred_cv", rawPredictionCol="raw_cv",
    maxIter=100, regParam=0.1
)
svm_ti = LinearSVC(
    featuresCol="tfidf_features", labelCol="label",
    predictionCol="pred_tfidf", rawPredictionCol="raw_tfidf",
    maxIter=100, regParam=0.1
)

svm_cv_model = svm_cv.fit(train_tx)
svm_ti_model = svm_ti.fit(train_tx)

preds = svm_cv_model.transform(preds)
preds = svm_ti_model.transform(preds)

# Final vote across {P6, pred_cv, pred_tfidf}
preds = preds.withColumn(
    "final_pred",
    F.when((F.col("P6") + F.col("pred_cv") + F.col("pred_tfidf")) >= 2.0, 1.0).otherwise(0.0)
)

# Accuracy
acc = preds.select((F.col("final_pred") == F.col("label")).cast("double").alias("correct")) \
           .agg(F.avg("correct").alias("accuracy")).collect()[0]["accuracy"]
print(f"Final Accuracy: {acc:.4f}")


Final Accuracy: 0.8136
