### Feature Extraction

In [1]:
import pandas as pd
import re
import textstat
import nltk
from textblob import TextBlob
from nltk import word_tokenize, pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bachtiarherdianto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bachtiarherdianto/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
def extract_features(text):
    try:
        tokens = word_tokenize(text)
        words = [w for w in tokens if w.isalpha()]
        sentences = re.split(r'[.!?]+', text)
        pos_tags = pos_tag(words)
        
        # Writing pattern
        num_special_chars = len(re.findall(r'[^a-zA-Z0-9\s]', text))
        num_determinants = sum(1 for w in words if w.lower() in ['the', 'a', 'an'])
        num_capital_letters = sum(1 for c in text if c.isupper())
        num_short_sentences = sum(1 for s in sentences if len(s.split()) < 10)
        num_long_sentences = sum(1 for s in sentences if len(s.split()) > 20)

        # Readability indices
        gunning_fog = textstat.gunning_fog(text)
        smog = textstat.smog_index(text)
        ari = textstat.automated_readability_index(text)

        # Psycholinguistics
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity
        subjectivity = blob.sentiment.subjectivity
        title_similarity = 0  # Optional, you can compute this with cosine or Jaccard if you have 'title'

        # Quantity
        num_syllables = textstat.syllable_count(text)
        num_words = len(words)
        num_sentences = len([s for s in sentences if s.strip()])
        num_adjectives = sum(1 for _, tag in pos_tags if tag in ['JJ', 'JJR', 'JJS'])
        num_adverbs = sum(1 for _, tag in pos_tags if tag in ['RB', 'RBR', 'RBS'])
        num_verbs = sum(1 for _, tag in pos_tags if tag.startswith('VB'))
        num_articles = sum(1 for w in words if w.lower() in ['a', 'an', 'the'])

        rate_adj_adv = (num_adjectives + num_adverbs) / num_words if num_words > 0 else 0
        words_per_sentence = num_words / num_sentences if num_sentences > 0 else 0

        return pd.Series([
            num_special_chars, num_determinants, num_capital_letters, num_short_sentences, num_long_sentences,
            gunning_fog, smog, ari,
            polarity, title_similarity, subjectivity,
            num_syllables, num_words, rate_adj_adv, words_per_sentence,
            num_articles, num_verbs, num_sentences, num_adjectives, num_adverbs
        ])
    except:
        return pd.Series([None]*20)

In [3]:
df = pd.read_csv('cleaned_welfake.csv')

# # Sample 50% from each class
# df_sampled = df.groupby('label', group_keys=False).sample(frac=0.5, random_state=42)

# Sample 2 rows per class
df_sampled = df.groupby('label', group_keys=False).sample(n=100, random_state=42)
df_sampled.head()

Unnamed: 0,index,cleaned_title,cleaned_text,label
21354,17322,Trump's choice for U.S. attorney general says ...,WASHINGTON (Reuters) - U.S. President-elect Do...,0
25012,21871,"Alison Wright, Exiled From 'The Americans' (Pe...",It took Alison Wright 34 years to land her fir...,0
9901,32704,Kurdistan supervisors begin counting votes in ...,"ERBIL, Iraq (Reuters) - Voting stations set up...",0
24177,35894,New Saudi king ascends to the throne as terror...,At 3 a.m. on a cold desert night earlier this ...,0
37552,24368,May shook on gentlemen's agreement on Brexit d...,BRUSSELS (Reuters) - An interim Brexit deal st...,0


In [4]:
# Assuming your dataframe is called df and has a 'text' column
feature_columns = [
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

# # Apply feature extraction and add prefix to the resulting columns
# df_sampled[text_feature_cols := ['text_' + col for col in feature_columns]] = (
#     df_sampled['cleaned_text'].apply(extract_features)
# )

# # Apply feature extraction and add prefix to the resulting columns
# df_sampled[title_feature_cols := ['title_' + col for col in feature_columns]] = (
#     df_sampled['cleaned_title'].apply(extract_features)
# )

df_sampled[feature_columns] = df_sampled['cleaned_text'].apply(extract_features)

df_sampled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 21354 to 54745
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                200 non-null    int64  
 1   cleaned_title        200 non-null    object 
 2   cleaned_text         200 non-null    object 
 3   label                200 non-null    int64  
 4   num_special_chars    200 non-null    float64
 5   num_determinants     200 non-null    float64
 6   num_capital_letters  200 non-null    float64
 7   num_short_sentences  200 non-null    float64
 8   num_long_sentences   200 non-null    float64
 9   gunning_fog          200 non-null    float64
 10  smog                 200 non-null    float64
 11  ari                  200 non-null    float64
 12  polarity             200 non-null    float64
 13  title_similarity     200 non-null    float64
 14  subjectivity         200 non-null    float64
 15  num_syllables        200 non-null    fl

### Modelling

#### Pandas Implementation

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
    AdaBoostClassifier,
)

In [43]:
feature_cols = [
# text_feature_cols + title_feature_cols
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

X = df_sampled[feature_cols]
y = df_sampled['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [44]:
models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Bagging': BaggingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    results[name] = {
        'accuracy': acc,
        'report': classification_report(y_test, y_pred, output_dict=True)
    }
    # print(f"✅ {name} Accuracy: {acc:.4f}")
    # print(classification_report(y_test, y_pred))

accuracy_df = pd.DataFrame([
    {'Model': name, 'Accuracy': result['accuracy']}
    for name, result in results.items()
])
print(accuracy_df.sort_values(by='Accuracy', ascending=False))

                 Model  Accuracy
1                  SVM     0.825
9  Logistic Regression     0.825
4              Bagging     0.725
6        Random Forest     0.725
8          Extra Trees     0.725
0                  KNN     0.675
5             AdaBoost     0.675
2          Naive Bayes     0.650
3        Decision Tree     0.650
7    Gradient Boosting     0.650


In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

feature_cols = [
# text_feature_cols + title_feature_cols
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

# Example: Define your LFS feature groups (replace with your actual 20 features split into 3 sets)
LFS1 = ['num_special_chars', 'num_determinants', 'num_capital_letters', 
        'gunning_fog', 'polarity', 'num_syllables']
LFS2 = ['num_short_sentences', 'smog', 'title_similarity',
        'subjectivity', 'num_words', 'rate_adj_adv']
LFS3 = ['num_long_sentences', 'ari', 'num_articles', 
        'num_verbs', 'num_sentences', 'words_per_sentence']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled, df_sampled['label'], test_size=0.3, random_state=42, stratify=df_sampled['label']
)

# Function to apply CV + LFS
def cv_over_lfs(X_train, X_test, lfs_cols):
    """Apply Count Vectorizer to text + concatenate LFS numeric features."""
    cv = CountVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_cv = cv.fit_transform(X_train['cleaned_text'])
    X_test_cv = cv.transform(X_test['cleaned_text'])

    # Scale LFS numeric features
    scaler = StandardScaler()
    X_train_lfs = scaler.fit_transform(X_train[lfs_cols])
    X_test_lfs = scaler.transform(X_test[lfs_cols])

    # Combine sparse CV with dense LFS
    X_train_combined = hstack([X_train_cv, X_train_lfs])
    X_test_combined = hstack([X_test_cv, X_test_lfs])
    
    return X_train_combined, X_test_combined

# Generate embedded sets
Xtr_LFS1, Xte_LFS1 = cv_over_lfs(X_train, X_test, LFS1)
Xtr_LFS2, Xte_LFS2 = cv_over_lfs(X_train, X_test, LFS2)
Xtr_LFS3, Xte_LFS3 = cv_over_lfs(X_train, X_test, LFS3)

# Define base model (SVM as per WELFake best performer)
svm1 = SVC(kernel='linear', probability=True, random_state=42)
svm2 = SVC(kernel='linear', probability=True, random_state=42)
svm3 = SVC(kernel='linear', probability=True, random_state=42)

# Fit each SVM
svm1.fit(Xtr_LFS1, y_train)
svm2.fit(Xtr_LFS2, y_train)
svm3.fit(Xtr_LFS3, y_train)

# Stage 1 voting: combine predictions from LFS1, LFS2, LFS3
stage1_vote = VotingClassifier(
    estimators=[
        ('svm1', svm1),
        ('svm2', svm2),
        ('svm3', svm3)
    ],
    voting='hard'
)
stage1_vote.fit(
    hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]),  # Stack features for VotingClassifier fit
    y_train
)

# Stage 1 predictions
P6_train = stage1_vote.predict(hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]))
P6_test = stage1_vote.predict(hstack([Xte_LFS1, Xte_LFS2, Xte_LFS3]))

# ----- Stage 2: Combine P6 with CV-only and TF-IDF-only -----
from sklearn.feature_extraction.text import TfidfVectorizer

# CV-only on full text
cv_full = CountVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_cv_full = cv_full.fit_transform(X_train['cleaned_text'])
Xte_cv_full = cv_full.transform(X_test['cleaned_text'])

# TF-IDF-only on full text
tfidf_full = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_tfidf_full = tfidf_full.fit_transform(X_train['cleaned_text'])
Xte_tfidf_full = tfidf_full.transform(X_test['cleaned_text'])

# Final stage voting: P6, CV, TF-IDF
final_vote = VotingClassifier(
    estimators=[
        ('cv_svm', SVC(kernel='linear', probability=True).fit(Xtr_cv_full, y_train)),
        ('tfidf_svm', SVC(kernel='linear', probability=True).fit(Xtr_tfidf_full, y_train)),
        ('lfs_vote', stage1_vote)
    ],
    voting='hard'
)

final_vote.fit(Xtr_cv_full, y_train)  # Fit on one set, predictions from others are internal

# Final prediction
final_preds = final_vote.predict(Xte_cv_full)

from sklearn.metrics import accuracy_score
print("Final Accuracy:", accuracy_score(y_test, final_preds))


Final Accuracy: 0.8666666666666667


In [13]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Example: Define your LFS feature groups (replace with your actual 20 features split into 3 sets)
LFS1 = ['num_special_chars', 'num_determinants', 'num_capital_letters', 
        'gunning_fog', 'polarity', 'num_syllables']
LFS2 = ['num_short_sentences', 'smog', 'title_similarity',
        'subjectivity', 'num_words', 'rate_adj_adv']
LFS3 = ['num_long_sentences', 'ari', 'num_articles', 
        'num_verbs', 'num_sentences', 'words_per_sentence']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled, df_sampled['label'], test_size=0.3, random_state=42, stratify=df_sampled['label']
)

# Function to apply CV + LFS
def cv_over_lfs(X_train, X_test, lfs_cols):
    """Apply Count Vectorizer to text + concatenate LFS numeric features."""
    cv = CountVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_cv = cv.fit_transform(X_train['cleaned_text'])
    X_test_cv = cv.transform(X_test['cleaned_text'])

    # Scale LFS numeric features
    scaler = StandardScaler()
    X_train_lfs = scaler.fit_transform(X_train[lfs_cols])
    X_test_lfs = scaler.transform(X_test[lfs_cols])

    # Combine sparse CV with dense LFS
    X_train_combined = hstack([X_train_cv, X_train_lfs])
    X_test_combined = hstack([X_test_cv, X_test_lfs])
    
    return X_train_combined, X_test_combined

# Generate embedded sets
Xtr_LFS1, Xte_LFS1 = cv_over_lfs(X_train, X_test, LFS1)
Xtr_LFS2, Xte_LFS2 = cv_over_lfs(X_train, X_test, LFS2)
Xtr_LFS3, Xte_LFS3 = cv_over_lfs(X_train, X_test, LFS3)

# Define base model (SVM as per WELFake best performer)
svm1 = SVC(kernel='linear', probability=True, random_state=42)
svm2 = SVC(kernel='linear', probability=True, random_state=42)
svm3 = SVC(kernel='linear', probability=True, random_state=42)

# Fit each SVM
svm1.fit(Xtr_LFS1, y_train)
svm2.fit(Xtr_LFS2, y_train)
svm3.fit(Xtr_LFS3, y_train)

# Stage 1 voting: combine predictions from LFS1, LFS2, LFS3
stage1_vote = VotingClassifier(
    estimators=[
        ('svm1', svm1),
        ('svm2', svm2),
        ('svm3', svm3)
    ],
    voting='hard'
)
stage1_vote.fit(
    hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]),  # Stack features for VotingClassifier fit
    y_train
)

# Stage 1 predictions
P6_train = stage1_vote.predict(hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]))
P6_test = stage1_vote.predict(hstack([Xte_LFS1, Xte_LFS2, Xte_LFS3]))

# ----- Stage 2: Combine P6 with CV-only and TF-IDF-only -----
from sklearn.feature_extraction.text import TfidfVectorizer

# CV-only on full text
cv_full = CountVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_cv_full = cv_full.fit_transform(X_train['cleaned_text'])
Xte_cv_full = cv_full.transform(X_test['cleaned_text'])

# TF-IDF-only on full text
tfidf_full = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_tfidf_full = tfidf_full.fit_transform(X_train['cleaned_text'])
Xte_tfidf_full = tfidf_full.transform(X_test['cleaned_text'])

# Final stage voting: P6, CV, TF-IDF
final_vote = VotingClassifier(
    estimators=[
        ('cv_svm', SVC(kernel='linear', probability=True).fit(Xtr_cv_full, y_train)),
        ('tfidf_svm', SVC(kernel='linear', probability=True).fit(Xtr_tfidf_full, y_train)),
        ('lfs_vote', stage1_vote)
    ],
    voting='hard'
)

final_vote.fit(Xtr_cv_full, y_train)  # Fit on one set, predictions from others are internal

# Final prediction
final_preds = final_vote.predict(Xte_cv_full)

from sklearn.metrics import accuracy_score
print("Final Accuracy:", accuracy_score(y_test, final_preds))


Final Accuracy: 0.8983333333333333


In [39]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Example: Define your LFS feature groups (replace with your actual 20 features split into 3 sets)
LFS1 = ['num_special_chars', 'num_determinants', 'num_capital_letters', 
        'gunning_fog', 'polarity', 'num_syllables']
LFS2 = ['num_short_sentences', 'smog', 'title_similarity',
        'subjectivity', 'num_words', 'rate_adj_adv']
LFS3 = ['num_long_sentences', 'ari', 'num_articles', 
        'num_verbs', 'num_sentences', 'words_per_sentence']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled, df_sampled['label'], test_size=0.3, random_state=42, stratify=df_sampled['label']
)

# Function to apply CV + LFS
def cv_over_lfs(X_train, X_test, lfs_cols):
    """Apply Count Vectorizer to text + concatenate LFS numeric features."""
    cv = CountVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_cv = cv.fit_transform(X_train['cleaned_text'])
    X_test_cv = cv.transform(X_test['cleaned_text'])

    # Scale LFS numeric features
    scaler = StandardScaler()
    X_train_lfs = scaler.fit_transform(X_train[lfs_cols])
    X_test_lfs = scaler.transform(X_test[lfs_cols])

    # Combine sparse CV with dense LFS
    X_train_combined = hstack([X_train_cv, X_train_lfs])
    X_test_combined = hstack([X_test_cv, X_test_lfs])
    
    return X_train_combined, X_test_combined

# Generate embedded sets
Xtr_LFS1, Xte_LFS1 = cv_over_lfs(X_train, X_test, LFS1)
Xtr_LFS2, Xte_LFS2 = cv_over_lfs(X_train, X_test, LFS2)
Xtr_LFS3, Xte_LFS3 = cv_over_lfs(X_train, X_test, LFS3)

# Define base model (SVM as per WELFake best performer)
svm1 = SVC(kernel='linear', probability=True, random_state=42)
svm2 = SVC(kernel='linear', probability=True, random_state=42)
svm3 = SVC(kernel='linear', probability=True, random_state=42)

# Fit each SVM
svm1.fit(Xtr_LFS1, y_train)
svm2.fit(Xtr_LFS2, y_train)
svm3.fit(Xtr_LFS3, y_train)

# Stage 1 voting: combine predictions from LFS1, LFS2, LFS3
stage1_vote = VotingClassifier(
    estimators=[
        ('svm1', svm1),
        ('svm2', svm2),
        ('svm3', svm3)
    ],
    voting='hard'
)
stage1_vote.fit(
    hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]),  # Stack features for VotingClassifier fit
    y_train
)

# Stage 1 predictions
P6_train = stage1_vote.predict(hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]))
P6_test = stage1_vote.predict(hstack([Xte_LFS1, Xte_LFS2, Xte_LFS3]))

# ----- Stage 2: Combine P6 with CV-only and TF-IDF-only -----
from sklearn.feature_extraction.text import TfidfVectorizer

# CV-only on full text
cv_full = CountVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_cv_full = cv_full.fit_transform(X_train['cleaned_text'])
Xte_cv_full = cv_full.transform(X_test['cleaned_text'])

# TF-IDF-only on full text
tfidf_full = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_tfidf_full = tfidf_full.fit_transform(X_train['cleaned_text'])
Xte_tfidf_full = tfidf_full.transform(X_test['cleaned_text'])

# Final stage voting: P6, CV, TF-IDF
final_vote = VotingClassifier(
    estimators=[
        ('cv_svm', SVC(kernel='linear', probability=True).fit(Xtr_cv_full, y_train)),
        ('tfidf_svm', SVC(kernel='linear', probability=True).fit(Xtr_tfidf_full, y_train)),
        ('lfs_vote', stage1_vote)
    ],
    voting='hard'
)

final_vote.fit(Xtr_cv_full, y_train)  # Fit on one set, predictions from others are internal

# Final prediction
final_preds = final_vote.predict(Xte_cv_full)

from sklearn.metrics import accuracy_score
print("Final Accuracy:", accuracy_score(y_test, final_preds))


Final Accuracy: 0.9235


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Example: Define your LFS feature groups (replace with your actual 20 features split into 3 sets)
LFS1 = ['num_special_chars', 'num_determinants', 'num_capital_letters', 
        'gunning_fog', 'polarity', 'num_syllables']
LFS2 = ['num_short_sentences', 'smog', 'title_similarity',
        'subjectivity', 'num_words', 'rate_adj_adv']
LFS3 = ['num_long_sentences', 'ari', 'num_articles', 
        'num_verbs', 'num_sentences', 'words_per_sentence']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled, df_sampled['label'], test_size=0.3, random_state=42, stratify=df_sampled['label']
)

# Function to apply CV + LFS
def cv_over_lfs(X_train, X_test, lfs_cols):
    """Apply Count Vectorizer to text + concatenate LFS numeric features."""
    cv = CountVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_cv = cv.fit_transform(X_train['cleaned_text'])
    X_test_cv = cv.transform(X_test['cleaned_text'])

    # Scale LFS numeric features
    scaler = StandardScaler()
    X_train_lfs = scaler.fit_transform(X_train[lfs_cols])
    X_test_lfs = scaler.transform(X_test[lfs_cols])

    # Combine sparse CV with dense LFS
    X_train_combined = hstack([X_train_cv, X_train_lfs])
    X_test_combined = hstack([X_test_cv, X_test_lfs])
    
    return X_train_combined, X_test_combined

# Generate embedded sets
Xtr_LFS1, Xte_LFS1 = cv_over_lfs(X_train, X_test, LFS1)
Xtr_LFS2, Xte_LFS2 = cv_over_lfs(X_train, X_test, LFS2)
Xtr_LFS3, Xte_LFS3 = cv_over_lfs(X_train, X_test, LFS3)

# Define base model (SVM as per WELFake best performer)
svm1 = SVC(kernel='linear', probability=True, random_state=42)
svm2 = SVC(kernel='linear', probability=True, random_state=42)
svm3 = SVC(kernel='linear', probability=True, random_state=42)

# Fit each SVM
svm1.fit(Xtr_LFS1, y_train)
svm2.fit(Xtr_LFS2, y_train)
svm3.fit(Xtr_LFS3, y_train)

# Stage 1 voting: combine predictions from LFS1, LFS2, LFS3
stage1_vote = VotingClassifier(
    estimators=[
        ('svm1', svm1),
        ('svm2', svm2),
        ('svm3', svm3)
    ],
    voting='hard'
)
stage1_vote.fit(
    hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]),  # Stack features for VotingClassifier fit
    y_train
)

# Stage 1 predictions
P6_train = stage1_vote.predict(hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]))
P6_test = stage1_vote.predict(hstack([Xte_LFS1, Xte_LFS2, Xte_LFS3]))

# ----- Stage 2: Combine P6 with CV-only and TF-IDF-only -----
from sklearn.feature_extraction.text import TfidfVectorizer

# CV-only on full text
cv_full = CountVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_cv_full = cv_full.fit_transform(X_train['cleaned_text'])
Xte_cv_full = cv_full.transform(X_test['cleaned_text'])

# TF-IDF-only on full text
tfidf_full = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_tfidf_full = tfidf_full.fit_transform(X_train['cleaned_text'])
Xte_tfidf_full = tfidf_full.transform(X_test['cleaned_text'])

# Final stage voting: P6, CV, TF-IDF
final_vote = VotingClassifier(
    estimators=[
        ('cv_svm', SVC(kernel='linear', probability=True).fit(Xtr_cv_full, y_train)),
        ('tfidf_svm', SVC(kernel='linear', probability=True).fit(Xtr_tfidf_full, y_train)),
        ('lfs_vote', stage1_vote)
    ],
    voting='hard'
)

final_vote.fit(Xtr_cv_full, y_train)  # Fit on one set, predictions from others are internal

# Final prediction
final_preds = final_vote.predict(Xte_cv_full)

from sklearn.metrics import accuracy_score
print("Final Accuracy:", accuracy_score(y_test, final_preds))


#### Pyspark Implementation

In [33]:
from pyspark.sql import SparkSession

# Start Spark session (only once)
spark = SparkSession.builder \
    .appName("Fake News Detection") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

25/08/18 16:36:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/18 16:36:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [37]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import (
    LogisticRegression, DecisionTreeClassifier, RandomForestClassifier,
    GBTClassifier, NaiveBayes
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Define features
feature_cols = [
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

# Assemble features into vector
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_vec")

# Scale features
scaler = StandardScaler(inputCol="features_vec", outputCol="features", withStd=True, withMean=True)

# Convert Pandas DataFrame to Spark DataFrame
from pyspark.sql import Row
spark_df = spark.createDataFrame([Row(**row) for row in df_sampled.to_dict(orient="records")])


# Train/test split
train_df, test_df = spark_df.randomSplit([0.8, 0.2], seed=42)

# Models available in Spark MLlib
models = {
    "Logistic Regression": LogisticRegression(featuresCol="features", labelCol="label", maxIter=100),
    "Decision Tree": DecisionTreeClassifier(featuresCol="features", labelCol="label"),
    "Random Forest": RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100),
    "Gradient Boosting": GBTClassifier(featuresCol="features", labelCol="label", maxIter=100),
    "Naive Bayes": NaiveBayes(featuresCol="features", labelCol="label", modelType="gaussian")
}

# Evaluators
bin_eval = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
multi_eval = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

results = []

for name, clf in models.items():
    pipeline = Pipeline(stages=[assembler, scaler, clf])
    model = pipeline.fit(train_df)
    preds = model.transform(test_df)

    acc = multi_eval.setMetricName("accuracy").evaluate(preds)
    f1 = multi_eval.setMetricName("f1").evaluate(preds)
    precision = multi_eval.setMetricName("weightedPrecision").evaluate(preds)
    recall = multi_eval.setMetricName("weightedRecall").evaluate(preds)
    auc = bin_eval.evaluate(preds)

    results.append((name, acc, precision, recall, f1, auc))

# Convert results to Spark DataFrame
results_df = spark.createDataFrame(results, ["Model", "Accuracy", "Precision", "Recall", "F1", "AUC"])
results_df.orderBy("Accuracy", ascending=False).show(truncate=False)


                                                                                

+-------------------+------------------+------------------+------------------+------------------+-------------------+
|Model              |Accuracy          |Precision         |Recall            |F1                |AUC                |
+-------------------+------------------+------------------+------------------+------------------+-------------------+
|Random Forest      |0.6585365853658537|0.6757973733583489|0.6585365853658537|0.652350653941322 |0.7714285714285715 |
|Decision Tree      |0.6097560975609756|0.6333422674886089|0.6097560975609756|0.5963653754184601|0.48571428571428565|
|Logistic Regression|0.6097560975609756|0.6228893058161351|0.6097560975609756|0.6026864616472252|0.7119047619047618 |
|Gradient Boosting  |0.5853658536585366|0.6002710027100271|0.5853658536585366|0.5747794499221588|0.65               |
|Naive Bayes        |0.5609756097560976|0.561219512195122 |0.5609756097560976|0.5556733828207847|0.519047619047619  |
+-------------------+------------------+----------------

In [40]:
pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1", "AUC"]).sort_values(by="Accuracy", ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,AUC
2,Random Forest,0.658537,0.675797,0.658537,0.652351,0.771429
0,Logistic Regression,0.609756,0.622889,0.609756,0.602686,0.711905
1,Decision Tree,0.609756,0.633342,0.609756,0.596365,0.485714
3,Gradient Boosting,0.585366,0.600271,0.585366,0.574779,0.65
4,Naive Bayes,0.560976,0.56122,0.560976,0.555673,0.519048


In [45]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import (
    LogisticRegression, DecisionTreeClassifier, RandomForestClassifier,
    GBTClassifier, NaiveBayes, LinearSVC
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql import Row
from pyspark.sql.functions import col

# ✅ Start Spark Session
spark = SparkSession.builder.appName("Fake News Detection PySpark4").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

# ✅ Ensure DataFrame is Spark DataFrame (handles Pandas >= 2.0 too)
if "pandas" in str(type(df_sampled)):
    try:
        spark_df = spark.createDataFrame(df_sampled)  # direct conversion
    except Exception:
        # fallback for Pandas >= 2.0
        spark_df = spark.createDataFrame([Row(**row) for row in df_sampled.to_dict(orient="records")])
else:
    spark_df = df_sampled

# ✅ Cast label to integer (required for MLlib)
spark_df = spark_df.withColumn("label", col("label").cast("integer"))

# ✅ Features
feature_cols = [
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_vec")
scaler = StandardScaler(inputCol="features_vec", outputCol="features", withStd=True, withMean=True)

# ✅ Split train/test
train_df, test_df = spark_df.randomSplit([0.8, 0.2], seed=42)

# ✅ Models (all supported in PySpark 4.0)
models = {
    "Logistic Regression": LogisticRegression(featuresCol="features", labelCol="label", maxIter=100),
    "Decision Tree": DecisionTreeClassifier(featuresCol="features", labelCol="label"),
    "Random Forest": RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100),
    "Gradient Boosting (GBT)": GBTClassifier(featuresCol="features", labelCol="label", maxIter=100),
    "Naive Bayes": NaiveBayes(featuresCol="features", labelCol="label", modelType="gaussian"),
    "Linear SVM": LinearSVC(featuresCol="features", labelCol="label", maxIter=100, regParam=0.01)
}

# ✅ Evaluators
bin_eval = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
multi_eval = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

results = []

for name, clf in models.items():
    pipeline = Pipeline(stages=[assembler, scaler, clf])
    model = pipeline.fit(train_df)
    preds = model.transform(test_df)

    acc = multi_eval.setMetricName("accuracy").evaluate(preds)
    f1 = multi_eval.setMetricName("f1").evaluate(preds)
    precision = multi_eval.setMetricName("weightedPrecision").evaluate(preds)
    recall = multi_eval.setMetricName("weightedRecall").evaluate(preds)
    auc = bin_eval.evaluate(preds)

    results.append((name, acc, precision, recall, f1, auc))

# ✅ Show Results
results_df = spark.createDataFrame(results, ["Model", "Accuracy", "Precision", "Recall", "F1", "AUC"])
results_df.orderBy("Accuracy", ascending=False).show(truncate=False)


+-----------------------+------------------+------------------+------------------+------------------+-------------------+
|Model                  |Accuracy          |Precision         |Recall            |F1                |AUC                |
+-----------------------+------------------+------------------+------------------+------------------+-------------------+
|Linear SVM             |0.7073170731707317|0.7287054409005629|0.7073170731707317|0.702014846235419 |0.7523809523809524 |
|Random Forest          |0.6585365853658537|0.6757973733583489|0.6585365853658537|0.652350653941322 |0.7714285714285715 |
|Decision Tree          |0.6097560975609756|0.6333422674886089|0.6097560975609756|0.5963653754184601|0.48571428571428565|
|Logistic Regression    |0.6097560975609756|0.6228893058161351|0.6097560975609756|0.6026864616472252|0.7119047619047618 |
|Gradient Boosting (GBT)|0.5853658536585366|0.6002710027100271|0.5853658536585366|0.5747794499221588|0.65               |
|Naive Bayes            

In [None]:
# ✅ Ensure DataFrame is Spark DataFrame (handles Pandas >= 2.0 too)
if "pandas" in str(type(df_sampled)):
    try:
        spark_df = spark.createDataFrame(df_sampled)  # direct conversion
    except Exception:
        # fallback for Pandas >= 2.0
        spark_df = spark.createDataFrame([Row(**row) for row in df_sampled.to_dict(orient="records")])
else:
    spark_df = df_sampled

# ✅ Cast label to integer (required for MLlib)
spark_df = spark_df.withColumn("label", col("label").cast("integer"))

In [8]:
import re
from textblob import TextBlob
import textstat
from nltk import word_tokenize, pos_tag
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, rand, row_number
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType
from pyspark.sql.window import Window

# Initialize Spark
spark = SparkSession.builder.appName("WELFake Features").getOrCreate()

# Define schema
feature_schema = StructType([
    StructField("num_special_chars", IntegerType()),
    StructField("num_determinants", IntegerType()),
    StructField("num_capital_letters", IntegerType()),
    StructField("num_short_sentences", IntegerType()),
    StructField("num_long_sentences", IntegerType()),
    StructField("gunning_fog", DoubleType()),
    StructField("smog", DoubleType()),
    StructField("ari", DoubleType()),
    StructField("polarity", DoubleType()),
    StructField("title_similarity", DoubleType()),
    StructField("subjectivity", DoubleType()),
    StructField("num_syllables", IntegerType()),
    StructField("num_words", IntegerType()),
    StructField("rate_adj_adv", DoubleType()),
    StructField("words_per_sentence", DoubleType()),
    StructField("num_articles", IntegerType()),
    StructField("num_verbs", IntegerType()),
    StructField("num_sentences", IntegerType()),
    StructField("num_adjectives", IntegerType()),
    StructField("num_adverbs", IntegerType())
])

# Feature extraction
def extract_features(text):
    try:
        tokens = word_tokenize(text)
        words = [w for w in tokens if w.isalpha()]
        sentences = re.split(r'[.!?]+', text)
        pos_tags = pos_tag(words)

        num_special_chars = len(re.findall(r'[^a-zA-Z0-9\s]', text))
        num_determinants = sum(1 for w in words if w.lower() in ['the', 'a', 'an'])
        num_capital_letters = sum(1 for c in text if c.isupper())
        num_short_sent = sum(1 for s in sentences if len(s.split()) < 10)
        num_long_sent = sum(1 for s in sentences if len(s.split()) > 20)

        gunning_fog = textstat.gunning_fog(text)
        smog = textstat.smog_index(text)
        ari = textstat.automated_readability_index(text)

        blob = TextBlob(text)
        polarity = blob.sentiment.polarity
        subjectivity = blob.sentiment.subjectivity
        title_similarity = 0.0

        num_syllables = textstat.syllable_count(text)
        num_words = len(words)
        num_sentences = len([s for s in sentences if s.strip()])
        num_adjectives = sum(1 for _, tag in pos_tags if tag in ['JJ', 'JJR', 'JJS'])
        num_adverbs = sum(1 for _, tag in pos_tags if tag in ['RB', 'RBR', 'RBS'])
        num_verbs = sum(1 for _, tag in pos_tags if tag.startswith('VB'))
        num_articles = sum(1 for w in words if w.lower() in ['a', 'an', 'the'])

        rate_adj_adv = (num_adjectives + num_adverbs) / num_words if num_words > 0 else 0
        words_per_sent = num_words / num_sentences if num_sentences > 0 else 0

        return (
            num_special_chars, num_determinants, num_capital_letters, num_short_sent, num_long_sent,
            gunning_fog, smog, ari,
            polarity, title_similarity, subjectivity,
            num_syllables, num_words, rate_adj_adv, words_per_sent,
            num_articles, num_verbs, num_sentences, num_adjectives, num_adverbs
        )
    except:
        return (None,) * 20

# Register UDF
extract_features_udf = udf(extract_features, feature_schema)

# Explicit schema for your dataset
schema = StructType([
    StructField("index", IntegerType(), True),
    StructField("cleaned_title", StringType(), True),
    StructField("cleaned_text", StringType(), True),
    StructField("label", IntegerType(), True),   # force label as integer
])

# Load dataset
spark_df = spark.read.csv(
    "cleaned_welfake.csv",
    header=True,
    schema=schema,
    quote='"',
    escape='"',
    multiLine=True  # needed since your text column contains line breaks
)

# --- Stratified sampling with Window (exact N per label) ---
n = 100  # number of rows per class
w = Window.partitionBy("label").orderBy(rand(seed=42))
df_ranked = spark_df.withColumn("row_num", row_number().over(w))
df_sampled = df_ranked.filter(col("row_num") <= n).drop("row_num")

# Apply feature extraction
df_with_features = df_sampled.withColumn("features", extract_features_udf("cleaned_text"))

# Flatten struct
for col_name in feature_schema.fieldNames():
    df_with_features = df_with_features.withColumn(col_name, df_with_features["features"][col_name])

df_with_features = df_with_features.drop("features")

df_with_features.show(5)



+-----+--------------------+--------------------+-----+-----------------+----------------+-------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+----------------+-------------------+-------------+---------+-------------------+------------------+------------+---------+-------------+--------------+-----------+
|index|       cleaned_title|        cleaned_text|label|num_special_chars|num_determinants|num_capital_letters|num_short_sentences|num_long_sentences|       gunning_fog|              smog|               ari|            polarity|title_similarity|       subjectivity|num_syllables|num_words|       rate_adj_adv|words_per_sentence|num_articles|num_verbs|num_sentences|num_adjectives|num_adverbs|
+-----+--------------------+--------------------+-----+-----------------+----------------+-------------------+-------------------+------------------+------------------+------------------+------------------+----------

                                                                                

In [8]:
df_with_features.count()

200

In [11]:
df_with_features = df_with_features.na.drop()

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, IDF, VectorAssembler
from pyspark.ml.classification import LinearSVC
from pyspark.ml import Pipeline
from functools import reduce

# --------------------------------------
# Spark session
# --------------------------------------
# spark = SparkSession.builder.appName("WELFake-PySpark").getOrCreate()

# --------------------------------------
# Define LFS feature groups (same as sklearn version)
# --------------------------------------
LFS1 = ['num_special_chars','num_determinants','num_capital_letters',
        'gunning_fog','polarity','num_syllables']
LFS2 = ['num_short_sentences','smog','title_similarity',
        'subjectivity','num_words','rate_adj_adv']
LFS3 = ['num_long_sentences','ari','num_articles',
        'num_verbs','num_sentences','words_per_sentence']

# --------------------------------------
# Tokenizer + CV + TF-IDF
# --------------------------------------
tokenizer = RegexTokenizer(inputCol="cleaned_text", outputCol="words", pattern="\\W")
cv = CountVectorizer(inputCol="words", outputCol="cv_features", vocabSize=5000, minDF=2)
idf = IDF(inputCol="cv_features", outputCol="tfidf_features")

# Assemble CV+LFS features
vec1 = VectorAssembler(inputCols=LFS1 + ["cv_features"], outputCol="features_lfs1")
vec2 = VectorAssembler(inputCols=LFS2 + ["cv_features"], outputCol="features_lfs2")
vec3 = VectorAssembler(inputCols=LFS3 + ["cv_features"], outputCol="features_lfs3")

# --------------------------------------
# Train/test split
# --------------------------------------
# # ✅ Ensure DataFrame is Spark DataFrame (handles Pandas >= 2.0 too)
# if "pandas" in str(type(df_sampled)):
#     try:
#         spark_df = spark.createDataFrame(df_sampled)  # direct conversion
#     except Exception:
#         # fallback for Pandas >= 2.0
#         spark_df = spark.createDataFrame([Row(**row) for row in df_sampled.to_dict(orient="records")])
# else:
#     spark_df = df_sampled

# # ✅ Cast label to integer (required for MLlib)
# spark_df = spark_df.withColumn("label", col("label").cast("integer"))
df_with_features = df_with_features.na.drop()
train_df, test_df = df_with_features.randomSplit([0.7, 0.3], seed=42)

# --------------------------------------
# Stage 1: feature extraction (tokenizer + cv + idf + assemblers)
# --------------------------------------
base_pipeline = Pipeline(stages=[tokenizer, cv, idf, vec1, vec2, vec3])
base_model = base_pipeline.fit(train_df)
train_feats = base_model.transform(train_df)
test_feats  = base_model.transform(test_df)

# --------------------------------------
# Stage 1: train three SVMs on LFS1/2/3
# --------------------------------------
svm1 = LinearSVC(featuresCol="features_lfs1", labelCol="label", predictionCol="pred1", rawPredictionCol="raw1")
svm2 = LinearSVC(featuresCol="features_lfs2", labelCol="label", predictionCol="pred2", rawPredictionCol="raw2")
svm3 = LinearSVC(featuresCol="features_lfs3", labelCol="label", predictionCol="pred3", rawPredictionCol="raw3")

svm1_model = svm1.fit(train_feats)
svm2_model = svm2.fit(train_feats)
svm3_model = svm3.fit(train_feats)

# Predictions from each SVM
preds1 = svm1_model.transform(test_feats).select("index","pred1")
preds2 = svm2_model.transform(test_feats).select("index","pred2")
preds3 = svm3_model.transform(test_feats).select("index","pred3")

# Join predictions + label
preds_all = reduce(
    lambda l,r: l.join(r, on="index"),
    [test_feats.select("index","label"), preds1, preds2, preds3]
)

# Majority vote UDF
vote_udf = F.udf(lambda p1,p2,p3: int(round((p1+p2+p3)/3.0)), IntegerType())
preds_all = preds_all.withColumn("stage1_vote", vote_udf("pred1","pred2","pred3"))

# --------------------------------------
# Stage 2: CV-only and TF-IDF-only models
# --------------------------------------
vec_cv = VectorAssembler(inputCols=["cv_features"], outputCol="cv_final")
vec_tfidf = VectorAssembler(inputCols=["tfidf_features"], outputCol="tfidf_final")

train_feats2 = vec_cv.transform(train_feats)
train_feats2 = vec_tfidf.transform(train_feats2)
test_feats2  = vec_cv.transform(test_feats)
test_feats2  = vec_tfidf.transform(test_feats2)

svm_cv = LinearSVC(featuresCol="cv_final", labelCol="label", predictionCol="pred_cv", rawPredictionCol="rawCV")
svm_tfidf = LinearSVC(featuresCol="tfidf_final", labelCol="label", predictionCol="pred_tfidf", rawPredictionCol="rawTFIDF")

svm_cv_model = svm_cv.fit(train_feats2)
svm_tfidf_model = svm_tfidf.fit(train_feats2)

preds_cv = svm_cv_model.transform(test_feats2).select("index","pred_cv")
preds_tfidf = svm_tfidf_model.transform(test_feats2).select("index","pred_tfidf")

# --------------------------------------
# Final voting: combine Stage1 vote + CV + TF-IDF
# --------------------------------------
final_preds = preds_all.join(preds_cv, on="index").join(preds_tfidf, on="index")

final_vote_udf = F.udf(lambda p1,p2,p3: int(round((p1+p2+p3)/3.0)), IntegerType())
final_preds = final_preds.withColumn("final_vote", final_vote_udf("stage1_vote","pred_cv","pred_tfidf"))

# --------------------------------------
# Evaluate accuracy
# --------------------------------------
accuracy = final_preds.filter(F.col("final_vote") == F.col("label")).count() / final_preds.count()
print("Final Accuracy:", accuracy)


25/08/18 17:48:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
25/08/18 17:48:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
Exception in thread "RemoteBlock-temp-file-clean-thread" java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.storage.BlockManager$RemoteBlockDownloadFileManager$$Lambda$836/0x000000030084d840.get$Lambda(Unknown Source)
	at java.base/java.lang.invoke.DirectMethodHandle$Holder.invokeStatic(DirectMethodHandle$Holder)
	at java.base/java.lang.invoke.LambdaForm$MH/0x0000000300062840.linkToTargetMethod(LambdaForm$MH)
	at org.apache.spark.storage.BlockManager$RemoteBlockDownloadFileManager.org$apache$spark$storage$BlockManager$RemoteBlockDownloadFileManager$$keepCleaning(BlockManager.scala:1940)
	at org.apache.spark.storage.BlockManager$RemoteBlockDownloadFileManager$$anon$2.run(BlockManager.scala:1906)


Py4JJavaError: An error occurred while calling o1127.count.
: java.lang.OutOfMemoryError: Java heap space
	at scala.collection.immutable.HashSet$HashTrieSet.updated0(HashSet.scala:557)
	at scala.collection.immutable.HashSet.$plus(HashSet.scala:84)
	at scala.collection.immutable.Set$Set4.$plus(Set.scala:198)
	at scala.collection.immutable.Set$Set4.$plus(Set.scala:192)
	at scala.collection.mutable.SetBuilder.$plus$eq(SetBuilder.scala:28)
	at scala.collection.mutable.SetBuilder.$plus$eq(SetBuilder.scala:24)
	at scala.collection.generic.Growable.$anonfun$$plus$plus$eq$1(Growable.scala:62)
	at scala.collection.generic.Growable$$Lambda$10/0x0000000300115840.apply(Unknown Source)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.SetBuilder.$plus$plus$eq(SetBuilder.scala:24)
	at scala.collection.TraversableLike.to(TraversableLike.scala:678)
	at scala.collection.TraversableLike.to$(TraversableLike.scala:675)
	at scala.collection.AbstractTraversable.to(Traversable.scala:108)
	at scala.collection.TraversableOnce.toSet(TraversableOnce.scala:309)
	at scala.collection.TraversableOnce.toSet$(TraversableOnce.scala:309)
	at scala.collection.AbstractTraversable.toSet(Traversable.scala:108)
	at org.apache.spark.sql.catalyst.trees.TreeNode.containsChild$lzycompute(TreeNode.scala:115)
	at org.apache.spark.sql.catalyst.trees.TreeNode.containsChild(TreeNode.scala:115)
	at org.apache.spark.sql.catalyst.trees.TreeNode.mapChild$1(TreeNode.scala:263)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$withNewChildren$4(TreeNode.scala:276)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$Lambda$1549/0x0000000300cbd040.apply(Unknown Source)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
	at scala.collection.TraversableLike$$Lambda$60/0x00000003001e3040.apply(Unknown Source)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at scala.collection.TraversableLike.map(TraversableLike.scala:238)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:231)


25/08/18 20:21:59 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1037087 ms exceeds timeout 120000 ms
25/08/18 20:21:59 WARN SparkContext: Killing executors is not supported by current scheduler.


In [10]:
import re
from textblob import TextBlob
import textstat
from nltk import word_tokenize, pos_tag
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, rand, row_number
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
from pyspark.sql.window import Window
from pyspark.sql import functions as F

train_ratio = 0.7
test_ratio = 0.3

# Randomly order within each class
w = Window.partitionBy("label").orderBy(F.rand(seed=42))
df_ranked = df_with_features.withColumn("row_num", F.row_number().over(w))

# Count per label
label_counts = df_with_features.groupBy("label").count().collect()
label_counts = {row["label"]: row["count"] for row in label_counts}

# Assign rows to train/test based on ratio
df_split = df_ranked.withColumn(
    "set",
    F.when(
        F.col("row_num") <= (F.lit(train_ratio) * F.lit(label_counts[0])).cast("int"),
        "train"
    ).when(
        (F.col("label") == 1) & (F.col("row_num") <= (F.lit(train_ratio) * F.lit(label_counts[1])).cast("int")),
        "train"
    ).otherwise("test")
)

train_df = df_split.filter(F.col("set") == "train").drop("row_num", "set")
test_df = df_split.filter(F.col("set") == "test").drop("row_num", "set")

# Verify stratification
train_df.groupBy("label").count().show()
test_df.groupBy("label").count().show()

25/08/18 17:43:18 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-----+-----+
|label|count|
+-----+-----+
|    1|   70|
|    0|   70|
+-----+-----+



                                                                                

+-----+-----+
|label|count|
+-----+-----+
|    1|   30|
|    0|   30|
+-----+-----+

