### Feature Extraction

In [1]:
import pandas as pd
import re
import textstat
import nltk
from textblob import TextBlob
from nltk import word_tokenize, pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bachtiarherdianto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bachtiarherdianto/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
def extract_features(text):
    try:
        tokens = word_tokenize(text)
        words = [w for w in tokens if w.isalpha()]
        sentences = re.split(r'[.!?]+', text)
        pos_tags = pos_tag(words)
        
        # Writing pattern
        num_special_chars = len(re.findall(r'[^a-zA-Z0-9\s]', text))
        num_determinants = sum(1 for w in words if w.lower() in ['the', 'a', 'an'])
        num_capital_letters = sum(1 for c in text if c.isupper())
        num_short_sentences = sum(1 for s in sentences if len(s.split()) < 10)
        num_long_sentences = sum(1 for s in sentences if len(s.split()) > 20)

        # Readability indices
        gunning_fog = textstat.gunning_fog(text)
        smog = textstat.smog_index(text)
        ari = textstat.automated_readability_index(text)

        # Psycholinguistics
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity
        subjectivity = blob.sentiment.subjectivity
        title_similarity = 0  # Optional, you can compute this with cosine or Jaccard if you have 'title'

        # Quantity
        num_syllables = textstat.syllable_count(text)
        num_words = len(words)
        num_sentences = len([s for s in sentences if s.strip()])
        num_adjectives = sum(1 for _, tag in pos_tags if tag in ['JJ', 'JJR', 'JJS'])
        num_adverbs = sum(1 for _, tag in pos_tags if tag in ['RB', 'RBR', 'RBS'])
        num_verbs = sum(1 for _, tag in pos_tags if tag.startswith('VB'))
        num_articles = sum(1 for w in words if w.lower() in ['a', 'an', 'the'])

        rate_adj_adv = (num_adjectives + num_adverbs) / num_words if num_words > 0 else 0
        words_per_sentence = num_words / num_sentences if num_sentences > 0 else 0

        return pd.Series([
            num_special_chars, num_determinants, num_capital_letters, num_short_sentences, num_long_sentences,
            gunning_fog, smog, ari,
            polarity, title_similarity, subjectivity,
            num_syllables, num_words, rate_adj_adv, words_per_sentence,
            num_articles, num_verbs, num_sentences, num_adjectives, num_adverbs
        ])
    except:
        return pd.Series([None]*20)

In [5]:
df.columns

Index(['index', 'cleaned_title', 'cleaned_text'], dtype='object')

In [36]:
df = pd.read_csv('cleaned_welfake.csv')

# # Sample 50% from each class
# df_sampled = df.groupby('label', group_keys=False).sample(frac=0.5, random_state=42)

# Sample 2 rows per class
df_sampled = df.groupby('label', group_keys=False).sample(n=10000, random_state=42)
df_sampled.head()

Unnamed: 0,index,cleaned_title,cleaned_text,label
21354,17322,Trump's choice for U.S. attorney general says ...,WASHINGTON (Reuters) - U.S. President-elect Do...,0
25012,21871,"Alison Wright, Exiled From 'The Americans' (Pe...",It took Alison Wright 34 years to land her fir...,0
9901,32704,Kurdistan supervisors begin counting votes in ...,"ERBIL, Iraq (Reuters) - Voting stations set up...",0
24177,35894,New Saudi king ascends to the throne as terror...,At 3 a.m. on a cold desert night earlier this ...,0
37552,24368,May shook on gentlemen's agreement on Brexit d...,BRUSSELS (Reuters) - An interim Brexit deal st...,0


In [37]:
print(df_sampled['label'].value_counts())

label
0    10000
1    10000
Name: count, dtype: int64


In [38]:
# Assuming your dataframe is called df and has a 'text' column
feature_columns = [
    'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
    'gunning_fog', 'smog', 'ari',
    'polarity', 'title_similarity', 'subjectivity',
    'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
    'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
]

# # Apply feature extraction and add prefix to the resulting columns
# df_sampled[text_feature_cols := ['text_' + col for col in feature_columns]] = (
#     df_sampled['cleaned_text'].apply(extract_features)
# )

# # Apply feature extraction and add prefix to the resulting columns
# df_sampled[title_feature_cols := ['title_' + col for col in feature_columns]] = (
#     df_sampled['cleaned_title'].apply(extract_features)
# )

df_sampled[feature_columns] = df_sampled['cleaned_text'].apply(extract_features)

df_sampled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 21354 to 55338
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                20000 non-null  int64  
 1   cleaned_title        20000 non-null  object 
 2   cleaned_text         20000 non-null  object 
 3   label                20000 non-null  int64  
 4   num_special_chars    20000 non-null  float64
 5   num_determinants     20000 non-null  float64
 6   num_capital_letters  20000 non-null  float64
 7   num_short_sentences  20000 non-null  float64
 8   num_long_sentences   20000 non-null  float64
 9   gunning_fog          20000 non-null  float64
 10  smog                 20000 non-null  float64
 11  ari                  20000 non-null  float64
 12  polarity             20000 non-null  float64
 13  title_similarity     20000 non-null  float64
 14  subjectivity         20000 non-null  float64
 15  num_syllables        20000 non-null  

### Modelling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
    AdaBoostClassifier,
)

In [30]:
feature_cols = text_feature_cols + title_feature_cols
# [
#     'num_special_chars', 'num_determinants', 'num_capital_letters', 'num_short_sentences', 'num_long_sentences',
#     'gunning_fog', 'smog', 'ari',
#     'polarity', 'title_similarity', 'subjectivity',
#     'num_syllables', 'num_words', 'rate_adj_adv', 'words_per_sentence',
#     'num_articles', 'num_verbs', 'num_sentences', 'num_adjectives', 'num_adverbs'
# ]

X = df_sampled[feature_cols]
y = df_sampled['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [31]:
models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Bagging': BaggingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    results[name] = {
        'accuracy': acc,
        'report': classification_report(y_test, y_pred, output_dict=True)
    }
    # print(f"✅ {name} Accuracy: {acc:.4f}")
    # print(classification_report(y_test, y_pred))

accuracy_df = pd.DataFrame([
    {'Model': name, 'Accuracy': result['accuracy']}
    for name, result in results.items()
])
print(accuracy_df.sort_values(by='Accuracy', ascending=False))

                 Model  Accuracy
6        Random Forest    0.8875
7    Gradient Boosting    0.8700
1                  SVM    0.8650
9  Logistic Regression    0.8650
4              Bagging    0.8575
8          Extra Trees    0.8525
3        Decision Tree    0.8375
5             AdaBoost    0.8300
0                  KNN    0.8125
2          Naive Bayes    0.7975


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Example: Define your LFS feature groups (replace with your actual 20 features split into 3 sets)
LFS1 = ['num_special_chars', 'num_determinants', 'num_capital_letters', 
        'gunning_fog', 'polarity', 'num_syllables']
LFS2 = ['num_short_sentences', 'smog', 'title_similarity',
        'subjectivity', 'num_words', 'rate_adj_adv']
LFS3 = ['num_long_sentences', 'ari', 'num_articles', 
        'num_verbs', 'num_sentences', 'words_per_sentence']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df, df['label'], test_size=0.3, random_state=42, stratify=df['label']
)

# Function to apply CV + LFS
def cv_over_lfs(X_train, X_test, lfs_cols):
    """Apply Count Vectorizer to text + concatenate LFS numeric features."""
    cv = CountVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_cv = cv.fit_transform(X_train['cleaned_text'])
    X_test_cv = cv.transform(X_test['cleaned_text'])

    # Scale LFS numeric features
    scaler = StandardScaler()
    X_train_lfs = scaler.fit_transform(X_train[lfs_cols])
    X_test_lfs = scaler.transform(X_test[lfs_cols])

    # Combine sparse CV with dense LFS
    X_train_combined = hstack([X_train_cv, X_train_lfs])
    X_test_combined = hstack([X_test_cv, X_test_lfs])
    
    return X_train_combined, X_test_combined

# Generate embedded sets
Xtr_LFS1, Xte_LFS1 = cv_over_lfs(X_train, X_test, LFS1)
Xtr_LFS2, Xte_LFS2 = cv_over_lfs(X_train, X_test, LFS2)
Xtr_LFS3, Xte_LFS3 = cv_over_lfs(X_train, X_test, LFS3)

# Define base model (SVM as per WELFake best performer)
svm1 = SVC(kernel='linear', probability=True, random_state=42)
svm2 = SVC(kernel='linear', probability=True, random_state=42)
svm3 = SVC(kernel='linear', probability=True, random_state=42)

# Fit each SVM
svm1.fit(Xtr_LFS1, y_train)
svm2.fit(Xtr_LFS2, y_train)
svm3.fit(Xtr_LFS3, y_train)

# Stage 1 voting: combine predictions from LFS1, LFS2, LFS3
stage1_vote = VotingClassifier(
    estimators=[
        ('svm1', svm1),
        ('svm2', svm2),
        ('svm3', svm3)
    ],
    voting='hard'
)
stage1_vote.fit(
    hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]),  # Stack features for VotingClassifier fit
    y_train
)

# Stage 1 predictions
P6_train = stage1_vote.predict(hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]))
P6_test = stage1_vote.predict(hstack([Xte_LFS1, Xte_LFS2, Xte_LFS3]))

# ----- Stage 2: Combine P6 with CV-only and TF-IDF-only -----
from sklearn.feature_extraction.text import TfidfVectorizer

# CV-only on full text
cv_full = CountVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_cv_full = cv_full.fit_transform(X_train['cleaned_text'])
Xte_cv_full = cv_full.transform(X_test['cleaned_text'])

# TF-IDF-only on full text
tfidf_full = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_tfidf_full = tfidf_full.fit_transform(X_train['cleaned_text'])
Xte_tfidf_full = tfidf_full.transform(X_test['cleaned_text'])

# Final stage voting: P6, CV, TF-IDF
final_vote = VotingClassifier(
    estimators=[
        ('cv_svm', SVC(kernel='linear', probability=True).fit(Xtr_cv_full, y_train)),
        ('tfidf_svm', SVC(kernel='linear', probability=True).fit(Xtr_tfidf_full, y_train)),
        ('lfs_vote', stage1_vote)
    ],
    voting='hard'
)

final_vote.fit(Xtr_cv_full, y_train)  # Fit on one set, predictions from others are internal

# Final prediction
final_preds = final_vote.predict(Xte_cv_full)

from sklearn.metrics import accuracy_score
print("Final Accuracy:", accuracy_score(y_test, final_preds))


In [35]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Example: Define your LFS feature groups (replace with your actual 20 features split into 3 sets)
LFS1 = ['num_special_chars', 'num_determinants', 'num_capital_letters', 
        'gunning_fog', 'polarity', 'num_syllables']
LFS2 = ['num_short_sentences', 'smog', 'title_similarity',
        'subjectivity', 'num_words', 'rate_adj_adv']
LFS3 = ['num_long_sentences', 'ari', 'num_articles', 
        'num_verbs', 'num_sentences', 'words_per_sentence']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled, df_sampled['label'], test_size=0.3, random_state=42, stratify=df_sampled['label']
)

# Function to apply CV + LFS
def cv_over_lfs(X_train, X_test, lfs_cols):
    """Apply Count Vectorizer to text + concatenate LFS numeric features."""
    cv = CountVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_cv = cv.fit_transform(X_train['cleaned_text'])
    X_test_cv = cv.transform(X_test['cleaned_text'])

    # Scale LFS numeric features
    scaler = StandardScaler()
    X_train_lfs = scaler.fit_transform(X_train[lfs_cols])
    X_test_lfs = scaler.transform(X_test[lfs_cols])

    # Combine sparse CV with dense LFS
    X_train_combined = hstack([X_train_cv, X_train_lfs])
    X_test_combined = hstack([X_test_cv, X_test_lfs])
    
    return X_train_combined, X_test_combined

# Generate embedded sets
Xtr_LFS1, Xte_LFS1 = cv_over_lfs(X_train, X_test, LFS1)
Xtr_LFS2, Xte_LFS2 = cv_over_lfs(X_train, X_test, LFS2)
Xtr_LFS3, Xte_LFS3 = cv_over_lfs(X_train, X_test, LFS3)

# Define base model (SVM as per WELFake best performer)
svm1 = SVC(kernel='linear', probability=True, random_state=42)
svm2 = SVC(kernel='linear', probability=True, random_state=42)
svm3 = SVC(kernel='linear', probability=True, random_state=42)

# Fit each SVM
svm1.fit(Xtr_LFS1, y_train)
svm2.fit(Xtr_LFS2, y_train)
svm3.fit(Xtr_LFS3, y_train)

# Stage 1 voting: combine predictions from LFS1, LFS2, LFS3
stage1_vote = VotingClassifier(
    estimators=[
        ('svm1', svm1),
        ('svm2', svm2),
        ('svm3', svm3)
    ],
    voting='hard'
)
stage1_vote.fit(
    hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]),  # Stack features for VotingClassifier fit
    y_train
)

# Stage 1 predictions
P6_train = stage1_vote.predict(hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]))
P6_test = stage1_vote.predict(hstack([Xte_LFS1, Xte_LFS2, Xte_LFS3]))

# ----- Stage 2: Combine P6 with CV-only and TF-IDF-only -----
from sklearn.feature_extraction.text import TfidfVectorizer

# CV-only on full text
cv_full = CountVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_cv_full = cv_full.fit_transform(X_train['cleaned_text'])
Xte_cv_full = cv_full.transform(X_test['cleaned_text'])

# TF-IDF-only on full text
tfidf_full = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_tfidf_full = tfidf_full.fit_transform(X_train['cleaned_text'])
Xte_tfidf_full = tfidf_full.transform(X_test['cleaned_text'])

# Final stage voting: P6, CV, TF-IDF
final_vote = VotingClassifier(
    estimators=[
        ('cv_svm', SVC(kernel='linear', probability=True).fit(Xtr_cv_full, y_train)),
        ('tfidf_svm', SVC(kernel='linear', probability=True).fit(Xtr_tfidf_full, y_train)),
        ('lfs_vote', stage1_vote)
    ],
    voting='hard'
)

final_vote.fit(Xtr_cv_full, y_train)  # Fit on one set, predictions from others are internal

# Final prediction
final_preds = final_vote.predict(Xte_cv_full)

from sklearn.metrics import accuracy_score
print("Final Accuracy:", accuracy_score(y_test, final_preds))


Final Accuracy: 0.8983333333333333


In [39]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Example: Define your LFS feature groups (replace with your actual 20 features split into 3 sets)
LFS1 = ['num_special_chars', 'num_determinants', 'num_capital_letters', 
        'gunning_fog', 'polarity', 'num_syllables']
LFS2 = ['num_short_sentences', 'smog', 'title_similarity',
        'subjectivity', 'num_words', 'rate_adj_adv']
LFS3 = ['num_long_sentences', 'ari', 'num_articles', 
        'num_verbs', 'num_sentences', 'words_per_sentence']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled, df_sampled['label'], test_size=0.3, random_state=42, stratify=df_sampled['label']
)

# Function to apply CV + LFS
def cv_over_lfs(X_train, X_test, lfs_cols):
    """Apply Count Vectorizer to text + concatenate LFS numeric features."""
    cv = CountVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_cv = cv.fit_transform(X_train['cleaned_text'])
    X_test_cv = cv.transform(X_test['cleaned_text'])

    # Scale LFS numeric features
    scaler = StandardScaler()
    X_train_lfs = scaler.fit_transform(X_train[lfs_cols])
    X_test_lfs = scaler.transform(X_test[lfs_cols])

    # Combine sparse CV with dense LFS
    X_train_combined = hstack([X_train_cv, X_train_lfs])
    X_test_combined = hstack([X_test_cv, X_test_lfs])
    
    return X_train_combined, X_test_combined

# Generate embedded sets
Xtr_LFS1, Xte_LFS1 = cv_over_lfs(X_train, X_test, LFS1)
Xtr_LFS2, Xte_LFS2 = cv_over_lfs(X_train, X_test, LFS2)
Xtr_LFS3, Xte_LFS3 = cv_over_lfs(X_train, X_test, LFS3)

# Define base model (SVM as per WELFake best performer)
svm1 = SVC(kernel='linear', probability=True, random_state=42)
svm2 = SVC(kernel='linear', probability=True, random_state=42)
svm3 = SVC(kernel='linear', probability=True, random_state=42)

# Fit each SVM
svm1.fit(Xtr_LFS1, y_train)
svm2.fit(Xtr_LFS2, y_train)
svm3.fit(Xtr_LFS3, y_train)

# Stage 1 voting: combine predictions from LFS1, LFS2, LFS3
stage1_vote = VotingClassifier(
    estimators=[
        ('svm1', svm1),
        ('svm2', svm2),
        ('svm3', svm3)
    ],
    voting='hard'
)
stage1_vote.fit(
    hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]),  # Stack features for VotingClassifier fit
    y_train
)

# Stage 1 predictions
P6_train = stage1_vote.predict(hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]))
P6_test = stage1_vote.predict(hstack([Xte_LFS1, Xte_LFS2, Xte_LFS3]))

# ----- Stage 2: Combine P6 with CV-only and TF-IDF-only -----
from sklearn.feature_extraction.text import TfidfVectorizer

# CV-only on full text
cv_full = CountVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_cv_full = cv_full.fit_transform(X_train['cleaned_text'])
Xte_cv_full = cv_full.transform(X_test['cleaned_text'])

# TF-IDF-only on full text
tfidf_full = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_tfidf_full = tfidf_full.fit_transform(X_train['cleaned_text'])
Xte_tfidf_full = tfidf_full.transform(X_test['cleaned_text'])

# Final stage voting: P6, CV, TF-IDF
final_vote = VotingClassifier(
    estimators=[
        ('cv_svm', SVC(kernel='linear', probability=True).fit(Xtr_cv_full, y_train)),
        ('tfidf_svm', SVC(kernel='linear', probability=True).fit(Xtr_tfidf_full, y_train)),
        ('lfs_vote', stage1_vote)
    ],
    voting='hard'
)

final_vote.fit(Xtr_cv_full, y_train)  # Fit on one set, predictions from others are internal

# Final prediction
final_preds = final_vote.predict(Xte_cv_full)

from sklearn.metrics import accuracy_score
print("Final Accuracy:", accuracy_score(y_test, final_preds))


Final Accuracy: 0.9235


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Example: Define your LFS feature groups (replace with your actual 20 features split into 3 sets)
LFS1 = ['num_special_chars', 'num_determinants', 'num_capital_letters', 
        'gunning_fog', 'polarity', 'num_syllables']
LFS2 = ['num_short_sentences', 'smog', 'title_similarity',
        'subjectivity', 'num_words', 'rate_adj_adv']
LFS3 = ['num_long_sentences', 'ari', 'num_articles', 
        'num_verbs', 'num_sentences', 'words_per_sentence']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled, df_sampled['label'], test_size=0.3, random_state=42, stratify=df_sampled['label']
)

# Function to apply CV + LFS
def cv_over_lfs(X_train, X_test, lfs_cols):
    """Apply Count Vectorizer to text + concatenate LFS numeric features."""
    cv = CountVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_cv = cv.fit_transform(X_train['cleaned_text'])
    X_test_cv = cv.transform(X_test['cleaned_text'])

    # Scale LFS numeric features
    scaler = StandardScaler()
    X_train_lfs = scaler.fit_transform(X_train[lfs_cols])
    X_test_lfs = scaler.transform(X_test[lfs_cols])

    # Combine sparse CV with dense LFS
    X_train_combined = hstack([X_train_cv, X_train_lfs])
    X_test_combined = hstack([X_test_cv, X_test_lfs])
    
    return X_train_combined, X_test_combined

# Generate embedded sets
Xtr_LFS1, Xte_LFS1 = cv_over_lfs(X_train, X_test, LFS1)
Xtr_LFS2, Xte_LFS2 = cv_over_lfs(X_train, X_test, LFS2)
Xtr_LFS3, Xte_LFS3 = cv_over_lfs(X_train, X_test, LFS3)

# Define base model (SVM as per WELFake best performer)
svm1 = SVC(kernel='linear', probability=True, random_state=42)
svm2 = SVC(kernel='linear', probability=True, random_state=42)
svm3 = SVC(kernel='linear', probability=True, random_state=42)

# Fit each SVM
svm1.fit(Xtr_LFS1, y_train)
svm2.fit(Xtr_LFS2, y_train)
svm3.fit(Xtr_LFS3, y_train)

# Stage 1 voting: combine predictions from LFS1, LFS2, LFS3
stage1_vote = VotingClassifier(
    estimators=[
        ('svm1', svm1),
        ('svm2', svm2),
        ('svm3', svm3)
    ],
    voting='hard'
)
stage1_vote.fit(
    hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]),  # Stack features for VotingClassifier fit
    y_train
)

# Stage 1 predictions
P6_train = stage1_vote.predict(hstack([Xtr_LFS1, Xtr_LFS2, Xtr_LFS3]))
P6_test = stage1_vote.predict(hstack([Xte_LFS1, Xte_LFS2, Xte_LFS3]))

# ----- Stage 2: Combine P6 with CV-only and TF-IDF-only -----
from sklearn.feature_extraction.text import TfidfVectorizer

# CV-only on full text
cv_full = CountVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_cv_full = cv_full.fit_transform(X_train['cleaned_text'])
Xte_cv_full = cv_full.transform(X_test['cleaned_text'])

# TF-IDF-only on full text
tfidf_full = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr_tfidf_full = tfidf_full.fit_transform(X_train['cleaned_text'])
Xte_tfidf_full = tfidf_full.transform(X_test['cleaned_text'])

# Final stage voting: P6, CV, TF-IDF
final_vote = VotingClassifier(
    estimators=[
        ('cv_svm', SVC(kernel='linear', probability=True).fit(Xtr_cv_full, y_train)),
        ('tfidf_svm', SVC(kernel='linear', probability=True).fit(Xtr_tfidf_full, y_train)),
        ('lfs_vote', stage1_vote)
    ],
    voting='hard'
)

final_vote.fit(Xtr_cv_full, y_train)  # Fit on one set, predictions from others are internal

# Final prediction
final_preds = final_vote.predict(Xte_cv_full)

from sklearn.metrics import accuracy_score
print("Final Accuracy:", accuracy_score(y_test, final_preds))
