# NLP 2025 PROJECT : FAKE NEWS DETECTION

In [2]:
import torch

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
print(device)

cuda


In [5]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## Loading the data

In [43]:
true_path = 'True.csv'
fake_path = 'Fake.csv'

true_df = pd.read_csv(true_path)
fake_df = pd.read_csv(fake_path)

Adding a label and combining the two datasets :

In [44]:
# Adding labels 1 for real, 0 for fake
true_df['label'] = 1
fake_df['label'] = 0

# Combining into one Dataframe
df = pd.concat([true_df, fake_df], ignore_index=True)

# Shuffle the data
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Only keeping first 10000 samples to speed up computations
df = df[:10000]

In [45]:
df.shape

(10000, 5)

## Cleaning the data

In [46]:
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/onyxia/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### 1) Heavy cleaning for Bag-of-words and TDIDF (term frequency-inverse document frequency)

In [47]:
def heavy_cleaning(text):
    # Putting all text in lowercase
    text = text.lower()
    # Removing URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Removing Twitter handles
    text = re.sub(r'@\w+', '', text)
    # Removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Removing numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = nltk.word_tokenize(text)
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [48]:
# Applying the cleaning
df['text_heavy_cleaned'] = df['text'].apply(heavy_cleaning)

In [12]:
df.head()

Unnamed: 0,title,text,subject,date,label,text_heavy_cleaned
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",0,donald trump white house chaos trying cover ru...
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",0,donald trump presumptive gop nominee time reme...
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",0,mike penny huge homophobe support exgay conver...
3,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",1,san francisco reuters california attorney gene...
4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",0,twisted reasoning come pelosi day especially p...


### 2) Light cleaning for Word2Vec and BERT

In [49]:
def light_cleaning(text):
    # putting all text in lowercase
    text = text.lower()
    # Removing URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Removing Twitter handles
    text = re.sub(r'@\w+', '', text)
    # Removing excessive whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [50]:
# Applying the cleaning
df['text_light_cleaned'] = df['text'].apply(light_cleaning)

In [51]:
df.head()

Unnamed: 0,title,text,subject,date,label,text_heavy_cleaned,text_light_cleaned
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",0,donald trump white house chaos trying cover ru...,"donald trump s white house is in chaos, and th..."
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",0,donald trump presumptive gop nominee time reme...,now that donald trump is the presumptive gop n...
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",0,mike penny huge homophobe support exgay conver...,mike pence is a huge homophobe. he supports ex...
3,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",1,san francisco reuters california attorney gene...,san francisco (reuters) - california attorney ...
4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",0,twisted reasoning come pelosi day especially p...,twisted reasoning is all that comes from pelos...


## Feature extraction

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel
import torch
from textblob import TextBlob
from nltk import pos_tag, word_tokenize
nltk.download('averaged_perceptron_tagger')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/onyxia/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

### 1) Bag-of-words

In [52]:
# Using heavy cleaned text
bow_vectorizer = CountVectorizer(max_features=5000)  # Limit vocab size for efficiency
X_bow = bow_vectorizer.fit_transform(df['text_heavy_cleaned'])

In [53]:
X_bow

<10000x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 1279672 stored elements in Compressed Sparse Row format>

### 2) TF-IDF

In [54]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['text_heavy_cleaned'])

In [55]:
X_tfidf

<10000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1279672 stored elements in Compressed Sparse Row format>

### 3) Word-to-Vec

In [56]:
# Tokenize the light-cleaned text for Word2Vec
df['tokens'] = df['text_light_cleaned'].apply(word_tokenize)

# Train a Word2Vec model on your corpus
w2v_model = Word2Vec(
    sentences=df['tokens'],
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=1,
    seed=42
)

# Function to get average Word2Vec embedding for a document, to prepare the input for the machine learning model
def get_avg_word2vec(tokens, model, vector_size=100):
    vecs = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(vector_size)

# Apply to your dataset
X_w2v = np.vstack(df['tokens'].apply(lambda tokens: get_avg_word2vec(tokens, w2v_model)))

In [57]:
df.shape

(10000, 8)

In [58]:
X_w2v.shape

(10000, 100)

### 4) BERT Embeddings

In [59]:
import hf_xet
from tqdm import tqdm

In [60]:
# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.to(device)
bert_model.eval()

def get_bert_embeddings(texts, batch_size=32):
    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = bert_model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(embeddings)
    return np.vstack(all_embeddings)

In [61]:
X_bert = get_bert_embeddings(df['text_light_cleaned'].tolist())

100%|██████████| 313/313 [06:14<00:00,  1.20s/it]


### 5) Linguistic cues

In [37]:
import string
import re
from collections import Counter
import textstat

In [62]:
nltk.download('averaged_perceptron_tagger_eng')

# Defining linguistic categories according to table 1 from the article
PERSONAL_PRONOUNS = {"i", "we", "she", "him", "me", "us", "her", "he"}
FIRST_PERSON_SINGULAR = {"i", "me"}
FIRST_PERSON_PLURAL = {"we", "us"}
SECOND_PERSON = {"you", "your"}
THIRD_PERSON_SINGULAR = {"she", "he", "her", "him"}
IMPERSONAL_PRONOUNS = {"it", "that", "anything", "everything", "something"}
ARTICLES = {"a", "an", "the"}
PREPOSITIONS = {"above", "below", "near", "under", "over", "behind", "beyond", "through", "among", "within", "without", "across", "against", "along", "around", "at", "before", "by", "during", "except", "for", "from", "in", "into", "of", "off", "on", "to", "until", "up", "with"}
AUXILIARY_VERBS = {"am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does", "did"}
COMMON_ADVERBS = {"just", "usually", "even", "only", "also", "still", "already"}
CONJUNCTIONS = {"and", "but", "or", "yet", "so", "for", "nor", "although", "because", "since", "unless", "until", "while"}
NEGATIONS = {"no", "not", "never", "none", "nothing", "nowhere"}
COMMON_VERBS = {
    "run", "walk", "swim", "go", "come", "make", "do", "say", "get", 
    "see", "know", "think", "be", "have", "take", "look", "want", 
    "give", "use", "find", "tell", "ask", "work", "seem", "feel", 
    "try", "leave", "call"
}   # according to https://www.englishclub.com/vocabulary/common-verbs-25.php

COMMON_ADJECTIVES = {
    "better", "greater", "larger", "good", "bad", "happy", "sad", 
    "new", "old", "young", "small", "big", "first", "last", "long", 
    "great", "little", "own", "other", "right", "high", "different", 
    "large", "next", "early", "important", "few", "public", "same", "able"
}   # according to https://www.englishclub.com/vocabulary/common-adjectives-25.php

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/onyxia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Extracting main linguistic features, sentiment analysis and a few others have been commented to make the code run in decent time

In [64]:
def extract_linguistic_features(text):
    features = {}
    # blob = TextBlob(text)
    tokens = word_tokenize(text.lower())
    tags = pos_tag(tokens)
    token_count = Counter(tokens)
    total_words = len(tokens)
    total_chars = len(text)
    # total_sentences = len(blob.sentences)
    features['char_count'] = total_chars
    features['word_count'] = total_words
    # features['sentence_count'] = total_sentences
    # features['syllables_count'] = textstat.syllable_count(text)
    # features['words_per_sentence'] = total_words / total_sentences if total_sentences > 0 else 0
    features['long_words_count'] = sum(1 for w in tokens if len(w) > 6)
    features['all_caps_count'] = sum(1 for w in tokens if w.isupper())
    features['unique_words_count'] = len(set(tokens))
    features['avg_word_length'] = np.mean([len(w) for w in tokens]) if tokens else 0

    # Percentage-based linguistic categories
    def count_percent(word_set):
        return sum(1 for w in tokens if w in word_set) / total_words if total_words > 0 else 0

    features['personal_pronouns_pct'] = count_percent(PERSONAL_PRONOUNS)
    features['first_person_singular_pct'] = count_percent(FIRST_PERSON_SINGULAR)
    features['first_person_plural_pct'] = count_percent(FIRST_PERSON_PLURAL)
    features['second_person_pct'] = count_percent(SECOND_PERSON)
    features['third_person_singular_pct'] = count_percent(THIRD_PERSON_SINGULAR)
    features['impersonal_pronouns_pct'] = count_percent(IMPERSONAL_PRONOUNS)
    features['articles_pct'] = count_percent(ARTICLES)
    features['prepositions_pct'] = count_percent(PREPOSITIONS)
    features['auxiliary_verbs_pct'] = count_percent(AUXILIARY_VERBS)
    features['common_adverbs_pct'] = count_percent(COMMON_ADVERBS)
    features['conjunctions_pct'] = count_percent(CONJUNCTIONS)
    features['negations_pct'] = count_percent(NEGATIONS)
    features['common_verbs_pct'] = count_percent(COMMON_VERBS)
    features['common_adjectives_pct'] = count_percent(COMMON_ADJECTIVES)

    # POS Tag counts
    features['noun_count'] = sum(1 for _, tag in tags if tag.startswith('NN'))
    features['pronoun_count'] = sum(1 for _, tag in tags if tag.startswith('PRP'))

    # Sentiment
    # features['sentiment_polarity'] = blob.sentiment.polarity

    # Punctuation
    features['punctuation_count'] = sum(1 for c in text if c in string.punctuation)
    features['fullstop_count'] = text.count('.')
    features['comma_count'] = text.count(',')
    features['colon_count'] = text.count(':')
    features['semicolon_count'] = text.count(';')
    features['question_mark_count'] = text.count('?')
    features['exclamation_mark_count'] = text.count('!')
    features['dash_count'] = text.count('-')
    features['apostrophe_count'] = text.count("'")
    features['brackets_count'] = text.count('(') + text.count(')')

    return pd.Series(features)

# Applying to our data
linguistic_df = df['text'].apply(extract_linguistic_features)

In [65]:
linguistic_df.head()

Unnamed: 0,char_count,word_count,long_words_count,all_caps_count,unique_words_count,avg_word_length,personal_pronouns_pct,first_person_singular_pct,first_person_plural_pct,second_person_pct,...,punctuation_count,fullstop_count,comma_count,colon_count,semicolon_count,question_mark_count,exclamation_mark_count,dash_count,apostrophe_count,brackets_count
0,2114.0,402.0,73.0,0.0,208.0,4.348259,0.007463,0.0,0.002488,0.0,...,50.0,16.0,26.0,2.0,0.0,0.0,0.0,3.0,0.0,2.0
1,2823.0,580.0,87.0,0.0,298.0,4.005172,0.034483,0.003448,0.010345,0.005172,...,93.0,21.0,44.0,10.0,0.0,3.0,1.0,4.0,1.0,4.0
2,2402.0,428.0,108.0,0.0,236.0,4.703271,0.021028,0.0,0.0,0.0,...,72.0,21.0,22.0,2.0,0.0,0.0,0.0,10.0,0.0,4.0
3,629.0,105.0,36.0,0.0,75.0,5.133333,0.009524,0.0,0.0,0.0,...,10.0,3.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
4,793.0,145.0,29.0,0.0,99.0,4.510345,0.027586,0.0,0.0,0.0,...,10.0,5.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Testing our model of each feature set

In [66]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import joblib
import os

In [67]:
y = df['label']

# Define feature sets
feature_sets = {
    "Bag of Words": X_bow,
    "TF-IDF": X_tfidf,
    "Word2Vec": X_w2v,
    "BERT": X_bert,
    "Linguistic": linguistic_df
}

results = {}

for name, X in feature_sets.items():
    print(f"\n--- Training with {name} features ---")
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Model training
    clf = GradientBoostingClassifier(verbose=1)
    clf.fit(X_train, y_train)

    # Create a directory to save and store models
    os.makedirs("models", exist_ok=True)
    model_path = f"models/gbc_{name.replace(' ', '_').lower()}.joblib"
    joblib.dump(clf, model_path)
    print(f"Saved model: {model_path}")

    # Prediction
    y_pred = clf.predict(X_test)

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Store results
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

results_df = pd.DataFrame(results).T
print("\n=== Model Evaluation Results ===")
print(results_df.round(4))


--- Training with Bag of Words features ---
      Iter       Train Loss   Remaining Time 
         1           1.1978           10.34s
         2           1.0453            9.86s
         3           0.9182            9.39s
         4           0.8108            9.03s
         5           0.7191            8.83s
         6           0.6399            8.65s
         7           0.5715            8.61s
         8           0.5117            8.47s
         9           0.4595            8.32s
        10           0.4131            8.25s
        20           0.1602            7.22s
        30           0.0763            6.30s
        40           0.0467            5.31s
        50           0.0407            4.23s
        60           0.0378            3.28s
        70           0.0324            2.40s
        80           0.0301            1.57s
        90           0.0275            0.78s
       100           0.0247            0.00s
Saved model: models/gbc_bag_of_words.joblib

--- Train

## Testing generalizability

Using the fake or real news dataset that can be found here : https://www.kaggle.com/datasets/jillanisofttech/fake-or-real-news

Loading models if needed :

In [106]:
# model_names = ["bag_of_words", "tf-idf", "word2vec", "bert", "linguistic"]
name = "linguistic"
loaded_models = {}
path = f"models/gbc_{name}.joblib"
loaded_models[name] = joblib.load(path)
print(f"Loaded model: {path}")

Loaded model: models/gbc_linguistic.joblib


In [72]:
new_data_path = 'fake_or_real_news.csv'

new_data = pd.read_csv(new_data_path)

In [75]:
new_data.shape

(6335, 4)

In [73]:
new_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [76]:
new_data['text_heavy_cleaned'] = new_data['text'].apply(heavy_cleaning)

In [78]:
new_data['text_light_cleaned'] = new_data['text'].apply(light_cleaning)

In [80]:
new_data['label_numeric'] = new_data['label'].map({'REAL': 1, 'FAKE': 0})

In [81]:
new_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,text_heavy_cleaned,text_light_cleaned,label_numeric
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,daniel greenfield shillman journalism fellow f...,"daniel greenfield, a shillman journalism fello...",0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,google pinterest digg linkedin reddit stumbleu...,google pinterest digg linkedin reddit stumbleu...,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,u secretary state john f kerry said monday sto...,u.s. secretary of state john f. kerry said mon...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,— kaydee king november lesson tonight dem loss...,"— kaydee king () november 9, 2016 the lesson f...",0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,primary day new york frontrunners hillary clin...,it's primary day in new york and front-runners...,1


### 1) Testing generalizability of bag_of_words based model

In [82]:
# Using heavy cleaned text
bow_vectorizer = CountVectorizer(max_features=5000)  # Limit vocab size for efficiency
X_bow_new = bow_vectorizer.fit_transform(new_data['text_heavy_cleaned'])

In [83]:
y_true_new = new_data['label_numeric']
y_pred_new = loaded_models["bag_of_words"].predict(X_bow_new)

# Evaluate performance
accuracy = accuracy_score(y_true_new, y_pred_new)
precision = precision_score(y_true_new, y_pred_new)
recall = recall_score(y_true_new, y_pred_new)
f1 = f1_score(y_true_new, y_pred_new)

print("Evaluation on new data (X_bow_new):")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")

Evaluation on new data (X_bow_new):
Accuracy : 0.5122
Precision: 0.6738
Recall   : 0.0495
F1 Score : 0.0922


### 2) Testing generalizability of TF-IDF based model

In [86]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf_new = tfidf_vectorizer.fit_transform(new_data['text_heavy_cleaned'])

In [97]:
y_true_new = new_data['label_numeric']
y_pred_new = loaded_models["tf-idf"].predict(X_tfidf_new)

# Evaluate performance
accuracy = accuracy_score(y_true_new, y_pred_new)
precision = precision_score(y_true_new, y_pred_new)
recall = recall_score(y_true_new, y_pred_new)
f1 = f1_score(y_true_new, y_pred_new)

print("Evaluation on new data (X_tfidf_new):")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")

Evaluation on new data (X_bow_new):
Accuracy : 0.5121
Precision: 0.6835
Recall   : 0.0470
F1 Score : 0.0879


### 3) Testing generalizability of Word-to-vec based model

In [91]:
# Tokenize the light-cleaned text for Word2Vec
new_data['tokens'] = new_data['text_light_cleaned'].apply(word_tokenize)

# Train a Word2Vec model on your corpus
w2v_model = Word2Vec(
    sentences=df['tokens'],
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=1,
    seed=42
)

# Function to get average Word2Vec embedding for a document, to prepare the input for the machine learning model
def get_avg_word2vec(tokens, model, vector_size=100):
    vecs = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(vector_size)

# Apply to your dataset
X_w2v_new = np.vstack(new_data['tokens'].apply(lambda tokens: get_avg_word2vec(tokens, w2v_model)))

In [101]:
y_true_new = new_data['label_numeric']
y_pred_new = loaded_models["word2vec"].predict(X_w2v_new)

# Evaluate performance
accuracy = accuracy_score(y_true_new, y_pred_new)
precision = precision_score(y_true_new, y_pred_new)
recall = recall_score(y_true_new, y_pred_new)
f1 = f1_score(y_true_new, y_pred_new)

print("Evaluation on new data (X_w2v_new):")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")

Evaluation on new data (X_w2v_new):
Accuracy : 0.5001
Precision: 1.0000
Recall   : 0.0013
F1 Score : 0.0025


### 4) Testing generalizability of BERT based model

In [102]:
X_bert_new = get_bert_embeddings(new_data['text_light_cleaned'].tolist())

100%|██████████| 198/198 [04:13<00:00,  1.28s/it]


In [104]:
y_true_new = new_data['label_numeric']
y_pred_new = loaded_models["bert"].predict(X_bert_new)

# Evaluate performance
accuracy = accuracy_score(y_true_new, y_pred_new)
precision = precision_score(y_true_new, y_pred_new)
recall = recall_score(y_true_new, y_pred_new)
f1 = f1_score(y_true_new, y_pred_new)

print("Evaluation on new data (X_bert_new):")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")

Evaluation on new data (X_bert_new):
Accuracy : 0.6586
Precision: 0.7238
Recall   : 0.5140
F1 Score : 0.6011


### 5) Testing generalizability of Linguistic cues based model

In [105]:
linguistic_df = new_data['text'].apply(extract_linguistic_features)

In [108]:
y_true_new = new_data['label_numeric']
y_pred_new = loaded_models["linguistic"].predict(linguistic_df)

# Evaluate performance
accuracy = accuracy_score(y_true_new, y_pred_new)
precision = precision_score(y_true_new, y_pred_new)
recall = recall_score(y_true_new, y_pred_new)
f1 = f1_score(y_true_new, y_pred_new)

print("Evaluation on new data (linguistic_df):")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")

Evaluation on new data (linguistic_df):
Accuracy : 0.5634
Precision: 0.6576
Recall   : 0.2665
F1 Score : 0.3793
