In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from torchinfo import summary

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from wordcloud import WordCloud, STOPWORDS
import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm  # For progress bar

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, f1_score
import skfuzzy as fuzz
from skfuzzy import control as ctrl

import os

In [6]:
# Read the real and fake news CSVs, tagging each frame with its class label
# as it is loaded (isReal: 1 = real, 0 = fake).
df_real = pd.read_csv("data/real_test_20.csv").assign(isReal=1)
df_fake = pd.read_csv("data/fake_test_20.csv").assign(isReal=0)

# Stack the real articles on top of the fake ones under a fresh 0..n-1 index
df = pd.concat([df_real, df_fake], ignore_index=True)
df.head()

Unnamed: 0,title,text,subject,date,isReal
0,Colombian president says he left firm listed i...,BOGOTA (Reuters) - Colombian President Juan Ma...,worldnews,"November 6, 2017",1
1,Senate Democratic leader Schumer calls for spe...,NEW YORK (Reuters) - U.S. Senate Democratic le...,worldnews,"September 25, 2017",1
2,Libyan forces suffer casualties as fighting dr...,"BENGHAZI, Libya (Reuters) - Libyan forces figh...",worldnews,"December 11, 2017",1
3,"Factbox: Trump's policies on immigration, econ...",(Reuters) - Republican presidential candidate ...,politicsNews,"September 27, 2016",1
4,UK PM May promises to uphold N. Ireland peace ...,LONDON (Reuters) - British Prime Minister Ther...,worldnews,"December 8, 2017",1


In [7]:
# Merge headline and body into one text field, then shuffle the rows with a
# fixed seed (so the order is reproducible) and renumber the index from 0.
df = (
    df.assign(combine=df['title'] + ' ' + df['text'])
      .sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

# Fetch the NLTK stopword corpus (no-op when it is already present)
nltk.download("stopwords")

# English stopwords from NLTK; used by preprocess() below
stop_words = stopwords.words('english')

# Drop stopwords and every token of 3 or fewer characters
def preprocess(text):
    """Tokenize `text` with gensim and filter the tokens.

    A token produced by `gensim.utils.simple_preprocess` is kept only when it
    is longer than 3 characters and appears in neither gensim's STOPWORDS nor
    the module-level NLTK `stop_words` list.

    Args:
        text: The raw document string.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3
        and token not in gensim_stopwords
        and token not in stop_words
    ]

# Tokenize/filter every article, glue the surviving tokens back into one
# space-separated string, then drop the raw and intermediate columns.
df = (
    df.assign(clean=df['combine'].apply(preprocess))
      .assign(clean_joined=lambda frame: frame['clean'].map(" ".join))
      .drop(columns=['subject', 'title', 'text', 'combine', 'clean', 'date'])
)
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,isReal,clean_joined
0,0,actors quit ferguson play days opening want me...
1,1,body near wreckage russian helicopter svalbard...
2,0,consequences liberal tolerance isis flag hangi...
3,0,racists spew hate facebook black people hold a...
4,1,sinn fein eyes northern ireland power sharing ...


In [9]:
# Tokenizer for the 'bert-base-uncased' checkpoint -- the same checkpoint name
# that BertClassifier loads its encoder from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Convert the dataset to features as the input for Bert model
def convert_examples_to_features(texts, tokenizer, max_length=512):
    """Tokenize `texts` into fixed-length tensors for a BERT-style model.

    All texts are encoded in a single batched tokenizer call instead of one
    `encode_plus` call per row, and an empty input yields empty tensors
    rather than failing when concatenating an empty list.

    Args:
        texts: Iterable of strings (e.g. a pandas Series of documents).
        tokenizer: Hugging Face-style tokenizer; it is called with a list of
            strings and the keyword arguments shown below.
        max_length: Every sequence is truncated or padded to exactly this
            many tokens.

    Returns:
        tuple: `(input_ids, attention_masks)`, each a tensor of shape
        `(len(texts), max_length)`.
    """
    texts = list(texts)

    # Nothing to encode: return correctly shaped empty tensors
    if not texts:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',            # pad every row to max_length
        return_attention_mask=True,
        return_token_type_ids=False,
        truncation=True,
        return_tensors='pt',             # Output as PyTorch tensors directly
    )

    return encoded['input_ids'], encoded['attention_mask']

# Test-set inputs: cleaned article text, labels as a tensor, and the
# fixed-length BERT encodings of every article.
x_test = df['clean_joined']
y_test = torch.tensor(df['isReal'].values)

input_ids, attention_mask = convert_examples_to_features(
    texts=x_test,
    tokenizer=tokenizer,
    max_length=512,
)

In [11]:
# BERT-based binary classifier (PyTorch) whose encoder weights stay frozen

class BertClassifier(nn.Module):
    """Frozen 'bert-base-uncased' encoder followed by a small dense head.

    The head mean-pools the encoder's last hidden state over the sequence
    axis, applies dropout, a 256-unit ReLU layer, and a single sigmoid unit.
    """

    def __init__(self):
        super().__init__()

        # Pre-trained encoder, excluded from gradient updates
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        for weight in self.bert.parameters():
            weight.requires_grad_(False)

        # Classification head
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 256)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid score per input row, shape (batch_size, 1)."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden = encoder_out.last_hidden_state  # (batch_size, seq_length, hidden_size)

        # Average over the sequence axis; the pooling layer expects that axis
        # last, hence the swap. Every position contributes to the mean -- the
        # attention mask is not applied here.
        pooled = self.pool(hidden.transpose(1, 2)).squeeze(-1)  # (batch_size, hidden_size)

        features = self.relu(self.fc1(self.dropout(pooled)))
        return self.sigmoid(self.fc2(features))

In [12]:
# Load the saved classifier weights and score every article with them
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertClassifier()
# map_location remaps the checkpoint's tensors onto the current device, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load("bert_pretrain_classifier.pt", map_location=device))
model.to(device)
model.eval()

# Collect one probability per row and write the column once at the end,
# instead of updating the DataFrame cell-by-cell inside the loop.
real_probs = []

# Iterate over the text values directly so the loop does not depend on the
# DataFrame having a 0..n-1 index.
for text in tqdm(df['clean_joined'], total=len(df)):
    # Tokenize with truncation for BERT input.
    # padding='max_length' must stay: BertClassifier averages the encoder
    # output over every position without masking padding, so changing the
    # padded length would change the score.
    encoded = tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )

    with torch.no_grad():
        output = model(
            encoded['input_ids'].to(device),
            encoded['attention_mask'].to(device),
        )

    real_probs.append(output.squeeze().item())  # sigmoid output in [0, 1]

df['real_prob'] = pd.Series(real_probs, index=df.index, dtype=float)

100%|██████████| 8979/8979 [01:12<00:00, 123.91it/s]


In [13]:
# Load the subjectivity tokenizer and model.
# NOTE(review): this rebinds the global names `tokenizer` and `model`; any
# cell run after this one sees the subjectivity tokenizer/model under them.
tokenizer = AutoTokenizer.from_pretrained("GroNLP/mdebertav3-subjectivity-english")
model = AutoModelForSequenceClassification.from_pretrained("GroNLP/mdebertav3-subjectivity-english")

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Per-row class probabilities, written to the DataFrame once at the end
subj_scores = []
obj_scores = []

# Compute scores for every article
for text in tqdm(df['clean_joined'], total=len(df)):
    # Tokenize (truncated to 512 tokens) and move to device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax over the class axis; [0] selects the single row in the batch
    probs = torch.softmax(logits, dim=1)[0]

    # Index 1 is treated as "subjective" and index 0 as "objective".
    # NOTE(review): confirm this order against model.config.id2label.
    subj_scores.append(probs[1].item())
    # Previously obj_score was initialised to 0.0 and never filled in.
    obj_scores.append(probs[0].item())

df['subj_score'] = pd.Series(subj_scores, index=df.index, dtype=float)
df['obj_score'] = pd.Series(obj_scores, index=df.index, dtype=float)

100%|██████████| 8979/8979 [01:48<00:00, 82.51it/s]


In [None]:
# Token count per article (special tokens included) and a flag for articles
# longer than 512 tokens, the max_length used in the tokenizer calls above.
# NOTE(review): `tokenizer` is whatever the global name is bound to when this
# cell runs -- confirm it is the tokenizer whose limit you mean to check.
df['token_count'] = df['clean_joined'].map(
    lambda doc: len(tokenizer.encode(doc, add_special_tokens=True))
)
df['was_truncated'] = df['token_count'].gt(512)