In [1]:
!pip install torchtext==0.6.0

!pip install -U pip setuptools wheel

!pip install -U spacy==2.3.5

!python -m spacy download en_core_web_sm

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m778.9 kB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from torchtext==0.6.0)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.16.0
    Uninstalling torchtext-0.16.0:
      Successfully uninstalled torchtext-0.16.0
Successfully installed sentencepiece-0.1.99 torchtext-0.6.0
Collecting pip
  Downloading pip-23.3.2-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting setuptools
  Downloading setuptools-69.0.3-py3

Collecting spacy==2.3.5
  Downloading spacy-2.3.5.tar.gz (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting thinc<7.5.0,>=7.4.1 (from spacy==2.3.5)
  Using cached thinc-7.4.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
Collecting wasabi<1.1.0,>=0.4.0 (from spacy==2.3.5)
  Using cached wasabi-0.10.1-py3-none-any.whl (26 kB)
Collecting srsly<1.1.0,>=1.0.2 (from spacy==2.3.5)
  Using cached srsly-1.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting catalogue<1.1.0,>=0.0.7 (from spacy==2.3.5)
  Using cached catalogue-1.0.2-py2.py3-none-any.whl (16 kB)
Collecting plac<1.2.0,>=0.9.6 (from spacy==2.3.5)
  Using cach

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data import Field, TabularDataset, BucketIterator, LabelField
import spacy
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import random
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import nltk
from nltk.tokenize import word_tokenize
import time

In [None]:
!pip install gdown==4.6.0
!gdown --folder https://drive.google.com/drive/u/1/folders/15Wn46r7gidaiZbx2ArFYsd7rjYH4y7JM

In [None]:
# Set random seed for reproducibility
seed = 1234
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# load the liar dataset
csv_path_liar_train = os.path.join( '/content', 'liar_dataset', 'train.tsv')
csv_path_liar_test = os.path.join( '/content', 'liar_dataset', 'test.tsv')

df_liar_train = pd.read_csv(csv_path_liar_train, sep='\t', header=None)
df_liar_test = pd.read_csv(csv_path_liar_test, sep='\t', header=None)

df_liar_train.columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']
df_liar_test.columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']

df_liar_train = df_liar_train[['label', 'statement']]
df_liar_test = df_liar_test[['label', 'statement']]
df_liar_train = df_liar_train.dropna()
df_liar_test = df_liar_test.dropna()


# save the train and test sets to csv files
df_liar_train.to_csv('train.csv', index=False)
df_liar_test.to_csv('test.csv', index=False)

In [None]:
nlp = spacy.load("en_core_web_sm")
def tokenize_spacy(text):
    return [token.text for token in nlp(text)]


# define the fields
TEXT = Field(tokenize=tokenize_spacy, include_lengths=True, batch_first=True)
LABEL = LabelField(dtype=torch.float, batch_first = True)

fields = [('text', TEXT), ('label', LABEL)]
train_data, test_data = TabularDataset.splits(
    path='./',
    train='train.csv',
    test='test.csv',
    format='csv',
    skip_header=True,
    fields=[('label', LABEL), ('text', TEXT)]
)

# split the train data into train and validation sets

train_data, valid_data = train_data.split(random_state=random.seed(seed))

# build the vocabulary
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data,
                    max_size=MAX_VOCAB_SIZE,
                    vectors="glove.6B.100d",
                    unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)


# Define iterator
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device
    )

.vector_cache/glove.6B.zip: 862MB [02:39, 5.40MB/s]                           
100%|█████████▉| 399999/400000 [00:21<00:00, 18863.02it/s]


In [None]:
# Define the transformer model
class Transformer(nn.Module):
    def __init__(self, input_dim, emb_dim, nhead, hid_dim, n_layers, output_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.transformer = nn.Transformer(
            d_model=emb_dim,
            nhead=nhead,
            num_encoder_layers=n_layers,
            num_decoder_layers=n_layers,
            dim_feedforward=hid_dim,
            dropout=dropout
        )
        self.fc = nn.Linear(emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
      embedded = self.embedding(text)

      # If the input is sparse, convert it to a dense tensor
      if isinstance(embedded, torch.sparse.FloatTensor):
          embedded = embedded.to_dense()

      embedded = embedded.permute(1, 0, 2)

      output = self.transformer(embedded, embedded)
      output = output.mean(dim=0)
      output = self.fc(output)

      return F.log_softmax(output, dim=1)

In [None]:
# Initialize the model, optimizer, and loss function
INPUT_DIM = len(TEXT.vocab)
EMB_DIM = 100
NHEAD = 4
HID_DIM = 256
N_LAYERS = 2
OUTPUT_DIM = 6
DROPOUT = 0.5

model = Transformer(INPUT_DIM, EMB_DIM, NHEAD, HID_DIM, N_LAYERS, OUTPUT_DIM, DROPOUT)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Send the model to GPU if available

model = model.to(device)
criterion = criterion.to(device)



In [None]:
# Training loop
# define the training function

def categorical_accuracy(preds, y):
    top_pred = preds.argmax(1, keepdim=True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    return correct.float() / y.shape[0]

def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in tqdm(iterator):

        optimizer.zero_grad()


        text, text_lengths = batch.text

        predictions = model(text, text_lengths).squeeze(1)

        loss = criterion(predictions, batch.label.long())

        acc = categorical_accuracy(predictions, batch.label.long())

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)
# Evaluation loop
def evaluate(model, iterator, criterion):

        epoch_loss = 0
        epoch_acc = 0

        all_predictions = []
        all_labels = []


        model.eval()

        with torch.no_grad():

            for batch in tqdm(iterator):

                text, text_lengths = batch.text

                predictions = model(text, text_lengths).squeeze(1)

                loss = criterion(predictions, batch.label.long())

                acc = categorical_accuracy(predictions, batch.label.long())

                epoch_loss += loss.item()
                epoch_acc += acc.item()

                # calculate precision, recall and f1 score
                y_pred = predictions.argmax(1, keepdim=True)
                y_pred = y_pred.squeeze(1)
                y_true = batch.label.long()
                #y_true = y_true.squeeze(1)

                all_predictions.extend(y_pred.cpu().numpy())
                all_labels.extend(batch.label.long().cpu().numpy())

        precision = precision_score(all_labels, all_predictions, average='weighted', zero_division=True)
        recall = recall_score(all_labels, all_predictions, average='weighted', zero_division=True)
        f1 = f1_score(all_labels, all_predictions, average='weighted')


        return epoch_loss / len(iterator), epoch_acc / len(iterator), precision, recall, f1

In [None]:
# Training and evaluation
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    train(model, train_iterator, optimizer, criterion)
    test_loss, test_accuracy, _, _, _ = evaluate(model, test_iterator, criterion)
    print(f'Epoch: {epoch+1}, Test Loss: {test_loss:.3f}, Test Accuracy: {test_accuracy:.3f}')

# Save the trained model
torch.save(model.state_dict(), 'fake_news_transformer_model.pth')

100%|██████████| 112/112 [00:03<00:00, 30.03it/s]
100%|██████████| 20/20 [00:00<00:00, 138.61it/s]


Epoch: 1, Test Loss: 1.761, Test Accuracy: 0.219


100%|██████████| 112/112 [00:02<00:00, 55.57it/s]
100%|██████████| 20/20 [00:00<00:00, 131.02it/s]


Epoch: 2, Test Loss: 1.804, Test Accuracy: 0.204


100%|██████████| 112/112 [00:02<00:00, 53.20it/s]
100%|██████████| 20/20 [00:00<00:00, 137.10it/s]


Epoch: 3, Test Loss: 1.802, Test Accuracy: 0.212


100%|██████████| 112/112 [00:01<00:00, 58.92it/s]
100%|██████████| 20/20 [00:00<00:00, 136.91it/s]


Epoch: 4, Test Loss: 1.827, Test Accuracy: 0.228


100%|██████████| 112/112 [00:02<00:00, 47.95it/s]
100%|██████████| 20/20 [00:00<00:00, 105.16it/s]


Epoch: 5, Test Loss: 1.987, Test Accuracy: 0.211


In [None]:
# evaluate the model on the test set
test_loss, test_acc, precision, recall, f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Precision: {precision} | Recall: {recall} | F1: {f1}')

# save the results to a csv file
results = pd.DataFrame([[test_acc, precision, recall, f1]], columns=['accuracy', 'precision', 'recall', 'f1'], index=['Transformer'])
results.to_csv('results_liar_transformer.csv')

100%|██████████| 20/20 [00:00<00:00, 95.99it/s]

Test Loss: 1.987 | Test Acc: 21.08% | Precision: 0.16976947159985484 | Recall: 0.2107340173638516 | F1: 0.17456077812545948





In [2]:
!pip install gdown==4.6.0
!gdown --folder https://drive.google.com/drive/u/1/folders/1wf7mFLCqQo0t802IDkZKMOinciUwohuR

Collecting gdown==4.6.0
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.6.6
    Uninstalling gdown-4.6.6:
      Successfully uninstalled gdown-4.6.6
Successfully installed gdown-4.6.0
[0mRetrieving folder list
Processing file 11UvyoobnRVXsNkjCsRl848mdYN0Yi18K WELFake_Dataset.csv
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=11UvyoobnRVXsNkjCsRl848mdYN0Yi18K
To: /content/WELFake/WELFake_Dataset.csv
100% 245M/245M [00:23<00:00, 10.4MB/s]
Download completed


In [3]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


# set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')

nltk.download('punkt')
def tokenize(text):
    return word_tokenize(text)



# define the fields
TEXT = Field(tokenize=tokenize, include_lengths=True, unk_token='<unk>', batch_first=True)
LABEL = LabelField(dtype=torch.float)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# load the welfake dataset

csv_path_welfake = os.path.join( '/content', 'WELFake', 'WELFake_Dataset.csv')

df = pd.read_csv(csv_path_welfake)
df = df.drop(['Unnamed: 0', 'title'], axis=1)
df.columns = ['text', 'label']
df['label'] = df['label'].replace('fake', 0)
df['label'] = df['label'].replace('real', 1)
df.to_csv('.//welfake.csv', index=False)
# drop the rows with np.nan values on text column
df = df.dropna(subset=['text'])
df = df[df['text'].str.len() > 30]

# split the dataset into train, validation and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED)


####### DEBUG

#train_df, test_df = train_test_split(test_df, test_size=0.2, random_state=SEED)

# save the train, validation and test sets as csv files
train_df.to_csv('.//welfake_train.csv', index=False)
test_df.to_csv('.//welfake_test.csv', index=False)

In [5]:
# load the data
train_data, test_data = TabularDataset.splits(
    path='./',
    train='welfake_train.csv',
    test='welfake_test.csv',
    format='csv',
    fields=[('text', TEXT), ('label', LABEL)]
)

# split the train data into train and validation sets

train_data, valid_data = train_data.split(random_state=random.seed(SEED))

In [6]:
# build the vocabulary
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data,
                    max_size=MAX_VOCAB_SIZE,
                    vectors="glove.6B.100d",
                    unk_init=torch.Tensor.normal_)

LABEL.build_vocab(train_data)



.vector_cache/glove.6B.zip: 862MB [02:42, 5.32MB/s]                           
100%|█████████▉| 399999/400000 [00:20<00:00, 19502.90it/s]


In [7]:
# create the iterators
BATCH_SIZE = 16

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device
    )

In [8]:
# Define the transformer model
class Transformer(nn.Module):
    def __init__(self, input_dim, emb_dim, nhead, hid_dim, n_layers, output_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.transformer = nn.Transformer(
            d_model=emb_dim,
            nhead=nhead,
            num_encoder_layers=n_layers,
            num_decoder_layers=n_layers,
            dim_feedforward=hid_dim,
            dropout=dropout
        )
        self.fc = nn.Linear(emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
      embedded = self.embedding(text)

      # If the input is sparse, convert it to a dense tensor
      if isinstance(embedded, torch.sparse.FloatTensor):
          embedded = embedded.to_dense()

      embedded = embedded.permute(1, 0, 2)

      output = self.transformer(embedded, embedded)
      output = output.mean(dim=0)
      output = self.fc(output).squeeze(dim=0)

      return output

In [9]:
# Initialize the model, optimizer, and loss function
INPUT_DIM = len(TEXT.vocab)
EMB_DIM = 100
NHEAD = 4
HID_DIM = 256
N_LAYERS = 2
OUTPUT_DIM = 1
DROPOUT = 0.5

model = Transformer(INPUT_DIM, EMB_DIM, NHEAD, HID_DIM, N_LAYERS, OUTPUT_DIM, DROPOUT)
optimizer = optim.Adam(model.parameters(), lr = .001)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
criterion = nn.BCEWithLogitsLoss()

# Send the model to GPU if available

model = model.to(device)
criterion = criterion.to(device)



In [14]:
# Training loop
# define the training function
def binary_accuracy(preds, y):
    threshold = 0.5
    binary_predictions = (preds > threshold).float()
    correct = (binary_predictions == y).float()
    #correct = (preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

def train(model, iterator, optimizer, criterion):


    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in tqdm(iterator):

        optimizer.zero_grad()


        text, text_lengths = batch.text

        predictions = model(text, text_lengths).squeeze(1)

        loss = criterion(predictions, batch.label.float())

        acc = binary_accuracy(predictions, batch.label.long())

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()


    return epoch_loss / len(iterator), epoch_acc / len(iterator)
# Evaluation loop
def evaluate(model, iterator, criterion):

        epoch_loss = 0
        epoch_acc = 0

        all_predictions = []
        all_labels = []


        model.eval()

        with torch.no_grad():

            for batch in tqdm(iterator):

                text, text_lengths = batch.text

                try:
                  predictions = model(text, text_lengths).squeeze(1)
                except:
                  predictions = model(text, text_lengths)
                loss = criterion(predictions, batch.label)

                acc = binary_accuracy(predictions, batch.label)

                epoch_loss += loss.item()
                epoch_acc += acc.item()

                # Convert probability scores to binary predictions using a threshold (e.g., 0.5)
                threshold = 0.5
                binary_predictions = (predictions > threshold).float()

                # calculate precision, recall and f1 score

                all_predictions.extend(binary_predictions.cpu().numpy())
                all_labels.extend(batch.label.long().cpu().numpy())


        precision = precision_score(all_labels, all_predictions, average='weighted', zero_division=True)
        recall = recall_score(all_labels, all_predictions, average='weighted', zero_division=True)
        f1 = f1_score(all_labels, all_predictions, average='weighted')


        return epoch_loss / len(iterator), epoch_acc / len(iterator), precision, recall, f1

def epoch_time(start_time, end_time):

        elapsed_time = end_time - start_time

        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

        return elapsed_mins, elapsed_secs

In [11]:
# train the model

N_EPOCHS = 4

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

            start_time = time.time()

            train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
            valid_loss, valid_acc, _, _, _ = evaluate(model, valid_iterator, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(), 'welfake-model.pt')

            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

# evaluate the model on the test set

model.load_state_dict(torch.load('welfake-model.pt'))

100%|██████████| 2492/2492 [13:34<00:00,  3.06it/s]
100%|██████████| 1068/1068 [00:50<00:00, 21.18it/s]


Epoch: 01 | Epoch Time: 14m 25s
	Train Loss: 0.198 | Train Acc: 91.81%
	 Val. Loss: 0.178 |  Val. Acc: 92.97%


100%|██████████| 2492/2492 [14:28<00:00,  2.87it/s]
100%|██████████| 1068/1068 [00:50<00:00, 21.30it/s]


Epoch: 02 | Epoch Time: 15m 18s
	Train Loss: 0.099 | Train Acc: 96.43%
	 Val. Loss: 0.093 |  Val. Acc: 97.24%


100%|██████████| 2492/2492 [14:40<00:00,  2.83it/s]
100%|██████████| 1068/1068 [00:49<00:00, 21.61it/s]


Epoch: 03 | Epoch Time: 15m 29s
	Train Loss: 0.105 | Train Acc: 96.24%
	 Val. Loss: 0.302 |  Val. Acc: 93.22%


100%|██████████| 2492/2492 [13:59<00:00,  2.97it/s]
100%|██████████| 1068/1068 [00:49<00:00, 21.37it/s]

Epoch: 04 | Epoch Time: 14m 49s
	Train Loss: 0.258 | Train Acc: 88.04%
	 Val. Loss: 0.733 |  Val. Acc: 50.40%





<All keys matched successfully>

In [15]:
# evaluate the model on the test set
test_loss, test_acc, precision, recall, f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Precision: {precision} | Recall: {recall} | F1: {f1}')

# save the results to a csv file
results = pd.DataFrame([[test_acc, precision, recall, f1]], columns=['accuracy', 'precision', 'recall', 'f1'], index=['Transformer'])
results.to_csv('results_WELFake_transformer.csv')

100%|██████████| 891/891 [00:47<00:00, 18.84it/s]

Test Loss: 0.093 | Test Acc: 97.15% | Precision: 0.97149313575598 | Recall: 0.9714907660978864 | F1: 0.9714564080718354



