In [None]:
!pip install gdown==4.6.0
!gdown --folder https://drive.google.com/drive/u/1/folders/15Wn46r7gidaiZbx2ArFYsd7rjYH4y7JM

!pip install torchtext==0.6.0

!pip install -U pip setuptools wheel

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import torch
from torchtext import data
from torchtext import datasets
import random
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
import datetime
import spacy
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import nltk
from nltk.tokenize import word_tokenize

In [None]:
# set the seed for reproducibility
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


# set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
def tokenize_spacy(text):
    return [token.text for token in nlp(text)]


# define the fields
TEXT = data.Field(tokenize=tokenize_spacy, include_lengths=True, unk_token='<unk>')
LABEL = data.LabelField(dtype=torch.float)

In [None]:
# load the liar dataset
csv_path_liar_train = os.path.join( '/content', 'liar_dataset', 'train.tsv')
csv_path_liar_test = os.path.join( '/content', 'liar_dataset', 'test.tsv')

df_liar_train = pd.read_csv(csv_path_liar_train, sep='\t', header=None)
df_liar_test = pd.read_csv(csv_path_liar_test, sep='\t', header=None)

df_liar_train.columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']
df_liar_test.columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']

df_liar_train = df_liar_train[['label', 'statement']]
df_liar_test = df_liar_test[['label', 'statement']]
df_liar_train = df_liar_train.dropna()
df_liar_test = df_liar_test.dropna()

# save the train and test sets to csv files
df_liar_train.to_csv('train.csv', index=False)
df_liar_test.to_csv('test.csv', index=False)

In [None]:
# load the data
train_data, test_data = data.TabularDataset.splits(
    path='./',
    train='train.csv',
    test='test.csv',
    format='csv',
    skip_header=True,
    fields=[('label', LABEL), ('text', TEXT)]
)

# split the train data into train and validation sets

train_data, valid_data = train_data.split(random_state=random.seed(SEED))

# build the vocabulary
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data,
                    max_size=MAX_VOCAB_SIZE,
                    vectors="glove.6B.100d",
                    unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)



.vector_cache/glove.6B.zip: 862MB [02:38, 5.43MB/s]                           
100%|█████████▉| 399999/400000 [00:21<00:00, 18379.85it/s]


In [None]:
# create the iterators
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device
    )

In [None]:
# define the model

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout)

        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        self.dropout = nn.Dropout(dropout)
        self.output_dim = output_dim

    def forward(self, text, text_lengths):

        embedded = self.dropout(self.embedding(text))


        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'), enforce_sorted=False)

        packed_output, (hidden, cell) = self.rnn(packed_embedded)

        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))

        return self.fc(hidden)

In [None]:
# define the hyperparameters
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 6
N_LAYERS = 4
BIDIRECTIONAL = True
DROPOUT = 0.7
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# initialize the model
model = RNN(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)

# define the optimizer and the loss function
optimizer = optim.Adam(model.parameters(), lr = 0.0001)
criterion = nn.CrossEntropyLoss()

# push the model to the device
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# define the accuracy function
def categorical_accuracy(preds, y):
    top_pred = preds.argmax(1, keepdim=True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    return correct.float() / y.shape[0]

# define the training function
def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in tqdm(iterator):

        optimizer.zero_grad()


        text, text_lengths = batch.text

        predictions = model(text, text_lengths).squeeze(1)

        loss = criterion(predictions, batch.label.long())

        acc = categorical_accuracy(predictions, batch.label.long())

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# define the evaluation function

def evaluate(model, iterator, criterion):

        epoch_loss = 0
        epoch_acc = 0

        all_predictions = []
        all_labels = []

        model.eval()

        with torch.no_grad():

            for batch in tqdm(iterator):

                text, text_lengths = batch.text

                predictions = model(text, text_lengths).squeeze(1)

                loss = criterion(predictions, batch.label.long())

                acc = categorical_accuracy(predictions, batch.label.long())

                epoch_loss += loss.item()
                epoch_acc += acc.item()

                # calculate precision, recall and f1 score
                y_pred = predictions.argmax(1, keepdim=True)
                y_pred = y_pred.squeeze(1)
                y_true = batch.label.long()
                #y_true = y_true.squeeze(1)

                all_predictions.extend(y_pred.cpu().numpy())
                all_labels.extend(batch.label.long().cpu().numpy())

        precision = precision_score(all_labels, all_predictions, average='weighted', zero_division=True)
        recall = recall_score(all_labels, all_predictions, average='weighted', zero_division=True)
        f1 = f1_score(all_labels, all_predictions, average='weighted')


        return epoch_loss / len(iterator), epoch_acc / len(iterator), precision, recall, f1
# define the function to calculate the time elapsed

def epoch_time(start_time, end_time):

        elapsed_time = end_time - start_time

        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

        return elapsed_mins, elapsed_secs

In [None]:
# train the model

N_EPOCHS = 20

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

        start_time = time.time()

        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        valid_loss, valid_acc, _, _, _ = evaluate(model, valid_iterator, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'liar-model.pt')

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

100%|██████████| 112/112 [00:04<00:00, 25.90it/s]
100%|██████████| 48/48 [00:00<00:00, 103.22it/s]


Epoch: 01 | Epoch Time: 0m 4s
	Train Loss: 1.745 | Train Acc: 21.19%
	 Val. Loss: 1.732 |  Val. Acc: 22.10%


100%|██████████| 112/112 [00:02<00:00, 40.15it/s]
100%|██████████| 48/48 [00:00<00:00, 125.31it/s]


Epoch: 02 | Epoch Time: 0m 3s
	Train Loss: 1.740 | Train Acc: 21.90%
	 Val. Loss: 1.731 |  Val. Acc: 22.88%


100%|██████████| 112/112 [00:02<00:00, 43.52it/s]
100%|██████████| 48/48 [00:00<00:00, 127.59it/s]


Epoch: 03 | Epoch Time: 0m 2s
	Train Loss: 1.742 | Train Acc: 22.36%
	 Val. Loss: 1.731 |  Val. Acc: 23.54%


100%|██████████| 112/112 [00:02<00:00, 42.60it/s]
100%|██████████| 48/48 [00:00<00:00, 115.55it/s]


Epoch: 04 | Epoch Time: 0m 3s
	Train Loss: 1.741 | Train Acc: 22.13%
	 Val. Loss: 1.729 |  Val. Acc: 22.95%


100%|██████████| 112/112 [00:02<00:00, 41.15it/s]
100%|██████████| 48/48 [00:00<00:00, 120.28it/s]


Epoch: 05 | Epoch Time: 0m 3s
	Train Loss: 1.736 | Train Acc: 22.68%
	 Val. Loss: 1.729 |  Val. Acc: 22.88%


100%|██████████| 112/112 [00:02<00:00, 42.19it/s]
100%|██████████| 48/48 [00:00<00:00, 124.32it/s]


Epoch: 06 | Epoch Time: 0m 3s
	Train Loss: 1.736 | Train Acc: 22.80%
	 Val. Loss: 1.728 |  Val. Acc: 23.11%


100%|██████████| 112/112 [00:02<00:00, 43.29it/s]
100%|██████████| 48/48 [00:00<00:00, 123.98it/s]


Epoch: 07 | Epoch Time: 0m 3s
	Train Loss: 1.734 | Train Acc: 22.35%
	 Val. Loss: 1.730 |  Val. Acc: 23.21%


100%|██████████| 112/112 [00:02<00:00, 43.09it/s]
100%|██████████| 48/48 [00:00<00:00, 116.18it/s]


Epoch: 08 | Epoch Time: 0m 3s
	Train Loss: 1.730 | Train Acc: 23.69%
	 Val. Loss: 1.743 |  Val. Acc: 23.27%


100%|██████████| 112/112 [00:02<00:00, 41.10it/s]
100%|██████████| 48/48 [00:00<00:00, 123.61it/s]


Epoch: 09 | Epoch Time: 0m 3s
	Train Loss: 1.732 | Train Acc: 23.14%
	 Val. Loss: 1.724 |  Val. Acc: 24.45%


100%|██████████| 112/112 [00:02<00:00, 43.88it/s]
100%|██████████| 48/48 [00:00<00:00, 122.82it/s]


Epoch: 10 | Epoch Time: 0m 2s
	Train Loss: 1.723 | Train Acc: 24.00%
	 Val. Loss: 1.732 |  Val. Acc: 23.47%


100%|██████████| 112/112 [00:02<00:00, 43.35it/s]
100%|██████████| 48/48 [00:00<00:00, 123.09it/s]


Epoch: 11 | Epoch Time: 0m 3s
	Train Loss: 1.725 | Train Acc: 24.07%
	 Val. Loss: 1.724 |  Val. Acc: 23.47%


100%|██████████| 112/112 [00:02<00:00, 43.00it/s]
100%|██████████| 48/48 [00:00<00:00, 109.54it/s]


Epoch: 12 | Epoch Time: 0m 3s
	Train Loss: 1.723 | Train Acc: 23.84%
	 Val. Loss: 1.726 |  Val. Acc: 23.08%


100%|██████████| 112/112 [00:02<00:00, 40.29it/s]
100%|██████████| 48/48 [00:00<00:00, 121.09it/s]


Epoch: 13 | Epoch Time: 0m 3s
	Train Loss: 1.718 | Train Acc: 24.53%
	 Val. Loss: 1.726 |  Val. Acc: 23.37%


100%|██████████| 112/112 [00:02<00:00, 41.69it/s]
100%|██████████| 48/48 [00:00<00:00, 117.80it/s]


Epoch: 14 | Epoch Time: 0m 3s
	Train Loss: 1.717 | Train Acc: 25.14%
	 Val. Loss: 1.723 |  Val. Acc: 23.70%


100%|██████████| 112/112 [00:02<00:00, 42.28it/s]
100%|██████████| 48/48 [00:00<00:00, 119.08it/s]


Epoch: 15 | Epoch Time: 0m 3s
	Train Loss: 1.710 | Train Acc: 24.94%
	 Val. Loss: 1.732 |  Val. Acc: 23.37%


100%|██████████| 112/112 [00:02<00:00, 42.42it/s]
100%|██████████| 48/48 [00:00<00:00, 114.89it/s]


Epoch: 16 | Epoch Time: 0m 3s
	Train Loss: 1.709 | Train Acc: 25.63%
	 Val. Loss: 1.729 |  Val. Acc: 23.80%


100%|██████████| 112/112 [00:02<00:00, 38.55it/s]
100%|██████████| 48/48 [00:00<00:00, 119.01it/s]


Epoch: 17 | Epoch Time: 0m 3s
	Train Loss: 1.707 | Train Acc: 25.27%
	 Val. Loss: 1.733 |  Val. Acc: 23.57%


100%|██████████| 112/112 [00:02<00:00, 42.21it/s]
100%|██████████| 48/48 [00:00<00:00, 118.40it/s]


Epoch: 18 | Epoch Time: 0m 3s
	Train Loss: 1.700 | Train Acc: 26.31%
	 Val. Loss: 1.737 |  Val. Acc: 23.89%


100%|██████████| 112/112 [00:02<00:00, 41.45it/s]
100%|██████████| 48/48 [00:00<00:00, 118.93it/s]


Epoch: 19 | Epoch Time: 0m 3s
	Train Loss: 1.705 | Train Acc: 25.35%
	 Val. Loss: 1.750 |  Val. Acc: 23.89%


100%|██████████| 112/112 [00:02<00:00, 41.53it/s]
100%|██████████| 48/48 [00:00<00:00, 119.10it/s]

Epoch: 20 | Epoch Time: 0m 3s
	Train Loss: 1.702 | Train Acc: 26.07%
	 Val. Loss: 1.738 |  Val. Acc: 23.44%





In [None]:
model.load_state_dict(torch.load('liar-model.pt'))

test_loss, test_acc, precision, recall, f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Precision: {precision} | Recall: {recall} | F1: {f1}')

# save the results to a csv file
results = pd.DataFrame([[test_acc, precision, recall, f1]], columns=['accuracy', 'precision', 'recall', 'f1'], index=['RNN'])
results.to_csv('results_liar_rnn.csv')

100%|██████████| 20/20 [00:00<00:00, 57.54it/s]

Test Loss: 1.717 | Test Acc: 24.23% | Precision: 0.2299107228945637 | Recall: 0.24072612470402527 | F1: 0.20934509010732358





In [None]:
!gdown --folder https://drive.google.com/drive/u/1/folders/1wf7mFLCqQo0t802IDkZKMOinciUwohuR

Retrieving folder list
Processing file 11UvyoobnRVXsNkjCsRl848mdYN0Yi18K WELFake_Dataset.csv
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=11UvyoobnRVXsNkjCsRl848mdYN0Yi18K
To: /content/WELFake/WELFake_Dataset.csv
100% 245M/245M [00:09<00:00, 24.6MB/s]
Download completed


In [None]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


# set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



nltk.download('punkt')
def tokenize(text):
    return word_tokenize(text)



# define the fields
TEXT = data.Field(tokenize=tokenize, include_lengths=True, unk_token='<unk>')
LABEL = data.LabelField(dtype=torch.float)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# load the welfake dataset

csv_path_welfake = os.path.join( '/content', 'WELFake', 'WELFake_Dataset.csv')

df = pd.read_csv(csv_path_welfake)
df = df.drop(['Unnamed: 0', 'title'], axis=1)
df.columns = ['text', 'label']
df['label'] = df['label'].replace('fake', 0)
df['label'] = df['label'].replace('real', 1)
df.to_csv('.//welfake.csv', index=False)
# drop the rows with np.nan values on text column
df = df.dropna(subset=['text'])
df = df[df['text'].str.len() > 30]

# split the dataset into train, validation and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED)

# save the train, validation and test sets as csv files
train_df.to_csv('.//welfake_train.csv', index=False)
test_df.to_csv('.//welfake_test.csv', index=False)

In [None]:
# load the data
train_data, test_data = data.TabularDataset.splits(
    path='./',
    train='welfake_train.csv',
    test='welfake_test.csv',
    format='csv',
    fields=[('text', TEXT), ('label', LABEL)]
)

# split the train data into train and validation sets

train_data, valid_data = train_data.split(random_state=random.seed(SEED))

In [None]:
# build the vocabulary
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data,
                    max_size=MAX_VOCAB_SIZE,
                    vectors="glove.6B.100d",
                    unk_init=torch.Tensor.normal_)

LABEL.build_vocab(train_data)



.vector_cache/glove.6B.zip: 862MB [02:39, 5.42MB/s]                           
100%|█████████▉| 399999/400000 [00:21<00:00, 18238.03it/s]


In [None]:
# create the iterators
BATCH_SIZE = 26

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device
    )

In [None]:
# define the hyperparameters
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 4
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# initialize the model
model = RNN(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)

# define the optimizer and the loss function
optimizer = optim.Adam(model.parameters(), lr = 0.001)
criterion = nn.BCEWithLogitsLoss()

# push the model to the device
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# define the accuracy function
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc


# define the training function
def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in tqdm(iterator):

        optimizer.zero_grad()


        text, text_lengths = batch.text

        # convert tensor to actual text

        if any(length <= 0 for length in text_lengths):
            print("Skipping batch with zero or negative sequence length.")
            continue

        #try:
        #print(text_lengths)
        predictions = model(text, text_lengths).squeeze(1)
        #except:
        #  text_debug = [TEXT.vocab.itos[i] for i in text[:,0]]
        #  print(text_debug)
        #print(text_lengths)
        loss = criterion(predictions, batch.label)

        acc = binary_accuracy(predictions, batch.label.long())

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# define the evaluation function

def evaluate(model, iterator, criterion):

        epoch_loss = 0
        epoch_acc = 0
        all_predictions = []
        all_labels = []

        model.eval()

        with torch.no_grad():

            for batch in tqdm(iterator):

                text, text_lengths = batch.text

                predictions = model(text, text_lengths).squeeze(1)

                loss = criterion(predictions, batch.label)

                acc = binary_accuracy(predictions, batch.label.long())

                epoch_loss += loss.item()
                epoch_acc += acc.item()

                # Convert probability scores to binary predictions using a threshold (e.g., 0.5)
                threshold = 0.5
                binary_predictions = (predictions > threshold).float()

                # calculate precision, recall and f1 score

                all_predictions.extend(binary_predictions.cpu().numpy())
                all_labels.extend(batch.label.long().cpu().numpy())

        precision = precision_score(all_labels, all_predictions, average='weighted', zero_division=True)
        recall = recall_score(all_labels, all_predictions, average='weighted', zero_division=True)
        f1 = f1_score(all_labels, all_predictions, average='weighted')

        return epoch_loss / len(iterator), epoch_acc / len(iterator), precision, recall, f1
# define the function to calculate the time elapsed

def epoch_time(start_time, end_time):

        elapsed_time = end_time - start_time

        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

        return elapsed_mins, elapsed_secs

In [None]:
# train the model

N_EPOCHS = 4

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

            start_time = time.time()

            train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
            valid_loss, valid_acc, _, _, _ = evaluate(model, valid_iterator, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(), 'welfake-model.pt')

            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

# evaluate the model on the test set

model.load_state_dict(torch.load('welfake-model.pt'))

test_loss, test_acc, precision, recall, f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Precision: {precision} | Recall: {recall} | F1: {f1}')



# save the results to a csv file
results = pd.DataFrame([[test_acc, precision, recall, f1]], columns=['accuracy', 'precision', 'recall', 'f1'], index=['RNN'])
results.to_csv('results_WELFake_rnn.csv')


100%|██████████| 1534/1534 [07:02<00:00,  3.63it/s]
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 658/658 [01:10<00:00,  9.34it/s]


Epoch: 01 | Epoch Time: 8m 13s
	Train Loss: 0.118 | Train Acc: 95.14%
	 Val. Loss: 0.073 |  Val. Acc: 97.44%


100%|██████████| 1534/1534 [07:00<00:00,  3.65it/s]
100%|██████████| 658/658 [01:07<00:00,  9.81it/s]


Epoch: 02 | Epoch Time: 8m 7s
	Train Loss: 0.076 | Train Acc: 97.16%
	 Val. Loss: 0.040 |  Val. Acc: 98.61%


100%|██████████| 1534/1534 [07:01<00:00,  3.64it/s]
100%|██████████| 658/658 [01:07<00:00,  9.79it/s]


Epoch: 03 | Epoch Time: 8m 8s
	Train Loss: 0.044 | Train Acc: 98.42%
	 Val. Loss: 0.047 |  Val. Acc: 98.20%


100%|██████████| 1534/1534 [07:02<00:00,  3.63it/s]
100%|██████████| 658/658 [01:10<00:00,  9.39it/s]


Epoch: 04 | Epoch Time: 8m 13s
	Train Loss: 0.035 | Train Acc: 98.72%
	 Val. Loss: 0.029 |  Val. Acc: 98.98%


  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 548/548 [00:59<00:00,  9.22it/s]

Test Loss: 0.034 | Test Acc: 98.77% | Precision: 0.8947368421052632 | Recall: 0.8947368421052632 | F1: 0.8947368421052632





In [None]:
test_loss, test_acc, precision, recall, f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Precision: {precision} | Recall: {recall} | F1: {f1}')
# save the results to a csv file
results = pd.DataFrame([[test_acc, precision, recall, f1]], columns=['accuracy', 'precision', 'recall', 'f1'], index=['RNN'])
results.to_csv('results_WELFake_rnn.csv')

100%|██████████| 548/548 [00:55<00:00,  9.85it/s]

Test Loss: 0.034 | Test Acc: 98.77% | Precision: 0.9882926794849918 | Recall: 0.9882732954146478 | F1: 0.9882390721372001





In [16]:
# load the liar dataset
csv_path_liar_train = os.path.join( '/content', 'liar_dataset', 'train.tsv')
csv_path_liar_test = os.path.join( '/content', 'liar_dataset', 'test.tsv')

df_liar_train = pd.read_csv(csv_path_liar_train, sep='\t', header=None)
df_liar_test = pd.read_csv(csv_path_liar_test, sep='\t', header=None)

df_liar_train.columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']
df_liar_test.columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']

df_liar_train = df_liar_train[['label', 'statement']]
df_liar_test = df_liar_test[['label', 'statement']]
df_liar_train = df_liar_train.dropna()
df_liar_test = df_liar_test.dropna()
df_liar_train['label'] = df_liar_train['label'].replace(['mostly-true'], 'true')
df_liar_test['label'] = df_liar_test['label'].replace(['mostly-true'], 'true')
df_liar_train['label'] = df_liar_train['label'].replace(['half-true'], 'true')
df_liar_test['label'] = df_liar_test['label'].replace(['half-true'], 'true')
df_liar_train['label'] = df_liar_train['label'].replace(['barely-true'], 'false')
df_liar_test['label'] = df_liar_test['label'].replace(['barely-true'], 'false')
df_liar_train['label'] = df_liar_train['label'].replace(['pants-fire'], 'false')
df_liar_test['label'] = df_liar_test['label'].replace(['pants-fire'], 'false')



# save the train and test sets to csv files
df_liar_train.to_csv('train.csv', index=False)
df_liar_test.to_csv('test.csv', index=False)

In [17]:
df_liar_train['label'].unique()

array(['false', 'true'], dtype=object)

In [18]:
# set the seed for reproducibility
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


# set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
def tokenize_spacy(text):
    return [token.text for token in nlp(text)]


# define the fields
TEXT = data.Field(tokenize=tokenize_spacy, include_lengths=True, unk_token='<unk>')
LABEL = data.LabelField(dtype=torch.float)

In [19]:
# load the data
train_data, test_data = data.TabularDataset.splits(
    path='./',
    train='train.csv',
    test='test.csv',
    format='csv',
    skip_header=True,
    fields=[('label', LABEL), ('text', TEXT)]
)

# split the train data into train and validation sets

train_data, valid_data = train_data.split(random_state=random.seed(SEED))

# build the vocabulary
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data,
                    max_size=MAX_VOCAB_SIZE,
                    vectors="glove.6B.100d",
                    unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)



In [20]:
# create the iterators
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device
    )

In [21]:
# define the model

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout)

        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        self.dropout = nn.Dropout(dropout)
        self.output_dim = output_dim

    def forward(self, text, text_lengths):

        embedded = self.dropout(self.embedding(text))


        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'), enforce_sorted=False)

        packed_output, (hidden, cell) = self.rnn(packed_embedded)

        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))

        return self.fc(hidden)

In [36]:
# define the hyperparameters
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 4
BIDIRECTIONAL = True
DROPOUT = 0.6
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# initialize the model
model = RNN(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)

# define the optimizer and the loss function
optimizer = optim.Adam(model.parameters(), lr = 0.0001)
criterion = nn.BCEWithLogitsLoss()

# push the model to the device
model = model.to(device)
criterion = criterion.to(device)

In [23]:
# define the accuracy function
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc


# define the training function
def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in tqdm(iterator):

        optimizer.zero_grad()


        text, text_lengths = batch.text

        # convert tensor to actual text

        if any(length <= 0 for length in text_lengths):
            print("Skipping batch with zero or negative sequence length.")
            continue

        predictions = model(text, text_lengths).squeeze(1)

        loss = criterion(predictions, batch.label)

        acc = binary_accuracy(predictions, batch.label.long())

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# define the evaluation function

def evaluate(model, iterator, criterion):

        epoch_loss = 0
        epoch_acc = 0
        all_predictions = []
        all_labels = []

        model.eval()

        with torch.no_grad():

            for batch in tqdm(iterator):

                text, text_lengths = batch.text

                predictions = model(text, text_lengths).squeeze(1)

                loss = criterion(predictions, batch.label)

                acc = binary_accuracy(predictions, batch.label.long())

                epoch_loss += loss.item()
                epoch_acc += acc.item()

                # Convert probability scores to binary predictions using a threshold (e.g., 0.5)
                threshold = 0.5
                binary_predictions = (predictions > threshold).float()

                # calculate precision, recall and f1 score

                all_predictions.extend(binary_predictions.cpu().numpy())
                all_labels.extend(batch.label.long().cpu().numpy())

        precision = precision_score(all_labels, all_predictions,  zero_division=True)
        recall = recall_score(all_labels, all_predictions,  zero_division=True)
        f1 = f1_score(all_labels, all_predictions)

        return epoch_loss / len(iterator), epoch_acc / len(iterator), precision, recall, f1
# define the function to calculate the time elapsed

def epoch_time(start_time, end_time):

        elapsed_time = end_time - start_time

        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

        return elapsed_mins, elapsed_secs

In [37]:
# train the model

N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

            start_time = time.time()

            train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
            valid_loss, valid_acc, _, _, _ = evaluate(model, valid_iterator, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(), 'welfake-model.pt')

            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

# evaluate the model on the test set

model.load_state_dict(torch.load('welfake-model.pt'))

test_loss, test_acc, precision, recall, f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Precision: {precision} | Recall: {recall} | F1: {f1}')



# save the results to a csv file
results = pd.DataFrame([[test_acc, precision, recall, f1]], columns=['accuracy', 'precision', 'recall', 'f1'], index=['RNN'])
results.to_csv('results_WELFake_rnn.csv')


100%|██████████| 112/112 [00:02<00:00, 38.13it/s]
100%|██████████| 48/48 [00:00<00:00, 109.17it/s]


Epoch: 01 | Epoch Time: 0m 3s
	Train Loss: 0.687 | Train Acc: 56.04%
	 Val. Loss: 0.684 |  Val. Acc: 56.51%


100%|██████████| 112/112 [00:03<00:00, 37.13it/s]
100%|██████████| 48/48 [00:00<00:00, 107.78it/s]


Epoch: 02 | Epoch Time: 0m 3s
	Train Loss: 0.685 | Train Acc: 56.03%
	 Val. Loss: 0.682 |  Val. Acc: 56.51%


100%|██████████| 112/112 [00:02<00:00, 40.02it/s]
100%|██████████| 48/48 [00:00<00:00, 117.64it/s]


Epoch: 03 | Epoch Time: 0m 3s
	Train Loss: 0.682 | Train Acc: 56.29%
	 Val. Loss: 0.674 |  Val. Acc: 57.16%


100%|██████████| 112/112 [00:02<00:00, 40.07it/s]
100%|██████████| 48/48 [00:00<00:00, 120.06it/s]


Epoch: 04 | Epoch Time: 0m 3s
	Train Loss: 0.676 | Train Acc: 57.65%
	 Val. Loss: 0.669 |  Val. Acc: 58.72%


100%|██████████| 112/112 [00:02<00:00, 39.37it/s]
100%|██████████| 48/48 [00:00<00:00, 121.47it/s]


Epoch: 05 | Epoch Time: 0m 3s
	Train Loss: 0.672 | Train Acc: 57.97%
	 Val. Loss: 0.667 |  Val. Acc: 59.51%


100%|██████████| 112/112 [00:02<00:00, 38.42it/s]
100%|██████████| 48/48 [00:00<00:00, 118.76it/s]


Epoch: 06 | Epoch Time: 0m 3s
	Train Loss: 0.672 | Train Acc: 58.50%
	 Val. Loss: 0.664 |  Val. Acc: 59.18%


100%|██████████| 112/112 [00:02<00:00, 39.94it/s]
100%|██████████| 48/48 [00:00<00:00, 117.44it/s]


Epoch: 07 | Epoch Time: 0m 3s
	Train Loss: 0.667 | Train Acc: 58.83%
	 Val. Loss: 0.662 |  Val. Acc: 59.57%


100%|██████████| 112/112 [00:02<00:00, 40.00it/s]
100%|██████████| 48/48 [00:00<00:00, 118.78it/s]


Epoch: 08 | Epoch Time: 0m 3s
	Train Loss: 0.664 | Train Acc: 60.04%
	 Val. Loss: 0.666 |  Val. Acc: 58.50%


100%|██████████| 112/112 [00:02<00:00, 39.59it/s]
100%|██████████| 48/48 [00:00<00:00, 109.57it/s]


Epoch: 09 | Epoch Time: 0m 3s
	Train Loss: 0.661 | Train Acc: 59.86%
	 Val. Loss: 0.658 |  Val. Acc: 60.35%


100%|██████████| 112/112 [00:02<00:00, 39.41it/s]
100%|██████████| 48/48 [00:00<00:00, 116.79it/s]


Epoch: 10 | Epoch Time: 0m 3s
	Train Loss: 0.661 | Train Acc: 60.59%
	 Val. Loss: 0.657 |  Val. Acc: 60.68%


100%|██████████| 20/20 [00:00<00:00, 79.67it/s] 

Test Loss: 0.658 | Test Acc: 60.89% | Precision: 0.6078431372549019 | Recall: 0.05605786618444846 | F1: 0.10264900662251657





In [31]:
test_loss, test_acc, precision, recall, f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Precision: {precision} | Recall: {recall} | F1: {f1}')
# save the results to a csv file
results = pd.DataFrame([[test_acc, precision, recall, f1]], columns=['accuracy', 'precision', 'recall', 'f1'], index=['RNN'])
results.to_csv('results_LIAR_BINARY_rnn.csv')

100%|██████████| 20/20 [00:00<00:00, 71.29it/s] 

Test Loss: 0.669 | Test Acc: 58.63% | Precision: 0.5652173913043478 | Recall: 0.0705244122965642 | F1: 0.12540192926045018



