In [None]:
!pip install gdown==4.6.0

!gdown --folder https://drive.google.com/drive/u/1/folders/15Wn46r7gidaiZbx2ArFYsd7rjYH4y7JM

!pip install torchtext==0.6.0

!pip install -U pip setuptools wheel

!pip install -U spacy==2.3.5

!python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import torch
from torchtext import data
from torchtext import datasets
import random
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
import datetime
import spacy
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import nltk
from nltk.tokenize import word_tokenize

In [None]:
# set the seed for reproducibility
SEED = 1234
# Seed every RNG the notebook imports (Python, NumPy, torch), not only torch.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


# set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

nlp = spacy.load("en_core_web_sm")
def tokenize_spacy(text):
    """Split ``text`` into a list of token strings using the spaCy English model.

    Only the tokenizer component is invoked. Calling ``nlp(text)`` would also
    run the remaining pipeline components on every statement even though only
    ``token.text`` is read, which is far slower for the same token strings.

    Args:
        text: Raw string to tokenize.

    Returns:
        List of token texts in document order.
    """
    return [token.text for token in nlp.tokenizer(text)]


# define the fields
# include_lengths=True makes `batch.text` a (token_ids, lengths) pair, which the
# training/evaluation loops unpack; batch_first=True puts the batch dimension first.
TEXT = data.Field(tokenize=tokenize_spacy, include_lengths=True, batch_first=True)
LABEL = data.LabelField(dtype=torch.float)

In [None]:
# load the liar dataset
csv_path_liar_train = os.path.join( '/content', 'liar_dataset', 'train.tsv')
csv_path_liar_test = os.path.join( '/content', 'liar_dataset', 'test.tsv')

# Column names assigned to the header-less TSV files, in file order.
LIAR_COLUMNS = [
    'id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party',
    'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context',
]


def _read_liar_split(tsv_path):
    """Read one LIAR TSV file and keep only complete (label, statement) rows."""
    frame = pd.read_csv(tsv_path, sep='\t', header=None)
    frame.columns = list(LIAR_COLUMNS)
    frame = frame[['label', 'statement']]
    return frame.dropna()


df_liar_train = _read_liar_split(csv_path_liar_train)
df_liar_test = _read_liar_split(csv_path_liar_test)

# save the train and test sets to csv files
for liar_frame, liar_csv_name in ((df_liar_train, 'train.csv'), (df_liar_test, 'test.csv')):
    liar_frame.to_csv(liar_csv_name, index=False)

In [None]:
# load the data
# The CSV files written above have a header row and the column order
# (label, statement), hence skip_header=True and the field order below.
train_data, test_data = data.TabularDataset.splits(
    path='./',
    train='train.csv',
    test='test.csv',
    format='csv',
    skip_header=True,
    fields=[('label', LABEL), ('text', TEXT)]
)

# split the train data into train and validation sets
# random.seed() returns None, so passing its result as random_state only worked
# through the side effect of seeding the global RNG. Seed explicitly and hand
# over the resulting generator state instead; the split itself is unchanged.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

# build the vocabulary
MAX_VOCAB_SIZE = 25_000

# NOTE(review): the 100-d GloVe vectors are attached to the vocabulary here, but
# no visible cell copies them into the model's embedding layer — confirm whether
# this download is still needed.
TEXT.build_vocab(train_data,
                    max_size=MAX_VOCAB_SIZE,
                    vectors="glove.6B.100d",
                    unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)



In [None]:
# create the iterators
BATCH_SIZE = 64


def _example_text_length(example):
    """Sort key for bucketing: length of the example's `text` field."""
    return len(example.text)


# One bucketed iterator per split, all sharing the same batch size and device.
_liar_iterators = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=_example_text_length,
    sort_within_batch=True,
    device=device,
)
train_iterator, valid_iterator, test_iterator = _liar_iterators

In [None]:
class CNN(nn.Module):
    """Convolutional text classifier: embedding -> parallel Conv1d -> max-over-time -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        num_filters: Output channels of every convolution.
        filter_sizes: Iterable of kernel widths; one Conv1d is created per width.
        output_dim: Number of output logits.
        dropout_prob: Dropout probability applied to the pooled features.
        pad_idx: Optional index of the padding token; when given, its embedding
            is kept at zero and excluded from gradient updates.

    Raises:
        ValueError: If ``filter_sizes`` is empty.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim,
                 dropout_prob=0.5, pad_idx=None):
        super(CNN, self).__init__()

        filter_sizes = list(filter_sizes)
        if not filter_sizes:
            raise ValueError("filter_sizes must contain at least one kernel size")

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.conv_layers = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=fs)
            for fs in filter_sizes
        ])

        # Shortest sequence every convolution can process without raising.
        self.min_seq_len = max(filter_sizes)

        # One pooled feature per filter per kernel width.
        total_filters = num_filters * len(filter_sizes)

        self.fc = nn.Linear(total_filters, output_dim)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        """Compute logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Float tensor of raw logits, shape (batch, output_dim).
        """
        embedded = self.embedding(x)

        # Conv1d expects channels second: (batch, embedding_dim, seq_len).
        embedded = embedded.permute(0, 2, 1)

        # Zero-pad sequences shorter than the widest kernel, which would
        # otherwise make Conv1d raise.
        shortfall = self.min_seq_len - embedded.size(2)
        if shortfall > 0:
            embedded = F.pad(embedded, (0, shortfall))

        # Apply convolutional layers and use ReLU activation
        conv_outputs = [F.relu(conv(embedded)) for conv in self.conv_layers]

        # Max-pooling over time
        pooled_outputs = [
            F.max_pool1d(conv_output, conv_output.size(2)).squeeze(2)
            for conv_output in conv_outputs
        ]

        # Concatenate the pooled features
        cat_features = torch.cat(pooled_outputs, dim=1)

        # Regularise the features, not the logits: dropout after the final layer
        # randomly zeroes/rescales the predictions themselves during training.
        cat_features = self.dropout(cat_features)

        # Final linear layer produces raw logits (no activation).
        return self.fc(cat_features)

In [None]:
# define the hyperparameters
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 64
N_FILTERS = 28
FILTER_SIZES = [2,3,4]
OUTPUT_DIM = 6
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# initialize the model
# Size the embedding table from the actual vocabulary instead of a hard-coded
# 25000: if the vocabulary is larger than that constant, token ids beyond it
# would index past the end of the embedding table.
model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

# define the optimizer and the loss function
optimizer = optim.Adam(model.parameters(), lr = 1e-3)
# Multi-class objective over the OUTPUT_DIM logits.
criterion = nn.CrossEntropyLoss()

# push the model to the device
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# define the accuracy function
def categorical_accuracy(preds, y):
    """Return the fraction of rows whose arg-max class equals the target.

    Args:
        preds: Score tensor with classes along dimension 1.
        y: Tensor of integer class indices, one per row of ``preds``.

    Returns:
        Zero-dimensional float tensor: correct predictions / ``y.shape[0]``.
    """
    predicted_classes = preds.argmax(dim=1, keepdim=True)
    targets = y.view_as(predicted_classes)
    num_correct = (predicted_classes == targets).sum()
    return num_correct.float() / y.shape[0]

# define the training function
def train(model, iterator, optimizer, criterion):
    """Run one training epoch for the multi-class model.

    Args:
        model: Module mapping a batch of token ids to class logits.
        iterator: Batch iterator whose batches expose ``text`` (a
            ``(token_ids, lengths)`` pair) and ``label``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss taking ``(logits, integer_targets)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``, each averaged over the batches.
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for batch in tqdm(iterator):
        token_ids, _lengths = batch.text  # lengths are not used by the CNN

        optimizer.zero_grad()

        logits = model(token_ids).squeeze(1)

        # Labels are stored as floats by the LABEL field; the loss needs class indices.
        loss = criterion(logits, batch.label.long())
        acc = categorical_accuracy(logits, batch.label.long())

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    num_batches = len(iterator)
    return running_loss / num_batches, running_acc / num_batches

# define the evaluation function

def evaluate(model, iterator, criterion):
    """Evaluate the multi-class model without updating its weights.

    Args:
        model: Module mapping a batch of token ids to class logits.
        iterator: Batch iterator whose batches expose ``text`` (a
            ``(token_ids, lengths)`` pair) and ``label``.
        criterion: Loss taking ``(logits, integer_targets)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy, precision, recall, f1)``. Loss and
        accuracy are averaged over batches; the other three are weighted
        scores computed over every example seen.
    """
    model.eval()

    running_loss = 0
    running_acc = 0
    predicted_classes = []
    true_classes = []

    with torch.no_grad():
        for batch in tqdm(iterator):
            token_ids, _lengths = batch.text

            logits = model(token_ids).squeeze(1)

            running_loss += criterion(logits, batch.label.long()).item()
            running_acc += categorical_accuracy(logits, batch.label.long()).item()

            # Collect hard predictions and targets for the corpus-level metrics.
            predicted_classes.extend(logits.argmax(1).cpu().numpy())
            true_classes.extend(batch.label.long().cpu().numpy())

    precision = precision_score(true_classes, predicted_classes, average='weighted', zero_division=True)
    recall = recall_score(true_classes, predicted_classes, average='weighted', zero_division=True)
    f1 = f1_score(true_classes, predicted_classes, average='weighted')

    num_batches = len(iterator)
    return running_loss / num_batches, running_acc / num_batches, precision, recall, f1
# define the function to calculate the time elapsed

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
# train the model

N_EPOCHS = 10

# Lowest validation loss seen so far; the checkpoint is rewritten whenever it improves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    # Only loss and accuracy are reported per epoch; the remaining metrics are discarded.
    valid_loss, valid_acc, _, _, _ = evaluate(model, valid_iterator, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'liar-model.pt')

    for report_line in (
        f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
    ):
        print(report_line)

100%|██████████| 112/112 [00:02<00:00, 44.96it/s]
100%|██████████| 48/48 [00:00<00:00, 250.71it/s]


Epoch: 01 | Epoch Time: 0m 2s
	Train Loss: 1.741 | Train Acc: 24.94%
	 Val. Loss: 1.767 |  Val. Acc: 21.09%


100%|██████████| 112/112 [00:02<00:00, 44.12it/s]
100%|██████████| 48/48 [00:00<00:00, 172.31it/s]


Epoch: 02 | Epoch Time: 0m 2s
	Train Loss: 1.701 | Train Acc: 28.45%
	 Val. Loss: 1.762 |  Val. Acc: 22.66%


100%|██████████| 112/112 [00:03<00:00, 32.65it/s]
100%|██████████| 48/48 [00:00<00:00, 268.16it/s]


Epoch: 03 | Epoch Time: 0m 3s
	Train Loss: 1.651 | Train Acc: 31.82%
	 Val. Loss: 1.765 |  Val. Acc: 21.74%


100%|██████████| 112/112 [00:02<00:00, 45.64it/s]
100%|██████████| 48/48 [00:00<00:00, 247.76it/s]


Epoch: 04 | Epoch Time: 0m 2s
	Train Loss: 1.611 | Train Acc: 34.43%
	 Val. Loss: 1.770 |  Val. Acc: 21.88%


100%|██████████| 112/112 [00:02<00:00, 47.06it/s]
100%|██████████| 48/48 [00:00<00:00, 256.91it/s]


Epoch: 05 | Epoch Time: 0m 2s
	Train Loss: 1.563 | Train Acc: 37.64%
	 Val. Loss: 1.770 |  Val. Acc: 20.96%


100%|██████████| 112/112 [00:02<00:00, 38.80it/s]
100%|██████████| 48/48 [00:00<00:00, 93.03it/s]


Epoch: 06 | Epoch Time: 0m 3s
	Train Loss: 1.509 | Train Acc: 40.35%
	 Val. Loss: 1.778 |  Val. Acc: 21.48%


100%|██████████| 112/112 [00:06<00:00, 16.61it/s]
100%|██████████| 48/48 [00:00<00:00, 242.09it/s]


Epoch: 07 | Epoch Time: 0m 6s
	Train Loss: 1.461 | Train Acc: 42.13%
	 Val. Loss: 1.792 |  Val. Acc: 21.68%


100%|██████████| 112/112 [00:02<00:00, 45.05it/s]
100%|██████████| 48/48 [00:00<00:00, 261.65it/s]


Epoch: 08 | Epoch Time: 0m 2s
	Train Loss: 1.398 | Train Acc: 45.77%
	 Val. Loss: 1.794 |  Val. Acc: 22.23%


100%|██████████| 112/112 [00:02<00:00, 46.54it/s]
100%|██████████| 48/48 [00:00<00:00, 249.62it/s]


Epoch: 09 | Epoch Time: 0m 2s
	Train Loss: 1.334 | Train Acc: 48.28%
	 Val. Loss: 1.808 |  Val. Acc: 21.74%


100%|██████████| 112/112 [00:02<00:00, 37.99it/s]
100%|██████████| 48/48 [00:00<00:00, 174.47it/s]


Epoch: 10 | Epoch Time: 0m 3s
	Train Loss: 1.278 | Train Acc: 50.35%
	 Val. Loss: 1.813 |  Val. Acc: 22.07%


In [None]:
# Restore the weights with the lowest validation loss. map_location lets a
# checkpoint written on a GPU runtime be loaded when only a CPU is available.
model.load_state_dict(torch.load('liar-model.pt', map_location=device))

test_loss, test_acc, precision, recall, f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Precision: {precision} | Recall: {recall} | F1: {f1}')

# save the results to a csv file
results = pd.DataFrame([[test_acc, precision, recall, f1]], columns=['accuracy', 'precision', 'recall', 'f1'], index=['CNN'])
results.to_csv('results_liar_cnn.csv')

NameError: name 'model' is not defined

In [None]:
# Download the shared Google Drive folder that contains WELFake_Dataset.csv
# (per the captured output it is saved under /content/WELFake/).
!gdown --folder https://drive.google.com/drive/u/1/folders/1wf7mFLCqQo0t802IDkZKMOinciUwohuR

Retrieving folder list
Processing file 11UvyoobnRVXsNkjCsRl848mdYN0Yi18K WELFake_Dataset.csv
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=11UvyoobnRVXsNkjCsRl848mdYN0Yi18K
To: /content/WELFake/WELFake_Dataset.csv
100% 245M/245M [00:04<00:00, 51.5MB/s]
Download completed


In [None]:
SEED = 1234
# Seed every RNG the notebook imports (Python, NumPy, torch), not only torch,
# consistent with the setup cell of the first experiment.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


# set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer data required by nltk.tokenize.word_tokenize.
nltk.download('punkt')
def tokenize(text):
    """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``.

    Args:
        text: Raw string to tokenize.

    Returns:
        List of token strings.
    """
    return word_tokenize(text)



# define the fields
# Fresh fields for the second dataset: include_lengths=True makes `batch.text`
# a (token_ids, lengths) pair, which the training/evaluation loops unpack.
TEXT = data.Field(tokenize=tokenize, include_lengths=True, unk_token='<unk>', batch_first=True)
LABEL = data.LabelField(dtype=torch.float)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# load the welfake dataset

csv_path_welfake = os.path.join( '/content', 'WELFake', 'WELFake_Dataset.csv')

welfake_raw = pd.read_csv(csv_path_welfake)

# Keep only the article body and its label, under the names the fields expect.
df = welfake_raw.drop(['Unnamed: 0', 'title'], axis=1)
df.columns = ['text', 'label']

# Map textual labels to integers (values that are already numeric are left untouched).
for label_name, label_id in (('fake', 0), ('real', 1)):
    df['label'] = df['label'].replace(label_name, label_id)

# Snapshot of the relabelled data, written before any rows are filtered out.
df.to_csv('.//welfake.csv', index=False)

# drop the rows with np.nan values on text column, then very short texts
df = df.dropna(subset=['text'])
long_enough = df['text'].str.len() > 30
df = df[long_enough]

# split the dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED)

# save the train and test sets as csv files
for split_frame, split_path in ((train_df, './/welfake_train.csv'), (test_df, './/welfake_test.csv')):
    split_frame.to_csv(split_path, index=False)

In [None]:
# load the data
# The CSV files were written with a header row ("text,label"); skip it so it is
# not ingested as an example with the bogus class "label".
train_data, test_data = data.TabularDataset.splits(
    path='./',
    train='welfake_train.csv',
    test='welfake_test.csv',
    format='csv',
    skip_header=True,
    fields=[('text', TEXT), ('label', LABEL)]
)

# split the train data into train and validation sets
# random.seed() returns None, so passing its result as random_state only worked
# through the side effect of seeding the global RNG. Seed explicitly and hand
# over the resulting generator state instead.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [None]:
# build the vocabulary
MAX_VOCAB_SIZE = 25_000

# Text vocabulary: capped in size, with pretrained vectors attached and
# normally-initialised vectors for tokens missing from the pretrained set.
text_vocab_options = dict(
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)
TEXT.build_vocab(train_data, **text_vocab_options)

# Label vocabulary is built from the training split only.
LABEL.build_vocab(train_data)



In [None]:
# create the iterators
BATCH_SIZE = 26


def _example_text_length(example):
    """Sort key for bucketing: length of the example's `text` field."""
    return len(example.text)


# One bucketed iterator per split, all sharing the same batch size and device.
_welfake_iterators = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=_example_text_length,
    sort_within_batch=True,
    device=device,
)
train_iterator, valid_iterator, test_iterator = _welfake_iterators

In [None]:
# define the hyperparameters
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 26
N_FILTERS = 28
FILTER_SIZES = [2,3,4]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Build the single-logit classifier and place it on the target device.
model = CNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    num_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout_prob=DROPOUT,
).to(device)

# Binary objective on raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss().to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# define the accuracy function
def binary_accuracy(preds, y):
    """Return the fraction of logits whose thresholded sigmoid matches the target.

    Args:
        preds: Tensor of raw logits.
        y: Tensor of 0/1 targets with the same shape as ``preds``.

    Returns:
        Zero-dimensional float tensor: matches divided by ``len`` of the
        comparison result (its first dimension).
    """
    probabilities = torch.sigmoid(preds)
    hard_predictions = probabilities.gt(.5).float()
    matches = hard_predictions.eq(y).float()
    return matches.sum() / len(matches)


# define the training function
def train(model, iterator, optimizer, criterion):
    """Run one training epoch for the single-logit binary model.

    Args:
        model: Module mapping a batch of token ids to one logit per example.
        iterator: Batch iterator whose batches expose ``text`` (a
            ``(token_ids, lengths)`` pair) and ``label``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss taking ``(logits, float_targets)`` of matching shape.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``, each averaged over the batches.
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for batch in tqdm(iterator):
        token_ids, _lengths = batch.text  # lengths are not used by the CNN

        optimizer.zero_grad()

        logits = model(token_ids)

        # Labels gain a trailing dimension so they line up with the model output.
        loss = criterion(logits, batch.label.unsqueeze(1))
        acc = binary_accuracy(logits, batch.label.unsqueeze(1))

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    num_batches = len(iterator)
    return running_loss / num_batches, running_acc / num_batches

# define the evaluation function

def evaluate(model, iterator, criterion):
    """Evaluate the single-logit binary model without updating its weights.

    Args:
        model: Module mapping a batch of token ids to one logit per example.
        iterator: Batch iterator whose batches expose ``text`` (a
            ``(token_ids, lengths)`` pair) and ``label``.
        criterion: Loss taking ``(logits, float_targets)`` of matching shape.

    Returns:
        Tuple ``(mean_loss, mean_accuracy, precision, recall, f1)``. Loss and
        accuracy are averaged over batches; the other three are weighted
        scores computed over every example seen.
    """
    epoch_loss = 0
    epoch_acc = 0
    all_predictions = []
    all_labels = []

    # Decision threshold on probabilities, the same rule binary_accuracy applies.
    threshold = 0.5

    model.eval()

    with torch.no_grad():

        for batch in tqdm(iterator):

            text, text_lengths = batch.text

            predictions = model(text)
            targets = batch.label.unsqueeze(1)

            loss = criterion(predictions, targets)
            acc = binary_accuracy(predictions, targets)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

            # The model emits raw logits, so convert to probabilities before
            # thresholding; comparing logits to 0.5 mislabels every example
            # whose logit lies between 0 and 0.5.
            probabilities = torch.sigmoid(predictions)
            # Flatten to one label per example so the metric functions receive
            # 1-D inputs.
            binary_predictions = (probabilities > threshold).long().view(-1)

            all_predictions.extend(binary_predictions.cpu().numpy())
            all_labels.extend(batch.label.long().cpu().numpy())

    precision = precision_score(all_labels, all_predictions, average='weighted', zero_division=True)
    recall = recall_score(all_labels, all_predictions, average='weighted', zero_division=True)
    f1 = f1_score(all_labels, all_predictions, average='weighted')

    return epoch_loss / len(iterator), epoch_acc / len(iterator), precision, recall, f1
# define the function to calculate the time elapsed

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
# train the model

N_EPOCHS = 4

# Lowest validation loss seen so far; the checkpoint is rewritten whenever it improves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    # Only loss and accuracy are reported per epoch; the remaining metrics are discarded.
    valid_loss, valid_acc, _, _, _ = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'welfake-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

# Restore the best checkpoint before evaluating on the test set. map_location
# lets a checkpoint written on a GPU runtime be loaded when only a CPU is available.
model.load_state_dict(torch.load('welfake-model.pt', map_location=device))




100%|██████████| 1534/1534 [00:41<00:00, 37.03it/s]
100%|██████████| 658/658 [00:07<00:00, 90.26it/s]


Epoch: 01 | Epoch Time: 0m 48s
	Train Loss: 0.352 | Train Acc: 75.38%
	 Val. Loss: 0.045 |  Val. Acc: 98.57%


100%|██████████| 1534/1534 [00:41<00:00, 36.80it/s]
100%|██████████| 658/658 [00:07<00:00, 86.86it/s]


Epoch: 02 | Epoch Time: 0m 49s
	Train Loss: 0.350 | Train Acc: 75.58%
	 Val. Loss: 0.046 |  Val. Acc: 98.50%


100%|██████████| 1534/1534 [00:42<00:00, 36.42it/s]
100%|██████████| 658/658 [00:07<00:00, 87.23it/s]


Epoch: 03 | Epoch Time: 0m 49s
	Train Loss: 0.350 | Train Acc: 75.47%
	 Val. Loss: 0.041 |  Val. Acc: 98.65%


100%|██████████| 1534/1534 [00:41<00:00, 36.57it/s]
100%|██████████| 658/658 [00:07<00:00, 86.45it/s]


Epoch: 04 | Epoch Time: 0m 49s
	Train Loss: 0.348 | Train Acc: 75.58%
	 Val. Loss: 0.039 |  Val. Acc: 98.75%


100%|██████████| 548/548 [00:06<00:00, 83.10it/s]


Test Loss: 0.037 | Test Acc: 98.71% | Precision: 0.9866243845061924 | Recall: 0.9865178007162418 | F1: 0.9864807774338483


In [None]:
# Score the restored model on the held-out test split.
test_metrics = evaluate(model, test_iterator, criterion)
test_loss, test_acc, precision, recall, f1 = test_metrics

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Precision: {precision} | Recall: {recall} | F1: {f1}')

# save the results to a csv file (a single row labelled with the model name)
results = pd.DataFrame(
    {
        'accuracy': [test_acc],
        'precision': [precision],
        'recall': [recall],
        'f1': [f1],
    },
    index=['CNN'],
)
results.to_csv('results_WELFake_cnn.csv')

100%|██████████| 548/548 [00:06<00:00, 85.20it/s]


Test Loss: 0.037 | Test Acc: 98.71% | Precision: 0.9866243845061924 | Recall: 0.9865178007162418 | F1: 0.9864807774338483
