<a href="https://colab.research.google.com/github/Anvians/Deep-Learning/blob/main/FakeNewsDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install datasets

In [24]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch.nn as nn

In [25]:
# Fetch the "liar" dataset through `load_dataset`; trust_remote_code=True lets the
# loader run the dataset's own loading script.
dataset = load_dataset(
    "liar",
    trust_remote_code=True,
)

In [26]:
# Pretrained tokenizer for the 'bert-base-uncased' checkpoint; used below by
# `tokenize_function` and to size the embedding table.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the 'statement' field of a batch of examples.

    Calls the module-level `tokenizer` with padding set to "max_length" and
    truncation enabled, and returns whatever the tokenizer returns.
    """
    statements = examples['statement']
    encoded = tokenizer(statements, truncation=True, padding="max_length")
    return encoded

In [27]:
# Tokenize the train and test splits batch-wise; the raw "statement" column is
# removed from the mapped datasets.
train_dataset = dataset['train'].map(
    tokenize_function,
    batched=True,
    remove_columns=["statement"],
)
test_dataset = dataset['test'].map(
    tokenize_function,
    batched=True,
    remove_columns=["statement"],
)

In [28]:
# Display the (rows, columns) shape of each split of the loaded dataset.
dataset.shape

{'train': (10269, 14), 'test': (1283, 14), 'validation': (1284, 14)}

In [29]:
# Size reported by the tokenizer object itself (compare with `vocab_size` below).
len(tokenizer)

30522

30522

In [33]:
# Vocabulary size attribute; later used as the number of rows in the embedding table.
tokenizer.vocab_size

30522

In [30]:
# Number of training examples (one statement per row). `len()` on the split
# avoids pulling the whole 'statement' column into memory just to count it.
len(dataset['train'])

10269

### Building my own Transformer

In [54]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

**Embedding Layer**

In [49]:
# Learnable lookup table: one 768-dimensional vector per tokenizer vocabulary id.
embedding_layer = nn.Embedding(
    num_embeddings=tokenizer.vocab_size,
    embedding_dim=768,
)

In [52]:
# Embed a single example to inspect the tensor shapes. The previous loop embedded
# every training example and kept only the last result; indexing the last row
# directly produces the same `idss`, `input_ids` and `input_embedding`.
idss = train_dataset[-1]['input_ids']
input_ids = torch.tensor(idss).unsqueeze(0)  # add a leading batch dimension of 1
input_embedding = embedding_layer(input_ids)

input_embedding.shape

torch.Size([1, 512, 768])

**Positional Encoding**

In [63]:
class Positional_encoding(nn.Module):
  """Adds fixed sinusoidal position encodings to a batch of embeddings.

  The table follows the standard Transformer formulation:

      PE[pos, 2i]     = sin(pos / 10000^(2i / embedding_dim))
      PE[pos, 2i + 1] = cos(pos / 10000^(2i / embedding_dim))

  Args:
    embedding_dim: size of the last (feature) dimension of the embeddings.
    max_seq_length: number of positions the precomputed table covers.

  Attributes:
    encoding: tensor of shape (1, max_seq_length, embedding_dim), registered
      as a non-persistent buffer (moves with the module, absent from state_dict).
  """

  def __init__(self, embedding_dim, max_seq_length):
    super().__init__()
    position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
    # div_term[i] == 10000^(-2i / embedding_dim): this is already the reciprocal
    # of the wavelength term, so it is MULTIPLIED with the position. Dividing by
    # it would give pos * 10000^(2i / embedding_dim), i.e. the wrong frequencies.
    div_term = torch.exp(
        torch.arange(0, embedding_dim, 2).float()
        * (-torch.log(torch.tensor(10000.0)) / embedding_dim)
    )
    angles = position * div_term

    encoding = torch.zeros(max_seq_length, embedding_dim)
    encoding[:, 0::2] = torch.sin(angles)
    # With an odd embedding_dim there is one fewer cosine column than sine column.
    encoding[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])

    # A buffer follows `module.to(device)`; persistent=False keeps state_dict
    # unchanged compared with storing the table as a plain attribute.
    self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)

  def forward(self, x):
    """Return `x` plus the encodings of its first `x.size(1)` positions.

    `x` is expected to be (batch, seq_length, embedding_dim); the table is
    broadcast over the batch dimension.
    """
    seq_length = x.size(1)
    # `.to(x.device)` is a no-op when the module already lives on x's device.
    return x + self.encoding[:, :seq_length, :].to(x.device)



In [64]:
# Build the encoder for 768-dim embeddings over up to 512 positions, apply it to
# the example embedding, and display the resulting shape.
positional_encoding = Positional_encoding(
    embedding_dim=768,
    max_seq_length=512,
)
input_embedding_with_position = positional_encoding(input_embedding)
input_embedding_with_position.shape

torch.Size([1, 512, 768])

**Multihead Attention**

In [None]:
class Multihead_attention(nn.Module):
  def __init__(self, embedding_dim, num_heads):
    super().__init__()
    self.num_heads = num_heads
    self.head_dim = embedding_dim // num_heads


    self.q_layer = nn.Linear(embedding_dim, embedding_dim)
    self.k_layer = nn.Linear(embedding_dim, embedding_dim)
    self.v_layer = nn.Linear(embedding_dim, embedding_dim)

    self.output_layer = nn.Linear(embedding_dim, embedding_dim)

  def forward(self, x):
    batch_size, seq_length, embedding_dim = x.size()
    Q = self.q_layer(x)
    K = self.k_layer(x)
    V = self.v_layer(x)