In [None]:
import pandas as pd
import transformers

Column 1: the ID of the statement ([ID].json).

Column 2: the label.

Column 3: the statement.

Column 4: the subject(s).

Column 5: the speaker.

Column 6: the speaker's job title.

Column 7: the state info.

Column 8: the party affiliation.

Columns 9-13: the total credit history counts, including the current statement.

9: barely true counts.

10: false counts.

11: half true counts.

12: mostly true counts.

13: pants on fire counts.

Column 14: the context (venue / location of the speech or statement).

In [None]:
# Column names applied, in order, to the header-less TSV splits (see the column notes above).
columns = [
    "ID",
    "label",
    "statement",
    "subjects",
    "speaker",
    "speaker_job",
    "state_info",
    "party_affiliation",
    "barely_true_counts",
    "false_counts",
    "half_true_counts",
    "mostly_true_counts",
    "pants_on_fire_counts",
    "context"
]

In [None]:
# Sanity check: the column notes above describe 14 fields.
len(columns)

In [None]:
# Read the three tab-separated splits (train, valid, test — in that order).
# The files carry no header row, so the names from `columns` are applied.
train_df, valid_df, test_df = (
    pd.read_csv(tsv_path, encoding="utf-8", delimiter="\t", header=None, names=columns)
    for tsv_path in ("train.tsv", "valid.tsv", "test.tsv")
)

In [None]:
# Peek at a single raw statement (the entry at index label 1).
train_df["statement"][1]

In [None]:
# Preview only the first rows instead of rendering the whole training frame.
train_df.head()

In [None]:
def join_metadata(x):
    """Render the metadata fields of one record as a labelled, multi-line string.

    Args:
        x: mapping-like record (e.g. a DataFrame row) providing the keys
            "subjects", "speaker", "speaker_job", "state_info",
            "party_affiliation" and "context".

    Returns:
        A string with one "TAG: value" entry per field. Every line break is
        followed by eight spaces, and the string ends with such a break.
    """
    # (tag shown in the text, key looked up on the record), in output order.
    tagged_fields = (
        ("SUBJECT", "subjects"),
        ("SPEAKER", "speaker"),
        ("JOB", "speaker_job"),
        ("STATE INFO", "state_info"),
        ("PARTY", "party_affiliation"),
        ("CONTEXT", "context"),
    )
    line_break = "\n" + " " * 8
    entries = [f"{tag}: {x[key]}" for tag, key in tagged_fields]
    return line_break.join(entries) + line_break

In [None]:
# List the column names available on the training frame.
train_df.columns

In [None]:
# Build the "metadata" text column: with axis=1 each row is handed to join_metadata.
train_df["metadata"] = train_df.apply(join_metadata, axis=1)
test_df["metadata"] = test_df.apply(join_metadata, axis=1)
valid_df["metadata"] = valid_df.apply(join_metadata, axis=1)

In [None]:
# Expose the statement text under the name "content"; .copy() gives the new
# column its own data rather than sharing it with "statement".
train_df["content"] = train_df["statement"].copy()
test_df["content"] = test_df["statement"].copy()
valid_df["content"] = valid_df["statement"].copy()

In [None]:
# Keep only the two text inputs and the target.
# .copy() makes each result an independent frame, so assigning into it later
# (the label encoding step) does not raise SettingWithCopyWarning.
train_df = train_df[["content", "metadata", "label"]].copy()
valid_df = valid_df[["content", "metadata", "label"]].copy()
test_df = test_df[["content", "metadata", "label"]].copy()

In [None]:
# Preview only the first rows of the reduced training frame.
train_df.head()

In [None]:
# Map each distinct training label to an integer id. pandas' unique() yields
# values in order of first appearance, so the ids follow the row order of train_df.
label_map = {label: i for i, label in enumerate(train_df['label'].unique())}

In [None]:
# Replace the label values with their integer ids from label_map.
# NOTE(review): Series.map yields NaN for values that are not keys of label_map, so
# labels occurring only in valid/test, or re-running this cell on already-encoded
# labels, would presumably produce NaN — confirm before re-executing.
train_df["label"] = train_df["label"].map(label_map)
valid_df["label"] = valid_df["label"].map(label_map)
test_df["label"] = test_df["label"].map(label_map)

In [None]:
# Number of rows per encoded label in the training split.
train_df.label.value_counts()

In [None]:
# Number of rows per encoded label in the validation split.
valid_df.label.value_counts()

In [None]:
# Number of rows per encoded label in the test split.
test_df.label.value_counts()

In [None]:
import pandas as pd
import torch
import re
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, pipeline

class FakeNewsDataset(Dataset):
    """Dataset yielding tokenized statement text, tokenized metadata and the label.

    Each item is a ``(content_tokens, metadata_tokens, label)`` triple. Both token
    dicts hold ``ids``, ``mask`` and ``token_type_ids`` as ``torch.long`` tensors
    built from the tokenizer output for the cleaned text.

    Args:
        dataframe: frame providing ``content``, ``metadata`` and ``label`` columns.
        max_length: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, max_length=512):
        self.data = dataframe
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def _encode(self, text):
        """Clean ``text`` and tokenize it into a dict of fixed-length long tensors."""
        # Lowercase, then drop every character that is neither a word character
        # nor whitespace (i.e. punctuation and other symbols).
        text = re.sub(r'[^\w\s]', '', text.lower())

        # padding/truncation are requested explicitly so every item has exactly
        # max_length tokens and can be stacked into a batch.
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
        )

        return {
            'ids': torch.tensor(encoded["input_ids"], dtype=torch.long),
            'mask': torch.tensor(encoded["attention_mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(encoded["token_type_ids"], dtype=torch.long),
        }

    def __getitem__(self, idx):
        """Return ``(content_tokens, metadata_tokens, label)`` for the row at position ``idx``."""
        # .iloc is positional, matching the 0..len-1 indices a DataLoader sampler
        # produces even when the frame's index is not a default RangeIndex.
        content = self.data['content'].iloc[idx]
        metadata = self.data['metadata'].iloc[idx]
        label = self.data['label'].iloc[idx]

        content_tokens = self._encode(content)
        metadata_tokens = self._encode(metadata)

        return content_tokens, metadata_tokens, label

In [None]:
# Wrap each split (train, valid, test — in that order) in a FakeNewsDataset
# using the default max_length.
train_dataset, valid_dataset, test_dataset = (
    FakeNewsDataset(split_df) for split_df in (train_df, valid_df, test_df)
)

In [None]:
# Batch the three datasets. Only the training loader shuffles: evaluation
# results do not benefit from a random order, and a fixed order keeps
# validation/test passes repeatable.
train_dataloader = DataLoader(
        train_dataset, batch_size=24,
        shuffle=True
)

valid_dataloader = DataLoader(
        valid_dataset, batch_size=24,
        shuffle=False
)

test_dataloader = DataLoader(
        test_dataset, batch_size=24,
        shuffle=False
)

In [None]:
# Pull a single batch from the training loader to inspect what the dataset yields.
content_features, metadata_features, labels = next(iter(train_dataloader))

In [None]:
import torch
import torch.nn as nn

class Classifier(nn.Module):
    """Metadata-only baseline: a BERT encoder followed by one linear classification layer.

    Args:
        input_dim: width of the encoder output fed to the classification layer.
        num_ff_layers: number of feed-forward blocks to construct.
        ff_dim: hidden width of each feed-forward block.
        dropout_prob: dropout probability inside the feed-forward blocks.
        num_classes: number of output classes; when ``None`` it is taken from
            the notebook-level ``label_map``.

    Note:
        The query/key/value projections, the feed-forward blocks and the layer
        norm are constructed but ``forward`` does not call them; they are kept so
        the module's parameter set stays as it was.
    """

    def __init__(self, input_dim=768, num_ff_layers=6, ff_dim=3072, dropout_prob=0.2,
                 num_classes=None):
        super().__init__()

        if num_classes is None:
            num_classes = len(label_map)

        # Checkpoint id corrected from "bert-based-uncased"; this is the same
        # "bert-base-uncased" checkpoint the dataset's tokenizer loads.
        self.bert_model = transformers.BertModel.from_pretrained("bert-base-uncased")

        # Query, Key and Value linear layers (not used by forward).
        self.query_layer = nn.Linear(input_dim, input_dim)
        self.key_layer = nn.Linear(input_dim, input_dim)
        self.value_layer = nn.Linear(input_dim, input_dim)

        # Feed-forward blocks (not used by forward).
        self.ff_layers = nn.ModuleList()
        for _ in range(num_ff_layers):
            self.ff_layers.append(nn.Sequential(
                nn.Linear(input_dim, ff_dim),
                nn.GELU(),
                nn.Dropout(dropout_prob),
                nn.Linear(ff_dim, input_dim),
                nn.GELU(),
                nn.Dropout(dropout_prob)
            ))

        # Classification layer producing one score per class.
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, num_classes),
        )

        # Layer normalization (not used by forward).
        self.layer_norm = nn.LayerNorm(input_dim)

    def forward(self,
                metadata_ids,
                metadata_mask,
                metadata_token_type_ids,
               ):
        """Encode the metadata tokens with BERT and classify the result.

        Args:
            metadata_ids: token ids of the metadata text.
            metadata_mask: attention mask matching ``metadata_ids``.
            metadata_token_type_ids: token type ids matching ``metadata_ids``.

        Returns:
            The classification layer's output, with the last dimension squeezed
            away when it has size 1.
        """
        # With return_dict=False the encoder returns a tuple; the second element
        # (the pooled output, per the BertModel documentation) feeds the classifier.
        _, metadata = self.bert_model(
            metadata_ids,
            attention_mask=metadata_mask,
            token_type_ids=metadata_token_type_ids,
            return_dict=False
        )

        output = self.classifier(metadata)

        return output.squeeze(-1)

In [None]:
# Instantiate the classifier.
# NOTE(review): `model` is assigned again from a fresh Classifier() in the
# configuration cell further down, so this instance is replaced there.
model = Classifier()

In [None]:
from torchmetrics.classification import MulticlassF1Score
from tqdm.notebook import tqdm

In [None]:
# Training configuration.
# Use the first CUDA device when one is available, otherwise fall back to CPU so
# the notebook still runs on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
lr = 2e-4
n_epochs = 100

# Fresh model instance, moved to the selected device.
model = Classifier()
model = model.to(device)

In [None]:
import wandb

# Start a Weights & Biases run; the F1 scores are logged to it later on.
# NOTE(review): the descriptive string is passed as the run `id`; W&B also has a
# separate `name` argument for display names — confirm `id` is intended here.
wandb.init(
    project="ELEC877",
    entity="debadityashome",
    id="(Baseline) - Metadata only Classifier"
)

In [None]:
def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    trainable_total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable_total += tensor.numel()
    return trainable_total

In [None]:
# Trainable parameter count before the BERT encoder is frozen.
count_parameters(model)

In [None]:
# Freeze the BERT encoder so only the remaining layers are trained.
# parameters() already walks every sub-module, so a single pass suffices.
for param in model.bert_model.parameters():
    param.requires_grad = False

In [None]:
# Trainable parameter count after freezing the BERT encoder.
count_parameters(model)

In [None]:
# Optimise only the parameters that still require gradients.
optim = torch.optim.AdamW([param for param in model.parameters() if param.requires_grad], lr)

criterion = nn.CrossEntropyLoss()

# Separate F1 accumulators for the training and validation passes.
train_metric = MulticlassF1Score(num_classes=len(label_map.keys())).to(device)
valid_metric = MulticlassF1Score(num_classes=len(label_map.keys())).to(device)

for i in range(n_epochs):
    print(f"\n****************** Epoch - {i} *******************\n\n")

    # ---- Training pass ----
    model.train()
    pbar = tqdm(train_dataloader)

    # The content tokens are ignored: this baseline uses the metadata only.
    for _, metadata_features, labels in pbar:

        optim.zero_grad()

        metadata_ids = metadata_features['ids'].to(device)
        metadata_token_type_ids = metadata_features['token_type_ids'].to(device)
        metadata_mask = metadata_features['mask'].to(device)

        labels = labels.to(device)

        prediction = model(
            metadata_ids,
            metadata_mask,
            metadata_token_type_ids
        )

        # CrossEntropyLoss with its default reduction already returns the batch mean.
        loss = criterion(prediction, labels)

        loss.backward()

        pbar.set_description(f"loss: {loss.item():.4f}")

        train_metric.update(prediction, labels)

        # In-place gradient clipping; the non-underscore variant is deprecated.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

        optim.step()

    train_f1 = train_metric.compute()
    print(f"\nTotal train F1 score: {train_f1}")

    # ---- Validation pass ----
    model.eval()

    with torch.no_grad():

        print("\n****Inference over Validation data****\n")

        for _, metadata_features, labels in tqdm(valid_dataloader):

            metadata_ids = metadata_features['ids'].to(device)
            metadata_token_type_ids = metadata_features['token_type_ids'].to(device)
            metadata_mask = metadata_features['mask'].to(device)

            labels = labels.to(device)

            prediction = model(
                metadata_ids,
                metadata_mask,
                metadata_token_type_ids
            )

            valid_metric.update(prediction, labels)

        valid_f1 = valid_metric.compute()
        print(f"\nTotal valid F1 score: {valid_f1}")

        wandb.log(
            {
                "train F1-score": train_f1.item(),
                "validation F1-score": valid_f1.item(),
            }
        )

    # Start the next epoch with empty accumulators.
    train_metric.reset()
    valid_metric.reset()

In [None]:
# Final evaluation on the held-out test split.
model.eval()

test_metric = MulticlassF1Score(num_classes=len(label_map.keys())).to(device)

with torch.no_grad():

    print("\n****Inference over Test data****\n")

    # Iterate the test loader (the validation loader was used here before).
    for _, metadata_features, labels in tqdm(test_dataloader):

        metadata_ids = metadata_features['ids'].to(device)
        metadata_token_type_ids = metadata_features['token_type_ids'].to(device)
        metadata_mask = metadata_features['mask'].to(device)

        # Labels stay integer class ids, as in the training and validation loops.
        labels = labels.to(device)

        prediction = model(
            metadata_ids,
            metadata_mask,
            metadata_token_type_ids
        )

        test_metric.update(prediction, labels)

    test_f1 = test_metric.compute()
    print(f"\nTotal test F1 score: {test_f1}")

    wandb.log(
        {
            "test F1-score": test_f1.item(),
        }
    )