In [13]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import transformers
from sklearn.model_selection import train_test_split
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

## Dataset

In [14]:
# Checkpoint identifier handed to the tokenizer's and the BERT model's `from_pretrained`.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

# Data-pipeline settings: token length of every encoded pair, and DataLoader batch size.
MAX_LEN, BATCH_SIZE = 512, 10

# Seed supplied to `train_test_split` so the data splits are repeatable.
RANDOM_SEED = 42

In [15]:
# Load the tokenizer for the same checkpoint name the model is built from.
tokenizer = BertTokenizer.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
)

In [16]:
# Stance rows and article bodies are read from two CSV files in the working directory.
stances_headlines = pd.read_csv('train_stances.csv')
bodies = pd.read_csv('train_bodies.csv')

# Join the two frames on their common 'Body ID' column (pandas' default inner join).
stances_bodies = pd.merge(stances_headlines, bodies, on='Body ID')

In [17]:
class StancesDataset(Dataset):
    """Map-style dataset of (headline, body) pairs encoded for a BERT-style model.

    Each item is the tokenizer encoding of one headline/body pair, padded or
    truncated to exactly ``max_len`` tokens, plus the integer id of its stance.

    Args:
        headlines: Indexable sequence of headline strings.
        bodies: Indexable sequence of article-body strings, aligned with
            ``headlines`` by position.
        stances: Indexable sequence of stance names, aligned with
            ``headlines``; every value must be a key of ``self.categories``.
        tokenizer: Callable tokenizer accepting ``(text, text_pair, **kwargs)``
            and returning a mapping with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` (e.g. a Hugging Face ``BertTokenizer``).
        max_len: Length, in tokens, of every encoded sequence.
    """

    def __init__(self, headlines, bodies, stances, tokenizer, max_len):
        self.headlines = headlines
        self.bodies = bodies
        self.stances = stances
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Stance name -> class index used as the training target.
        self.categories = {"unrelated": 0, "agree": 1, "discuss": 2, "disagree": 3}

    def __len__(self):
        """Return the number of headline/body pairs."""
        return len(self.headlines)

    def __getitem__(self, idx):
        """Encode the pair at position ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``
            (each a ``torch.long`` tensor of ``max_len`` elements) and
            ``labels`` (a ``torch.long`` tensor of shape ``(1,)``).

        Raises:
            KeyError: if the stance at ``idx`` is not in ``self.categories``.
        """
        headline = self.headlines[idx]
        body = self.bodies[idx]
        stance_label = self.categories[self.stances[idx]]

        # truncation=True is required: without it, pairs longer than max_len
        # keep their full length, which breaks batching and overruns the
        # model's maximum sequence length. padding='max_length' replaces the
        # deprecated pad_to_max_length flag.
        inputs = self.tokenizer(
            headline,
            body,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
        )

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            # Kept as shape (1,) so existing consumers that flatten labels still work.
            'labels': torch.tensor([stance_label], dtype=torch.long),
        }

## Model

In [18]:
class StanceDetectionModel(nn.Module):
    """BERT encoder followed by a single linear classification layer.

    Args:
        n_classes: Number of output classes of the linear head.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.l1 = torch.nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalized class scores (logits) for a batch.

        The scores are deliberately NOT passed through softmax:
        ``nn.CrossEntropyLoss`` applies log-softmax itself, so feeding it
        probabilities would apply softmax twice and flatten the gradients.
        ``argmax`` over the logits selects the same class as over the
        probabilities; apply ``F.softmax(logits, dim=1)`` at the call site
        when probabilities are needed.

        Args:
            input_ids: Token ids for the batch.
            attention_mask: Mask marking real tokens versus padding.
            token_type_ids: Segment ids distinguishing the two sequences.

        Returns:
            Tensor with one row per input and ``n_classes`` columns.
        """
        # With return_dict=False the encoder returns a tuple whose second
        # element is the pooled output consumed by the classification head.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.l1(pooled_output)

## Training

In [48]:
def train(batch_size, data, model, learning_rate=0.0001, device='cpu', log_every=100):
    """Run one pass over ``data``, updating ``model`` with Adam.

    Args:
        batch_size: Unused; kept so existing call sites keep working. The
            batch size is whatever ``data`` yields.
        data: Iterable of batches. Each batch is a mapping with tensors under
            ``'input_ids'``, ``'attention_mask'``, ``'token_type_ids'`` and
            ``'labels'`` (labels are flattened to one id per sample).
        model: Module called as ``model(input_ids, attention_mask,
            token_type_ids)`` and expected to return one row of unnormalized
            class scores per sample (``nn.CrossEntropyLoss`` applies
            log-softmax internally).
        learning_rate: Adam learning rate.
        device: Device the model and every batch are moved to.
        log_every: Print a progress line every ``log_every`` batches;
            ``0`` or ``None`` disables progress output.

    Returns:
        ``(accuracy, mean_loss)``: the fraction of samples classified
        correctly and the mean of the per-batch losses.

    Raises:
        ValueError: if ``data`` yields no samples.
    """
    loss_fn = nn.CrossEntropyLoss().to(device)
    # NOTE: a fresh optimizer is built on every call, so Adam's moment
    # estimates do not carry over between epochs.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train().to(device)
    num_correct_predictions = 0
    num_samples = 0
    training_loss = []

    for i, input_data in enumerate(data):
        if log_every and i % log_every == 0:
            print('Batch #', i)
        input_ids = input_data['input_ids'].to(device)
        attention_mask = input_data['attention_mask'].to(device)
        token_type_ids = input_data['token_type_ids'].to(device)
        # reshape(-1) rather than squeeze(): squeeze() would collapse a final
        # batch of size 1 to a 0-d tensor, which the loss rejects.
        labels = input_data['labels'].to(device).reshape(-1)

        optimizer.zero_grad()
        output = model(input_ids, attention_mask, token_type_ids)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()

        preds = torch.argmax(output, dim=1)
        num_correct_predictions += (preds == labels).sum().item()
        # Count samples, not batches: len(data) on a DataLoader is the number
        # of batches and would inflate the accuracy by the batch size.
        num_samples += labels.size(0)
        training_loss.append(loss.item())

    if num_samples == 0:
        raise ValueError("train() received no samples")
    return num_correct_predictions / num_samples, np.mean(training_loss)


In [49]:
def validate(model, data_loader, device='cpu'):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            token_type_ids)`` and expected to return one row of unnormalized
            class scores per sample.
        data_loader: Iterable of batches. Each batch is a mapping with tensors
            under ``'input_ids'``, ``'attention_mask'``, ``'token_type_ids'``
            and ``'labels'`` (labels are flattened to one id per sample).
        device: Device the model and every batch are moved to.

    Returns:
        ``(accuracy, mean_loss)``: the fraction of samples classified
        correctly and the mean of the per-batch losses.

    Raises:
        ValueError: if ``data_loader`` yields no samples.
    """
    model = model.to(device).eval()
    loss_fn = nn.CrossEntropyLoss().to(device)
    validation_losses = []
    correct_predictions = 0
    num_samples = 0

    with torch.no_grad():
        for input_data in data_loader:
            input_ids = input_data["input_ids"].to(device)
            attention_mask = input_data["attention_mask"].to(device)
            token_type_ids = input_data['token_type_ids'].to(device)
            # Flatten to one label per sample; comparing (B,) predictions with
            # (B, 1) labels would broadcast to (B, B) and miscount matches.
            labels = input_data["labels"].to(device).reshape(-1)

            output = model(input_ids, attention_mask, token_type_ids)
            preds = torch.argmax(output, dim=1)
            loss = loss_fn(output, labels)

            correct_predictions += (preds == labels).sum().item()
            num_samples += labels.size(0)
            validation_losses.append(loss.item())

    if num_samples == 0:
        raise ValueError("validate() received no samples")
    return correct_predictions / num_samples, np.mean(validation_losses)

In [50]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap a stances DataFrame in a ``StancesDataset`` and a ``DataLoader``.

    Args:
        df: DataFrame providing the ``'Headline'``, ``'articleBody'`` and
            ``'Stance'`` columns.
        tokenizer: Tokenizer forwarded to ``StancesDataset``.
        max_len: Token length forwarded to ``StancesDataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples on every pass.
            Defaults to ``False`` so existing callers are unaffected.

    Returns:
        A ``DataLoader`` over the encoded samples.
    """
    # to_numpy() drops the DataFrame index so the dataset can address rows by
    # position even after train_test_split has scrambled the index labels.
    ds = StancesDataset(
        df['Headline'].to_numpy(),
        df['articleBody'].to_numpy(),
        df['Stance'].to_numpy(),
        tokenizer,
        max_len,
    )
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle)


# 90% of the rows go to training; the remaining 10% is halved into validation
# and test. The hold-out gets its own name so no frame is rebound mid-cell.
df_train, df_holdout = train_test_split(stances_bodies, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)

# Only the training loader reshuffles between passes; evaluation order is fixed.
train_dataloader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True)
val_dataloader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_dataloader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
epochs = 1
# Reuse the notebook-wide setting instead of repeating the literal.
batch_size = BATCH_SIZE
# Seed torch before the model is built so the classification head's random
# initialisation is repeatable across kernel restarts.
torch.manual_seed(RANDOM_SEED)
# Use the GPU when one is present; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = StanceDetectionModel(n_classes=4)
for i in range(epochs):
    accuracy, mean_loss = train(batch_size, train_dataloader, model, device=device)
    print(f'Training accuracy at epoch {i} is {accuracy}')
    print(f'Mean training loss at epoch {i} is {mean_loss}')
    accuracy, mean_val_loss = validate(model, val_dataloader, device=device)
    print(f'Validaton accuracy at epoch {i} is {accuracy}')
    # Report the validation loss here; the training loss was printed above.
    print(f'Mean validation loss at epoch {i} is {mean_val_loss}')
    