In [2]:
import os
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification,BertModel, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
import pandas as pd
import numpy as np

# --- Run configuration ---------------------------------------------------
SEED = 42               # fixed RNG seed so repeated runs start from the same random state
EPOCHS = 10             # number of passes the training loop makes over the dataloader
BATCH_SIZE = 32         # batch size handed to the DataLoader
VALIDATION_SIZE = 0.2   # NOTE(review): presumably a hold-out fraction; no visible cell uses it yet

# Label id -> human-readable class name; its length sets the size of the classifier head.
CLASSES = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}

# Seed the random number generators of the libraries used below. The DataLoader
# shuffles the training data and the classifier head is newly initialised, so
# without a seed two runs of this notebook do not start from the same state.
torch.manual_seed(SEED)
np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the training and test splits from the local data/ folder into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [4]:
# Convert the DataFrame to a list of sentence pairs and a list of labels.
# Column-wise access replaces two separate row-by-row `iterrows()` passes, which
# build a Series object for every row and are much slower on large frames.

# train_data: one (premise, hypothesis) tuple per training row.
train_data = list(zip(train_df['premise'], train_df['hypothesis']))
# labels: values of the `label` column, in the same row order as train_data.
labels = train_df['label'].tolist()

In [5]:
# Tokenization
# Load the tokenizer that matches the multilingual cased BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Every entry of train_data is padded / truncated to a fixed length of 300
# and the result is returned as PyTorch tensors.
encoding_options = dict(
    padding='max_length',
    truncation=True,
    max_length=300,
    return_tensors='pt',
)
tokenized_train = tokenizer(train_data, **encoding_options)

In [6]:
# Dataset and DataLoader
# Bundle token ids, attention masks and labels so each dataset item is a 3-tuple.
label_tensor = torch.tensor(labels)
dataset = TensorDataset(
    tokenized_train['input_ids'],
    tokenized_train['attention_mask'],
    label_tensor,
)
# Serve the dataset in shuffled mini-batches of BATCH_SIZE items.
dataloader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

In [7]:
# BERT model
# Multilingual cased BERT with a sequence-classification head that has one
# output per entry in CLASSES.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=len(CLASSES),
)

# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in
# favour of it. `eps` and `weight_decay` are pinned to the defaults documented
# for `transformers.AdamW` (1e-6 and 0.0) so the update rule stays the same as
# before; PyTorch's defaults for these two arguments differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Training loop: run EPOCHS passes over `dataloader`, taking one optimizer step per batch.
# Batches are moved to whatever device the model's parameters live on, so this
# cell works unchanged whether or not the model has been moved to a GPU.
device = next(model.parameters()).device
model.train()
for epoch in range(EPOCHS):
    epoch_loss_sum, step_count = 0.0, 0
    for batch in dataloader:
        # Unpack into a batch-local name: reusing the name `labels` here would
        # overwrite the full label list that the dataset cell reads, breaking
        # any re-run of that cell.
        input_ids, attention_mask, batch_labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        # The loss is read from the model output; `labels` is supplied for that purpose.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss_sum += loss.item()
        step_count += 1
    # Report progress once per epoch (skipped if the dataloader yielded nothing).
    if step_count:
        print(f"epoch {epoch + 1}/{EPOCHS} - mean training loss: {epoch_loss_sum / step_count:.4f}")
# `epoch` and `loss` remain defined after the loop; the checkpoint cell reads both.

In [None]:
# Save model checkpoint
# Gather the last epoch index, the model and optimizer state, and the last
# batch loss into one dictionary, then write it to disk in a single call.
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
    # Further entries can be added here if more state needs to be saved.
}
torch.save(checkpoint, 'bert_model_checkpoint.pth')

In [None]:
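# TODO: build `test_dataloader` from `test_df` — tokenize the test sentence pairs with the
# same tokenizer settings as the training pairs and wrap them in a DataLoader.
# The inference cell below iterates over `test_dataloader` and fails until it exists.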

In [None]:
# Run inference over the test data and collect one predicted class id per example.
predicted_ids = []

# Feed batches on the same device as the model's parameters.
device = next(model.parameters()).device
# Switch to evaluation mode so dropout is disabled during prediction.
# (The model stays in eval mode afterwards; call model.train() before training again.)
model.eval()

# NOTE(review): `test_dataloader` is not defined in any visible cell. It is assumed to
# yield batches whose first two tensors are input_ids and attention_mask, like the
# training dataloader — confirm once it is built.
for batch in test_dataloader:
    # Only the first two tensors are used, so batches with or without a trailing
    # label tensor both work, and the global `labels` list is not overwritten.
    input_ids, attention_mask = batch[0].to(device), batch[1].to(device)
    # Disable gradient calculation to speed up inference
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    # Get logits from the model's output
    logits = outputs.logits
    # Class probabilities (kept for inspection; the prediction itself uses the logits)
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    # Predicted class id = index of the largest logit
    predicted_labels = torch.argmax(logits, dim=-1)
    # Store this batch's predictions
    predicted_ids.extend(predicted_labels.tolist())

# Convert the collected predictions to a numpy array for further analysis
predictions = np.array(predicted_ids)