# Using Hugging Face Transformers to fine-tune for LLM tasks

## Load libraries

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn

## Load the tokenizer and model

In [None]:
# Checkpoint name handed to both the tokenizer and the classification model.
model_name = "bert-base-uncased"

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # 2 for binary classification
)

# Move the model's parameters onto the selected device.
model.to(device)

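## Tokenize

`texts` (a list of strings) and `labels` (one integer class id per text) must be defined before running this cell.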

In [None]:
# Tokenize all texts in one call: pad to the longest example and truncate
# over-long ones so the tokenizer returns rectangular PyTorch tensors.
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# as_tensor keeps this cell re-runnable (no copy-construct warning when
# `labels` is already a tensor), and the explicit integer dtype is what
# cross-entropy expects for class-index targets.
labels = torch.as_tensor(labels, dtype=torch.long)

## Dataset and DataLoader (PyTorch)

In [None]:
# Bundle token ids, attention masks and labels so every dataset item is one
# aligned (input_ids, attention_mask, label) triple.
dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"], labels)

# Reshuffle on every epoch so training does not always see the examples in the
# same (possibly label-sorted) order.
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

## Optimizer, loss function and scheduler

In [None]:
# AdamW over every parameter of the model.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

# Stand-alone cross-entropy criterion, for use when the loss is computed from
# the logits instead of being returned by the model.
loss_fn = nn.CrossEntropyLoss()

# Multiply the learning rate by 0.1 on each call to scheduler.step().
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=1,
    gamma=0.1,
)

## Training loop

In [None]:
# Fine-tune the model for a fixed number of passes over the dataloader.
model.train()  # switch on training-only behaviour such as dropout
for epoch in range(3):  # Example: 3 epochs
    for batch in dataloader:
        # Batch-local names on purpose: unpacking into `labels` would overwrite
        # the full label tensor created in the tokenization cell.
        batch_input_ids, batch_attention_mask, batch_labels = (
            tensor.to(device) for tensor in batch
        )

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself.
        outputs = model(
            batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        # loss = loss_fn(outputs.logits, batch_labels)  # Alternative if not using the model's loss

        loss.backward()
        optimizer.step()

        print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

    # Advance the learning-rate schedule once per epoch, after the optimizer
    # steps; without this call the scheduler never changes the learning rate.
    scheduler.step()

# Special: Custom output model

In [None]:
class BertLSTMForSentimentAnalysis(nn.Module):
    """BERT encoder followed by an LSTM and a linear classification head.

    The token-level hidden states produced by BERT are fed through an LSTM;
    the LSTM's final hidden state is passed through dropout and a linear
    layer to produce one logit per label.

    Args:
        model_name: Checkpoint name or path given to ``BertModel.from_pretrained``.
        num_labels: Number of output classes (size of the logit vector).
        lstm_hidden_size: Hidden size of the LSTM.
        lstm_layers: Number of stacked LSTM layers.
        dropout_rate: Dropout probability applied before the classifier and,
            when ``lstm_layers > 1``, between LSTM layers.
        bidirectional: Run the LSTM in both directions. The classifier input
            is widened accordingly.
    """

    def __init__(self, model_name, num_labels, lstm_hidden_size=256, lstm_layers=1,
                 dropout_rate=0.1, bidirectional=False):
        super().__init__()

        self.bert = BertModel.from_pretrained(model_name)
        self.lstm = nn.LSTM(
            input_size=self.bert.config.hidden_size,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=bidirectional,
            # nn.LSTM only applies dropout between layers, so it is disabled
            # for a single-layer LSTM.
            dropout=dropout_rate if lstm_layers > 1 else 0,
        )
        self.dropout = nn.Dropout(dropout_rate)
        # A bidirectional LSTM yields one final state per direction.
        num_directions = 2 if bidirectional else 1
        self.classifier = nn.Linear(lstm_hidden_size * num_directions, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return classification logits of shape ``(batch, num_labels)``.

        Args:
            input_ids: Token ids, batch first.
            attention_mask: Mask with 1 for real tokens and 0 for padding.
                Padding is assumed to be on the right (the BERT tokenizer's
                default). May be ``None``, in which case every position is
                treated as a real token.
        """
        # Token-level representations from the last BERT layer.
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        if attention_mask is not None:
            # Pack the batch so the LSTM stops at each sequence's true length;
            # otherwise the final hidden state of a shorter sequence would be
            # computed from padding positions. Lengths must be a CPU int64
            # tensor and at least 1 for pack_padded_sequence.
            lengths = attention_mask.sum(dim=1).clamp(min=1).long().cpu()
            lstm_input = nn.utils.rnn.pack_padded_sequence(
                hidden_states, lengths, batch_first=True, enforce_sorted=False
            )
        else:
            lstm_input = hidden_states

        _, (h_n, _) = self.lstm(lstm_input)

        # h_n is (num_layers * num_directions, batch, hidden); the last entries
        # belong to the top layer.
        if self.lstm.bidirectional:
            lstm_final_hidden_state = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            lstm_final_hidden_state = h_n[-1]

        # Dropout and classification
        x = self.dropout(lstm_final_hidden_state)
        logits = self.classifier(x)
        return logits