In [5]:
# BERT AUTOMATIC DEFINITION EXTRACTION

In [1]:
!pip install transformers -q
!pip install tqdm -q
!pip install scikit-learn -q

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
#from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from torch.optim import AdamW
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:


# ----- 6. Evaluation Function -----
def evaluate(loader, return_loss=False):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0
    with torch.no_grad():
        for batch, labels in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()

            preds = (outputs > 0.5).int().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    if return_loss:
        return acc, total_loss / len(loader)
    else:
        return acc

# === final classification report evaluation ===
def predict_all(loader):
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for batch, labels in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            output = model(input_ids, attention_mask)
            preds.extend((output > 0.5).int().cpu().numpy())
            targets.extend(labels.numpy())

    return targets, preds

# ----- Load Data -----
datapath = "./data/"

train_df = pd.read_csv(datapath+"task1_train.csv")
dev_df = pd.read_csv(datapath+"task1_dev.csv")
test_df = pd.read_csv(datapath+"task1_test_labeled.csv")

# ----- 2. Dataset Class -----
class SentenceDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=64):
        self.encodings = tokenizer(df["Sentence"].tolist(), truncation=True, padding=True,
                                   max_length=max_len, return_tensors="pt")
        self.labels = torch.tensor(df["Label"].tolist()).float()

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}, self.labels[idx]

    def __len__(self):
        return len(self.labels)

# ----- 3. Model -----
class BERTClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)  # Binary classification
    
    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.pooler_output
        return torch.sigmoid(self.fc(cls)).squeeze()

# ----- 4. Setup -----
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_data = SentenceDataset(train_df, tokenizer)
dev_data   = SentenceDataset(dev_df, tokenizer)
test_data  = SentenceDataset(test_df, tokenizer)

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
dev_loader   = DataLoader(dev_data, batch_size=16)
test_loader  = DataLoader(test_data, batch_size=16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier().to(device)
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
loss_fn = nn.BCELoss()

# ----- 5. Training -----
best_val_loss = float('inf')
patience = 2
patience_counter = 0
epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    
    for batch, labels in tqdm(train_loader) :
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Collect predictions and labels for accuracy
        preds = (outputs > 0.5).int().cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

    # Compute training accuracy
    train_acc = accuracy_score(all_labels, all_preds)

    val_acc, val_loss = evaluate(dev_loader, return_loss=True)
    print(f"[Epoch {epoch+1}] Training Loss: {total_loss / len(train_loader):.4f}, Train Acc: {train_acc:.4f}, "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_model3.pt")
        print("new best model")

    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping.")
            break

    # Compute training accuracy
    train_acc = accuracy_score(all_labels, all_preds)

    '''val_acc, val_loss = evaluate(dev_loader, return_loss=True)
    print(f"[Epoch {epoch+1}] Training Loss: {total_loss / len(train_loader):.4f}, Train Acc: {train_acc:.4f}, "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")'''


# ----- 7. Run Eval -----
model.load_state_dict(torch.load("best_model3.pt"))

y_true, y_pred = predict_all(test_loader)
print(classification_report(y_true, y_pred, digits=4))


#print("Dev Accuracy:", predict_all(dev_loader))
#print("Test Accuracy:", evaluate(test_loader)) #No label

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 1092/1092 [03:14<00:00,  5.63it/s]


[Epoch 1] Training Loss: 0.3932, Train Acc: 0.8264, Val Loss: 0.2934, Val Acc: 0.8841
new best model


100%|██████████| 1092/1092 [03:17<00:00,  5.53it/s]


[Epoch 2] Training Loss: 0.2778, Train Acc: 0.8879, Val Loss: 0.3012, Val Acc: 0.8817


100%|██████████| 1092/1092 [03:16<00:00,  5.57it/s]


[Epoch 3] Training Loss: 0.2044, Train Acc: 0.9207, Val Loss: 0.3254, Val Acc: 0.8852
Early stopping.
              precision    recall  f1-score   support

         0.0     0.8683    0.9207    0.8937       580
         1.0     0.8115    0.7097    0.7572       279

    accuracy                         0.8522       859
   macro avg     0.8399    0.8152    0.8254       859
weighted avg     0.8498    0.8522    0.8494       859



In [10]:
# ----- 7. Run Eval -----
model.load_state_dict(torch.load("best_model2.pt"))

y_true, y_pred = predict_all(test_loader)
print(classification_report(y_true, y_pred, digits=4))


              precision    recall  f1-score   support

         0.0     0.9033    0.8862    0.8947       580
         1.0     0.7724    0.8029    0.7873       279

    accuracy                         0.8591       859
   macro avg     0.8379    0.8445    0.8410       859
weighted avg     0.8608    0.8591    0.8598       859



In [4]:
# ----- 7. Run Eval -----
model.load_state_dict(torch.load("best_model.pt"))

y_true, y_pred = predict_all(test_loader)
print(classification_report(y_true, y_pred, digits=4))


              precision    recall  f1-score   support

         0.0     0.9120    0.8759    0.8936       580
         1.0     0.7616    0.8244    0.7917       279

    accuracy                         0.8591       859
   macro avg     0.8368    0.8501    0.8427       859
weighted avg     0.8632    0.8591    0.8605       859



In [6]:
# Save after training
'''model.save_pretrained("bert_best_mod")
tokenizer.save_pretrained("bert_best_tok")'''

'model.save_pretrained("bert_best_mod")\ntokenizer.save_pretrained("bert_best_tok")'

### INference Pipeline

In [3]:
# Load model for inference / Prod

from transformers import BertForSequenceClassification, BertTokenizer
import torch

# ----- 3. Model -----
class BERTClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)  # Binary classification
    
    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.pooler_output
        return torch.sigmoid(self.fc(cls)).squeeze()

# ----- 4. Setup -----
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier().to(device)
model.load_state_dict(torch.load("best_model.pt"))
model.eval()
print("model loaded")

#Predict function
def predict_sentences(sentences, model, tokenizer, device):
    model.eval()
    encodings = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(device)
    encodings.pop("token_type_ids", None)  # Optional if forward doesn't take it
    
    with torch.no_grad():
        probs = model(**encodings)  # Already sigmoid outputs from your forward
        preds = (probs >= 0.5).long()  # Threshold for binary classification
    
    return preds.cpu().numpy(), probs.cpu().numpy()


model loaded


In [10]:
texts = [
    "The atom is the smallest unit of matter.",
    "I hate rainy days."
]
pred_labels, pred_probs = predict_sentences(texts, model, tokenizer, device)
print(pred_labels)  # e.g. [1, 0]
print(pred_probs)   # e.g. [[0.1, 0.9], [0.8, 0.2]]


[1 0]
[0.939231   0.00966283]


#### BATCHED VERSION OF PREDICTION

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

# ------------------------------
# Dataset for inference
# ------------------------------
class SentenceTermDataset(Dataset):
    def __init__(self, sentences, terms, tokenizer, max_length=128):
        self.sentences = sentences
        self.terms = terms
        self.encodings = tokenizer(
            sentences,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["term"] = self.terms[idx]
        item["sentence"] = self.sentences[idx]
        return item

# ------------------------------
# Batched prediction
# ------------------------------
def predict_in_batches(sentences, terms, model, tokenizer, device, batch_size=32):
    dataset = SentenceTermDataset(sentences, terms, tokenizer)
    loader = DataLoader(dataset, batch_size=batch_size)

    model.eval()
    all_terms, all_sentences, all_preds, all_probs = [], [], [], []

    with torch.no_grad():
        for batch in tqdm(loader):
            # Extract fields
            term_batch = batch.pop("term")
            sentence_batch = batch.pop("sentence")
            
            # Send tensors to device
            batch = {k: v.to(device) for k, v in batch.items()}
            batch.pop("token_type_ids", None)  # Optional if model.forward doesn't take it

            # Forward pass
            probs = model(**batch)  # Already sigmoid outputs from your forward
            preds = (probs >= 0.5).long()

            # Store
            all_terms.extend(term_batch)
            all_sentences.extend(sentence_batch)
            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())

    return all_terms, all_sentences, all_preds, all_probs


In [7]:
matches = pd.read_csv("data/deft_matches.csv")
sentences = matches["sentence"].to_list()
terms = matches["terms"].to_list()


terms_out, sentences_out, preds, probs = predict_in_batches(
    sentences, terms, model, tokenizer, device, batch_size=16
)

# Save to CSV
df = pd.DataFrame({
    "term": terms_out,
    "sentence": sentences_out,
    "pred_label": preds,
    "pred_prob": [float(p) for p in probs]
})
df.to_csv("predictions.csv", index=False, sep=";")

100%|██████████| 10/10 [00:01<00:00,  7.54it/s]
