In [None]:
#I made a requirements file, but you can also use this

!pip install torch torchvision 
!pip install transformers pandas numpy scikit-learn tqdm

In [1]:
import pandas as pd
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [20]:
def preprocess_text(text):
    text = str(text).lower()
    text = text.replace('\n', ' ')
    text = ' '.join(text.split())
    return text

In [3]:

def encode_text_pair(tokenizer, claim, evidence, max_length=512):
    claim = preprocess_text(claim)
    evidence = preprocess_text(evidence)
    
    return tokenizer.encode_plus(
        claim,
        evidence,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

In [4]:
#data loading stuff
def load_dataset(file_path):
    df = pd.read_csv(file_path)
    print(f"read {len(df)} samples")
    return df

def create_batch(df, tokenizer, indices):
    claims = df.iloc[indices]['Claim'].values
    evidences = df.iloc[indices]['Evidence'].values
    labels = df.iloc[indices]['label'].values
    
    # Process all pairs in the batch
    batch_encodings = [
        encode_text_pair(tokenizer, claim, evidence)
        for claim, evidence in zip(claims, evidences)
    ]
    
    return {
        'input_ids': torch.cat([enc['input_ids'] for enc in batch_encodings]),
        'attention_mask': torch.cat([enc['attention_mask'] for enc in batch_encodings]),
        'labels': torch.tensor(labels, dtype=torch.long)
    }

def create_data_loader(df, tokenizer, batch_size, shuffle=True):
    """Create a data loader from a dataframe"""
    dataset_size = len(df)
    indices = np.arange(dataset_size)
    
    def batch_generator():
        if shuffle:
            np.random.shuffle(indices)
        
        for start_idx in range(0, dataset_size, batch_size):
            batch_indices = indices[start_idx:start_idx + batch_size]
            yield create_batch(df, tokenizer, batch_indices)
    
    return batch_generator

In [5]:
# setup tjhe model
def create_model():
    bert = BertModel.from_pretrained('bert-base-uncased')
    classifier = nn.Linear(bert.config.hidden_size, 2)
    dropout = nn.Dropout(0.3)
    
    def forward(input_ids, attention_mask):
        outputs = bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = dropout(pooled_output)
        return classifier(pooled_output)
    
    return forward, [bert, classifier, dropout]

def get_parameters(model_parts):
    params = []
    for part in model_parts:
        params.extend(part.parameters())
    return params

In [7]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
import torch

def load_and_balance_data(data_path):
    """Load and balance the dataset using RandomOverSampler."""
    data = pd.read_csv(data_path)
    ros = RandomOverSampler()
    X = data[['Claim', 'Evidence']]
    y = data['label']
    X_resampled, y_resampled = ros.fit_resample(X, y)
    balanced_data = pd.concat([X_resampled, y_resampled], axis=1)
    return balanced_data

def get_data_item(data, tokenizer, idx, max_length=512):
    """Get a single data item from the balanced dataset."""
    claim = str(data.iloc[idx]['Claim'])
    evidence = str(data.iloc[idx]['Evidence'])
    label = data.iloc[idx]['label']
    
    claim = preprocess_text(claim)
    evidence = preprocess_text(evidence)
    
    encoding = tokenizer.encode_plus(
        claim,
        evidence,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'label': torch.tensor(label, dtype=torch.long)
    }

In [8]:
#hardware stuff, ran locally. not sure how well this worked
def setup_device():
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(f"Using device: {device}")
    return device

In [9]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
def evaluate_model(forward_func, data_generator, device):
    predictions = []
    actual_labels = []
    
    for batch in data_generator():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        with torch.no_grad():
            outputs = forward_func(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
        
        predictions.extend(preds.cpu().numpy())
        actual_labels.extend(labels.cpu().numpy())
        
        if device.type == 'mps':
            torch.mps.empty_cache()
    
    return {
          'accuracy': accuracy_score(actual_labels, predictions),
           'f1': f1_score(actual_labels, predictions),
           'predictions': predictions,
           'actual_labels': actual_labels
    }



def plot_confusion_matrix(y_true, y_pred, labels):
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.show()

def train_model(forward_func, model_parts, train_generator, val_generator, device, 
                num_epochs=3, learning_rate=2e-5, train_batches=None, val_batches=None):
    optimizer = torch.optim.AdamW(get_parameters(model_parts), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    best_val_f1 = 0
    history = {'train_loss': [], 'val_accuracy': [], 'val_f1': []}
    
    for epoch in range(num_epochs):
        #training
        train_losses = []
        for batch in tqdm(train_generator(), total=train_batches):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            outputs = forward_func(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(get_parameters(model_parts), max_norm=1.0)
            optimizer.step()
            
            train_losses.append(loss.item())
            
            if device.type == 'mps':
                torch.mps.empty_cache()
        
        #evals
        val_metrics = evaluate_model(forward_func, val_generator, device)
        plot_confusion_matrix(val_metrics['actual_labels'], val_metrics['predictions'], labels=[0, 1])
        
        #ledger
        history['train_loss'].append(np.mean(train_losses))
        history['val_accuracy'].append(val_metrics['accuracy'])
        history['val_f1'].append(val_metrics['f1'])
        
        print(f'Epoch {epoch + 1}/{num_epochs}:')
        print(f'Average Train Loss: {np.mean(train_losses):.4f}')
        print(f'Validation Accuracy: {val_metrics["accuracy"]:.4f}')
        print(f'Validation F1-Score: {val_metrics["f1"]:.4f}')
        
        if val_metrics['f1'] > best_val_f1:
            best_val_f1 = val_metrics['f1']
            #save model
            torch.save([part.state_dict() for part in model_parts], 'best_model.pt')
            print("Saved new best model!")
    
    return history

In [10]:
import matplotlib.pyplot as plt

def plot_training_history(history):
    """Plot training metrics"""
    
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    
    ax1.plot(history['train_loss'])
    ax1.set_title('Training Loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    
    ax2.plot(history['val_accuracy'], label='Accuracy')
    ax2.plot(history['val_f1'], label='F1-Score')
    ax2.set_title('Validation Metrics')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Score')
    ax2.legend()
    
    plt.tight_layout()
    plt.show()

In [11]:
device = setup_device()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_df = load_dataset('training_data/ED/train.csv')
val_df = load_dataset('training_data/ED/dev.csv')


Using device: mps
Loaded 21508 samples from training_data/ED/train.csv
Loaded 5926 samples from training_data/ED/dev.csv


In [13]:

#toned it down a bit here
batch_size = {'train': 8, 'val': 16}
#train_dataset = BalancedEvidenceDetectionDataset('training_data/ED/train.csv', tokenizer)

train_generator = create_data_loader(train_df, tokenizer, batch_size['train'])
val_generator = create_data_loader(val_df, tokenizer, batch_size['val'], shuffle=False)


In [14]:

#calculate num  batches
train_batches = len(train_df) // batch_size['train']
val_batches = len(val_df) // batch_size['val']
print(train_batches)
print(val_batches)



forward_func, model_parts = create_model()
for part in model_parts:
    part.to(device)
 
#some hyper params
config = {
    'num_epochs': 3,
    'learning_rate': 2e-5
}


    
history = train_model(
        forward_func, 
        model_parts,
        train_generator,
        val_generator,
        device,
        num_epochs=config['num_epochs'],
        learning_rate=config['learning_rate'],
        train_batches=train_batches,
        val_batches=val_batches
    )
    
plot_training_history(history)


2688
370


 21%|██        | 566/2688 [06:48<25:31,  1.39it/s]


KeyboardInterrupt: 

Inference

In [17]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
from torch import nn


#rewrote some code from above so that I didnt have to train the whole thing again

def load_and_preprocess_data(file_path, tokenizer, max_length=512):
    data = pd.read_csv(file_path)
    input_ids_list = []
    attention_mask_list = []
    
    for _, row in data.iterrows():
        claim = preprocess_text(str(row['Claim']))
        evidence = preprocess_text(str(row['Evidence']))
        
        encoding = tokenizer.encode_plus(
            claim,
            evidence,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        input_ids_list.append(encoding['input_ids'].flatten())
        attention_mask_list.append(encoding['attention_mask'].flatten())
    
    return TensorDataset(torch.stack(input_ids_list), torch.stack(attention_mask_list))

def run_inference(forward_func, data_loader, device):
    predictions = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            
            outputs = forward_func(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().numpy())
    
    return predictions

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Prepare the test data
test_dataset = load_and_preprocess_data('test.csv', tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

#load  model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
forward_func, model_parts = create_model()

state_dicts = torch.load('best_model.pt')
model_parts[0].load_state_dict(state_dicts[0]) 
model_parts[1].load_state_dict(state_dicts[1])  

for part in model_parts:
    part.to(device)

predictions = run_inference(forward_func, test_loader, device)

#save predictions
predictions_df = pd.DataFrame(predictions, columns=['label'])
predictions_df.to_csv('predictions.csv', index=False)

In [19]:
#makes sure the test.csv and predictions.csv have the same number of rows
test_df = pd.read_csv('test.csv')
predictions_df = pd.read_csv('predictions.csv')

print(f"num rows test: {len(test_df)}")
print(f"num rows predictions: {len(predictions_df)}")

Number of rows in test.csv: 50
Number of rows in predictions.csv: 50
