# Loading Data

In [21]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [23]:
# Read the three dataset splits from JSON files in the working directory,
# in train / val / test order.
train_df, val_df, test_df = (
    pd.read_json(split_path)
    for split_path in ('train.json', 'val.json', 'test.json')
)

# Preprocessing Data

In [25]:
# Function to concatenate references
def concatenate_references(row):
    """Join the cited-reference abstracts of one record into a single string.

    The record is expected to expose a mapping-style ``get`` (a pandas row
    Series or a plain dict). The number of references is read from
    ``'num_references'`` and the abstracts from the keys
    ``'ref_abstract.cite_<i>.abstract'`` for ``i`` in ``range(num_references)``.

    Missing data is tolerated rather than raising:
      * a missing, None, NaN or otherwise non-integer ``num_references``
        counts as zero references;
      * reference cells that are absent, not strings (None / NaN), or blank
        are skipped, so they neither break ``str.join`` nor introduce
        doubled spaces.

    Args:
        row: One record with the keys described above.

    Returns:
        str: The non-blank abstracts joined by single spaces, stripped of
        leading/trailing whitespace; ``''`` when there is nothing to join.
    """
    try:
        num_refs = int(row.get('num_references', 0))
    except (TypeError, ValueError, OverflowError):
        # None -> TypeError, NaN / non-numeric text -> ValueError, inf -> OverflowError.
        num_refs = 0

    # row.get() without a default yields None for absent keys, which the
    # isinstance filter below discards together with NaN placeholders.
    candidates = (
        row.get(f'ref_abstract.cite_{i}.abstract') for i in range(num_refs)
    )
    references = [
        text for text in candidates if isinstance(text, str) and text.strip()
    ]
    return ' '.join(references).strip()

In [27]:
# Add a 'references' column to each split by running concatenate_references
# over every row (axis=1). All three columns are computed first, then
# assigned to their respective frames.
train_df['references'], val_df['references'], test_df['references'] = (
    frame.apply(concatenate_references, axis=1)
    for frame in (train_df, val_df, test_df)
)

# BERT Tokenization

In [30]:
# Sequence length passed as max_length to every tokenizer call below.
MAX_LENGTH = 512
# Tokenizer loaded from the "bert-base-uncased" pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to tokenize text
def tokenize_function(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The text is padded to, and truncated at, ``MAX_LENGTH`` and the encoding
    is requested with ``return_tensors="pt"``.

    Args:
        text: Input forwarded unchanged as the tokenizer's first argument.

    Returns:
        Whatever the tokenizer call returns (presumably a batch encoding
        whose tensors have a leading batch dimension of 1 -- TODO confirm).
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
        "return_tensors": "pt",
    }
    return tokenizer(text, **encode_options)

# Custom Dataset Class
class ReferenceDataset(Dataset):
    """Dataset that tokenizes one dataframe row per item, on demand.

    Each item encodes the row's ``'abstract'`` and ``'references'`` columns
    with ``tokenize_function`` and returns a dict containing:

      * ``"input_ids"`` / ``"attention_mask"`` -- from the abstract encoding;
      * ``"labels"`` -- the ``input_ids`` of the references encoding.

    Every tensor has its leading dimension removed with ``squeeze(0)``.

    NOTE(review): ``labels`` are the raw token ids of the references,
    including whatever ids the tokenizer emits for padding -- confirm that
    the downstream model and loss expect labels in this form.
    """

    def __init__(self, dataframe):
        # Keep a reference to the caller's frame; rows are read lazily.
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        record = self.data.iloc[index]

        # Tokenization happens on every access; nothing is cached.
        source_encoding = tokenize_function(record['abstract'])
        target_encoding = tokenize_function(record['references'])

        sample = {
            field: source_encoding[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        # Treat references as labels
        sample["labels"] = target_encoding["input_ids"].squeeze(0)
        return sample

# Wrap each split's dataframe in a ReferenceDataset.
train_dataset, val_dataset, test_dataset = (
    ReferenceDataset(frame) for frame in (train_df, val_df, test_df)
)

# Batch the datasets; only the training loader shuffles its samples.
BATCH_SIZE = 8
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]