In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
 
import os
# Kaggle input locations: a folder whose files are each parsed as JSON below,
# and a single JSON labels file.
input_data_folder = '/kaggle/input/input-data-longformer/input_longformer'
input_labels_file = '/kaggle/input/coliee-24-labels-longformer/full_data_labels.json'

# Map each file name's stem (the text before its first '.') to a flattened
# string: the 'meta' field followed by every entry of 'paragraphs', each piece
# terminated by a single space.
data_fulltext = {}
for case_file in os.listdir(input_data_folder):
    with open(f"{input_data_folder}/{case_file}", mode='r', encoding='utf-8') as handle:
        case_json = json.load(handle)
    pieces = [case_json['meta'], *case_json['paragraphs']]
    data_fulltext[case_file.split('.')[0]] = ' '.join(pieces) + ' '

# Parse the labels file as-is; its structure is not inspected in this cell.
with open(input_labels_file, mode='r', encoding='utf-8') as handle:
    input_labels = json.load(handle)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import transformers
import torch
import torch.nn as nn
import transformers
from transformers import DataCollatorForLanguageModeling
from transformers import LongformerModel, LongformerTokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

In [None]:
# Select the compute device: CUDA when a GPU is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Cell output: name of GPU 0, or a plain notice on CPU-only machines.
# torch.cuda.get_device_name(0) raises when no CUDA device exists, so it is
# only queried when CUDA is actually available.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu (no CUDA device available)"

In [None]:
# Words and punctuation tokens to drop from case texts. Order and repeated
# entries are kept exactly as originally written.
list_skipped_words = [
    'should', 'did', 'must', 'just',
    '.', '..', '...',
    'the', 'a', 'an',
    'in', 'on', 'at', 'to', 'of', 'for', 'with', 'by',
    'and', 'or', 'but', 'so', 'nor', 'yet',
    'from', 'into', 'onto', 'upon', 'out', 'off', 'over', 'under', 'below', 'above',
    'between', 'among', 'through', 'during', 'before', 'after', 'since', 'until',
    'while', 'as', 'like', 'about', 'against',
    'among', 'around', 'before', 'behind', 'beneath', 'beside', 'between', 'beyond',
    'during', 'inside', 'outside', 'underneath', 'within', 'without', 'throughout',
    'along', 'across', 'toward', 'towards',
    'up', 'down', 'forward', 'backward', 'right', 'left',
    'here', 'there', 'where', 'when', 'why', 'how', 'what', 'which',
    'who', 'whom', 'whose',
    'whichever', 'whatever', 'whomever', 'whenever', 'wherever', 'however', 'whyever',
    ',', ';',
]
# Load the pretrained Longformer encoder and its tokenizer.
# Both are resolved from a single checkpoint id so the model and tokenizer
# cannot accidentally be pointed at different checkpoints.
PRETRAINED_CHECKPOINT = 'allenai/longformer-base-4096'
longformer_model = LongformerModel.from_pretrained(PRETRAINED_CHECKPOINT)
longformer_tokenizer = LongformerTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

In [None]:
# Ask the encoder to return per-layer hidden states with its outputs;
# CustomModel.forward reads `hidden_states[-1]` from the encoder output.
longformer_model.config.output_hidden_states = True

In [None]:
import pandas as pd
class ColieeDataset(Dataset):
    """Text pairs with a label, tokenized lazily on item access.

    On construction the dataset reads a labels JSON file and every JSON file
    in ``folder_path``, builds one example per (outer case id, nested case id)
    entry of the labels, splits the examples 80/20 with a fixed seed, and
    keeps either the training or the validation part.

    Args:
        folder_path: Directory whose files are each parsed as JSON with a
            ``'meta'`` string and a ``'paragraphs'`` list of strings.
        tokenizer: Callable invoked as ``tokenizer(text1, text2, padding=...,
            truncation=..., max_length=...)`` returning a mapping of features.
        max_len: Maximum sequence length passed to the tokenizer.
        labels_file: Path to a JSON file shaped like
            ``{case_id: {nested_case_id: label, ...}, ...}``.
        list_skipped_words: Iterable of lowercase tokens removed from the text.
        val_set: If true, expose the validation part of the split; otherwise
            the training part.
    """

    def __init__(self, folder_path, tokenizer, max_len, labels_file, list_skipped_words,
                 val_set):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.folder_path = folder_path
        self.is_val = val_set
        self.useless_words = set(list_skipped_words)

        # Read the labels from file, then load the full-text data.
        # UTF-8 is requested explicitly so parsing does not depend on the
        # platform's default encoding.
        with open(labels_file, 'r', encoding='utf-8') as f:
            self.labels = json.load(f)
        self.case_law = self.load_fulltext()
        train_examples, val_examples = self.split_data()
        self.data = val_examples if self.is_val else train_examples

    def _build_examples(self):
        """Return one ``{'text1', 'text2', 'label'}`` dict per usable label entry.

        Entries are skipped when either case id has no loaded text, so a label
        that references a missing file does not raise ``KeyError``.
        """
        examples = []
        for case_id, nested_labels in self.labels.items():
            if case_id not in self.case_law:
                continue
            for nested_case_id, _label in nested_labels.items():
                if nested_case_id not in self.case_law:
                    continue
                examples.append({
                    'text1': self.case_law[case_id],
                    'text2': self.case_law[nested_case_id],
                    'label': _label})
        return examples

    def split_data(self):
        """Split all examples 80/20 (seed 42) and return ``(train_df, val_df)``."""
        examples = self._build_examples()
        train_examples, val_examples = train_test_split(examples, test_size=0.2, random_state=42)

        # Both frames get a fresh 0..n-1 index so `.iloc[idx]` and `len()` line up.
        train_df = pd.DataFrame(train_examples).reset_index(drop=True)
        val_df = pd.DataFrame(val_examples).reset_index(drop=True)
        return train_df, val_df

    def __len__(self):
        """Number of examples in the selected (train or validation) part."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its features as tensors."""
        item = dict(self.data.iloc[idx])
        encoded = self.tokenize_text_pair(item)  # truncation happens inside
        # 'labels' is already a tensor; re-wrapping it with torch.tensor() would
        # emit a copy-construct warning, so only non-tensor values are converted.
        return {
            key: val if isinstance(val, torch.Tensor) else torch.tensor(val)
            for key, val in encoded.items()
        }

    def load_fulltext(self):
        """Read every file in ``folder_path`` and return ``{case_id: filtered text}``.

        The case id is the file name up to its first ``'.'``. The text is the
        ``'meta'`` field followed by all paragraphs, joined by spaces and then
        passed through ``filter_useless_words``.
        """
        _case_law = {}
        for filename in os.listdir(self.folder_path):
            with open(os.path.join(self.folder_path, filename), 'r', encoding='utf-8') as f:
                temp_data = json.load(f)
            case_id = filename.split('.')[0]
            fulltext = ' '.join([temp_data['meta'], *temp_data['paragraphs']])
            _case_law[case_id] = self.filter_useless_words(fulltext)
        return _case_law

    def filter_useless_words(self, fulltext):
        """Drop whitespace-separated tokens whose lowercase form is in ``useless_words``.

        This shortens the case text before tokenization. The remaining tokens
        are re-joined with single spaces.
        """
        words = fulltext.split()
        filtered_words = [word for word in words if word.lower() not in self.useless_words]
        return ' '.join(filtered_words)

    def tokenize_text_pair(self, item):
        """Tokenize ``item['text1']``/``item['text2']`` as a pair and attach the label.

        The pair is padded and truncated to ``self.max_len``. Passing the limit
        explicitly makes the configured length take effect instead of relying
        on the tokenizer's own default maximum.
        """
        inputs = self.tokenizer(item['text1'], item['text2'],
                                padding='max_length', truncation=True,
                                max_length=self.max_len)

        inputs['labels'] = torch.tensor(item['label'])
        return inputs

In [None]:
# Training part of the split (val_set=False).
train_dataset = ColieeDataset(
    folder_path=input_data_folder,
    tokenizer=longformer_tokenizer,
    max_len=4096,
    labels_file=input_labels_file,
    list_skipped_words=list_skipped_words,
    val_set=False,
)
# Validation part of the same split (val_set=True), used for evaluation.
test_dataset = ColieeDataset(
    folder_path=input_data_folder,
    tokenizer=longformer_tokenizer,
    max_len=4096,
    labels_file=input_labels_file,
    list_skipped_words=list_skipped_words,
    val_set=True,
)

In [None]:
# Batches of 6 examples: reshuffled each epoch for training, fixed order for evaluation.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=6)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=6)

In [None]:
# Number of training examples.
len(train_dataset)

In [None]:
# Inspect one tokenized training example (index 1).
train_dataset[1]

In [None]:
class CustomModel(nn.Module):
    """Encoder followed by a two-layer feed-forward head producing 2 logits.

    Args:
        longformer_model: Encoder module exposing ``config.hidden_size`` and
            returning an output object with a ``hidden_states`` sequence.
        ffn_output_size: Width of the hidden layer inside the head.
    """

    def __init__(self, longformer_model, ffn_output_size):
        super().__init__()
        self.longformer = longformer_model
        encoder_width = self.longformer.config.hidden_size
        self.ffn = nn.Sequential(
            nn.Linear(encoder_width, ffn_output_size),
            nn.ReLU(),
            nn.Linear(ffn_output_size, 2),  # two logits: class 0 / class 1
        )

    def forward(self, **inputs):
        """Run the encoder on ``inputs`` and classify from the first token.

        All keyword arguments are forwarded to the encoder unchanged. The
        encoder output must carry ``hidden_states``; the last entry is used
        and its position-0 vector is fed to the head.

        Returns:
            Tensor of logits with 2 values per batch element.
        """
        encoder_out = self.longformer(**inputs)
        final_layer = encoder_out.hidden_states[-1]
        # Position 0 along the sequence axis acts as the pooled representation.
        first_token = final_layer[:, 0, :]
        return self.ffn(first_token)

# Training hyperparameters.
# NOTE(review): the dataset and DataLoader cells pass literal values (4096 and
# 6) rather than max_len / batch_size — confirm which values are intended.
max_len, batch_size = 4096, 4
epochs, lr = 3, 2e-5

In [None]:
# Build the classifier around the pretrained encoder and move it to the target device.
model = CustomModel(longformer_model, ffn_output_size=100).to(device)

# Cross-entropy loss over the model's logits; AdamW updates every model parameter.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)



In [None]:
# def test():
#     tmp_model =  model.to(torch.device('cpu'))
#     text1 = 'Hello my name is DCM'
#     text2 = 'I'm fine thank you kinchana'
#     label = torch.tensor([1])
#     test_input_ids = longformer_tokenizer.encode_plus(
#         text1,
#         text2,
#         add_special_tokens=True,
#         max_length=4096,
#         truncation=True,
#         padding='max_length',
#         return_token_type_ids=True,
#         return_attention_mask=True,
#         return_tensors='pt'
#     )

#     print(test_input_ids['input_ids'].shape)
#     print(test_input_ids['token_type_ids'].shape)
#     print(test_input_ids['attention_mask'].shape)
#     print(label.shape)
    
#     outputs = model(**test_input_ids)
    
#     print(outputs)
    

# test()

In [None]:
# train_subset = [next(iter(train_dataloader)) for _ in range(10)]
# test_subset = [next(iter(test_dataloader)) for _ in range(10)]

In [None]:
!pip install tqdm

In [None]:
import logging

# Raise the "transformers" logger threshold to WARNING so lower-severity
# messages from that logger are not shown.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def _split_batch(batch, target_device):
    """Move one collated batch to ``target_device`` and return ``(model_inputs, labels)``."""
    model_inputs = {
        'input_ids': batch['input_ids'].to(target_device),
        'attention_mask': batch['attention_mask'].to(target_device),
    }
    return model_inputs, batch['labels'].to(target_device)


# Train for `epochs` passes over the training loader. After every pass the
# model is evaluated on the held-out loader, metrics are printed, and a
# checkpoint of the model weights is written.
for epoch in range(epochs):  # was hard-coded to 3, ignoring the `epochs` setting
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs}', leave=False):
        optimizer.zero_grad()
        inputs, labels = _split_batch(batch, device)
        logits = model(**inputs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_dataloader)

    # Validation loop: no gradient tracking, predictions gathered for the metrics.
    model.eval()
    total_val_loss = 0
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc='Validation', leave=False):
            inputs, labels = _split_batch(batch, device)
            logits = model(**inputs)
            loss = criterion(logits, labels)
            total_val_loss += loss.item()
            # Predicted class = index of the larger of the two logits.
            predictions.extend(logits.argmax(dim=1).tolist())
            true_labels.extend(labels.tolist())
    avg_val_loss = total_val_loss / len(test_dataloader)

    # Calculate metrics. zero_division=0 reports 0 (without a warning) when a
    # metric's denominator is empty, e.g. no positive predictions yet.
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, zero_division=0)
    recall = recall_score(true_labels, predictions, zero_division=0)
    f1 = f1_score(true_labels, predictions, zero_division=0)

    print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss}, Val Loss: {avg_val_loss}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}')

    # Save model checkpoint (weights only) for this epoch.
    checkpoint_path = os.path.join('/kaggle/working', f"model_epoch_{epoch+1}.pt")
    torch.save(model.state_dict(), checkpoint_path)

print("Training finished.")


In [None]:
# Quick check that the kernel still responds.
print("hello")