In [79]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
import json
import os
import torch

# Download stopwords
nltk.download('stopwords')
nltk.download('punkt')

# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU instead")

GPU not available, using CPU instead


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##### dataset

In [85]:
def load_jsonl(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return pd.DataFrame(data)

train_data = load_jsonl('echr/train.jsonl')
dev_data = load_jsonl('echr/dev.jsonl')
test_data = load_jsonl('echr/test.jsonl')

train_data.head(1)

Unnamed: 0,case_id,case_no,title,judgment_date,facts,applicants,defendants,allegedly_violated_articles,violated_articles,court_assessment_references,silver_rationales,gold_rationales
0,001-59587,25702/94,CASE OF K. AND T. v. FINLAND,2001-07-12,[11. At the beginning of the events relevant ...,"[K., T.]",[FINLAND],"[13, 8]",[8],"{'8': ['12', '140', '155', '156', '157', '158'...","[1, 13, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30...",[]


In [82]:
import multiprocessing as mp
from tqdm.notebook import tqdm
import numpy as np

num_cores = mp.cpu_count()
print(f"Detected {num_cores} CPU cores.")

# Preprocessing function
def preprocess_text_legacy(text):
    if isinstance(text, list):
        text = ' '.join(text)  # Convert list to string
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(tokens)

# Simplified preprocessing function for debugging
def preprocess_text(text):
    if isinstance(text, list):
        text = ' '.join(text)  # Convert list to string
    # text = text.lower()  # Convert to lowercase
    return text

# Function to apply preprocessing in parallel
def parallel_apply(df, func, num_cores=num_cores):
    df_split = np.array_split(df, num_cores)
    pool = mp.Pool(num_cores)
    df = pd.concat(tqdm(pool.imap(func, df_split), total=len(df_split)))
    pool.close()
    pool.join()
    return df

# Wrap the preprocessing function to handle DataFrame input
def preprocess_dataframe(df):
    df['processed_text'] = df['facts'].apply(preprocess_text)
    return df

Detected 2 CPU cores.


  return bound(*args, **kwds)


  0%|          | 0/2 [00:01<?, ?it/s]

In [90]:
from sklearn.model_selection import train_test_split

# Apply preprocessing in parallel
data = pd.concat([train_data, dev_data, test_data])

data = parallel_apply(data, preprocess_dataframe)
data['violated_articles_str'] = data['violated_articles'].apply(lambda x: ','.join(x))

train_df, test_df = data[:len(train_data)], data[len(train_data) : len(dev_data) + len(test_data)]

  return bound(*args, **kwds)


  0%|          | 0/2 [00:00<?, ?it/s]

In [91]:
# Convert labels to integers
label_map = {label: idx for idx, label in enumerate(data['violated_articles_str'].unique())}
train_df['labels'] = train_df['violated_articles_str'].map(label_map)
test_df['labels'] = test_df['violated_articles_str'].map(label_map)

# Verify the label mapping
print("Label Map:", label_map)
print("Train Labels Unique:", train_df['labels'].unique())
print("Test Labels Unique:", test_df['labels'].unique())

Label Map: {'8': 0, '': 1, '6': 2, '10': 3, '6,10': 4, '5,2,13': 5, '8,3': 6, '5': 7, '6,5': 8, '6,13': 9, '11': 10, 'P1-1': 11, '8,13': 12, '6,5,3': 13, 'P1-1,6': 14, '9,13': 15, '6,8': 16, '3': 17, 'P4-4,5,13': 18, '6,8,13': 19, '2,13': 20, 'P1-3': 21, '2': 22, 'P1-1,14': 23, 'P7-4': 24, '5,2,13,3': 25, '5,3,8,2,13': 26, '8,14': 27, '8,5,13': 28, '13,3': 29, '34,6,3': 30, '12,8,13': 31, '13': 32, '9': 33, '7': 34, 'P7-2': 35, '8,5': 36, '13,11': 37, '5,7': 38, '8,10': 39, '6,13,3': 40, '5,3': 41, '6,3': 42, '2,38,3': 43, '9,8,3': 44, '8,13,3': 45, '34': 46, 'P4-4,5,13,3': 47, 'P1-2,13,14': 48, 'P1-1,6,8,P4-2': 49, '6,8,5': 50, 'P1-1,13': 51, '14': 52, '34,3': 53, '5,10': 54, '8,5,3': 55, '6,P4-2': 56, '6,10,3': 57, '6,5,10,3,13': 58, '34,5,3': 59, '5,P1-1,3,2,13': 60, '8,5,2,3': 61, '6,5,38,3,8,2': 62, '18,5': 63, 'P1-1,8,13,3': 64, '5,13,3': 65, 'P1-3,14': 66, 'P1-1,6,13': 67, 'P1-1,8,13': 68, 'P1-3,P4-2': 69, '34,5,2,13': 70, '6,5,34,8,13': 71, 'P1-1,2,13': 72, '5,13': 73, '6,5,13'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['labels'] = train_df['violated_articles_str'].map(label_map)


In [None]:
# Load pre-trained model and tokenizer
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from tqdm.notebook import tqdm

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map))

# Tokenize the input data with a progress bar and max_length
max_length = 512
batch_size = 1000  # Define batch size for tokenization

def batch_tokenize(texts, tokenizer, batch_size, max_length):
    encodings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        encodings.extend(tokenizer(batch, truncation=True, padding=True, max_length=max_length)['input_ids'])
    return encodings

train_texts = list(train_df['processed_text'])
test_texts = list(test_df['processed_text'])

train_encodings = batch_tokenize(train_texts, tokenizer, batch_size, max_length)
test_encodings = batch_tokenize(test_texts, tokenizer, batch_size, max_length)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/9 [00:00<?, ?it/s]

In [None]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]).long() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).long()
        return item

    def __len__(self):
        return len(self.labels)

train_labels = list(train_df['labels'])
test_labels = list(test_df['labels'])

train_dataset = Dataset({'input_ids': train_encodings}, train_labels)
test_dataset = Dataset({'input_ids': test_encodings}, test_labels)

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Check the device and the tensor shapes
print("Device:", device)
print("Train Dataset Sample:", train_dataset[0])
print("Test Dataset Sample:", test_dataset[0])

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # number of training epochs
    per_device_train_batch_size=4,   # batch size for training
    per_device_eval_batch_size=4,    # batch size for evaluation
    warmup_steps=10,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=2,                 # reduce logging steps for quicker feedback
    report_to=[],                    # Disable W&B logging
)

trainer = Trainer(
    model=model.to(device),           # the instantiated 🤗 Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=test_dataset         # evaluation dataset
)

# Print model and dataset information
print("Model:", model)
print("Training Arguments:", training_args)

trainer.train()

In [None]:
# Reinitialize the Trainer to ensure the internal state is correct
trainer = Trainer(
    model=model.to(device),           # the instantiated 🤗 Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=test_dataset         # evaluation dataset
)

# Define the compute_metrics function
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Evaluate the model
results = trainer.evaluate(eval_dataset=test_dataset)
print("Evaluation Results:", results)