In [1]:
%%capture
!pip install --upgrade transformers bitsandbytes accelerate peft datasets wandb contractions spacy tqdm

In [2]:
import torch
import wandb
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset, DatasetDict
import numpy as np
import re
from sklearn.metrics import precision_score, f1_score, recall_score, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from contractions import fix
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map 

# Login to Weights & Biases (W&B) - Required in Kaggle
wandb.login(key="e772770782e92af492a82e59b3168d7f3d22258c")  # Replace with your actual API key
wandb.init(project="Spring2025")


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdhruvjoshi892000[0m ([33mdhruvjoshi892000-pace-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Ensure GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} for training")

Using cuda for training


In [14]:
import spacy

# Load spaCy model with GPU support
spacy.require_gpu()  # Ensure spaCy runs on GPU
nlp = spacy.load("en_core_web_sm")

# Verify if GPU is used
print("spaCy is using GPU:", spacy.prefer_gpu())


# Load spaCy's English NLP model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """
    Preprocess legal text for analysis using spaCy.
    Steps:
    - Lowercasing
    - Removing HTML tags
    - Removing URLs
    - Expanding contractions
    - Removing special characters
    - Removing numbers
    - Removing stop words
    - Lemmatization using spaCy
    - Removing extra whitespace
    """
    # Lowercase the text
    text = text.lower()
    
    # Remove HTML tags and URLs
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    
    # Expand contractions (optional)
    try:
        text = fix(text)
    except ImportError:
        pass  # Skip if the 'contractions' package is not available
    
    # Remove special characters and numbers (excluding spaces)
    text = re.sub(r'[^a-z\s]', '', text)

    # Process text using spaCy
    doc = nlp(text)
    
    # Lemmatize and remove stopwords
    text = ' '.join([token.lemma_ for token in doc if not token.is_stop])

    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text


# Define tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Load dataset from CSV
df = pd.read_csv("/kaggle/input/labels-web-of-law/15_labels_data.csv")  # Adjust file path if needed

# Apply preprocessing
# df = df[:100]
# df["text"] = df["text"].apply(preprocess_text)
processed_texts = [preprocess_text(text) for text in tqdm(df['text'], desc="Preprocessing texts")]

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split dataset into train and validation
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
eval_dataset = dataset['test']


# Load tokenizer
model_name = "bert-base-uncased"  # Modify as needed
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)



spaCy is using GPU: True


Preprocessing texts: 100%|██████████| 8417/8417 [34:38<00:00,  4.05it/s]  


Map:   0%|          | 0/7575 [00:00<?, ? examples/s]

Map:   0%|          | 0/842 [00:00<?, ? examples/s]

In [5]:
# import spacy
# import re
# import pandas as pd
# from tqdm.contrib.concurrent import process_map  # Parallel processing
# from datasets import Dataset
# from transformers import AutoTokenizer
# from tqdm import tqdm

# # Load spaCy model once (required for multiprocessing)
# nlp = spacy.load("en_core_web_sm")

# def preprocess_text(text):
#     """
#     Preprocess legal text for analysis using spaCy.
#     Steps:
#     - Lowercasing
#     - Removing HTML tags
#     - Removing URLs
#     - Expanding contractions
#     - Removing special characters
#     - Removing numbers
#     - Removing stop words
#     - Lemmatization using spaCy
#     - Removing extra whitespace
#     """
#     # Lowercase the text
#     text = text.lower()
    
#     # Remove HTML tags and URLs
#     text = re.sub(r'<.*?>', '', text)
#     text = re.sub(r'http\S+|www\.\S+', '', text)
    
#     # Expand contractions (optional)
#     try:
#         from contractions import fix
#         text = fix(text)
#     except ImportError:
#         pass  # Skip if the 'contractions' package is not available
    
#     # Remove special characters and numbers (excluding spaces)
#     text = re.sub(r'[^a-z\s]', '', text)

#     # Process text using spaCy
#     doc = nlp(text)
    
#     # Lemmatize and remove stopwords
#     text = ' '.join([token.lemma_ for token in doc if not token.is_stop])

#     # Remove extra whitespace
#     text = ' '.join(text.split())
    
#     return text


# # Load dataset from CSV
# df = pd.read_csv("/kaggle/input/labels-web-of-law/15_labels_data.csv")  # Adjust file path if needed

# # Apply preprocessing in parallel
# processed_texts = process_map(preprocess_text, df['text'], max_workers=4, chunksize=2)

# # Assign processed text back to DataFrame
# df["text"] = processed_texts

# # Convert DataFrame to Hugging Face Dataset
# dataset = Dataset.from_pandas(df)

# # Split dataset into train and validation
# dataset = dataset.train_test_split(test_size=0.1, seed=42)
# train_dataset = dataset['train']
# eval_dataset = dataset['test']


# # Load tokenizer
# model_name = "bert-base-uncased"  # Modify as needed
# tokenizer = AutoTokenizer.from_pretrained(model_name)


# # Tokenization function optimized for Hugging Face Datasets
# def tokenize_function(examples):
#     return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# # Apply tokenization in parallel (using batched=True)
# train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=4)
# eval_dataset = eval_dataset.map(tokenize_function, batched=True, num_proc=4)



In [6]:
# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [7]:
# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=15).to(device)

# Adjusted LoRA Configuration for Long-Form Text Classification
lora_config = LoraConfig(
    r=16,  # Lower rank to reduce memory usage while maintaining expressiveness
    lora_alpha=32,  # Balanced scaling factor to control adaptation
    target_modules=["query", "key", "value", "dense"],  # Apply LoRA to key attention layers and dense layers for better classification performance
    lora_dropout=0.08,  # Slightly higher dropout to prevent overfitting on longer sequences
    bias="none"
)
# Integrate LoRA with the model
model = get_peft_model(model, lora_config).to(device)
model.print_trainable_parameters()


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 2,678,784 || all params: 112,172,559 || trainable%: 2.3881


In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    accuracy = accuracy_score(labels, predictions)
    balanced_acc = balanced_accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='weighted', zero_division=0)
    recall = recall_score(labels, predictions, average='weighted', zero_division=0)
    f1 = f1_score(labels, predictions, average='weighted', zero_division=0)
    return {
        "accuracy": accuracy,
        "balanced_accuracy": balanced_acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


In [9]:
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="/kaggle/working/logs",
    label_names=["labels"],
    logging_steps=10,
    num_train_epochs=20,  # More epochs for better legal text learning
    per_device_train_batch_size=4,  # Lower batch size for stability
    per_device_eval_batch_size=4,
    learning_rate=5e-5,  # Reduce LR for better fine-tuning
    warmup_ratio=0.1,  # Stabilize early training
    weight_decay=0.01,
    metric_for_best_model="balanced_accuracy",
    load_best_model_at_end=True,
    report_to=["wandb"],  # Enable W&B logging
    fp16=torch.cuda.is_available(),  # Use FP16 if GPU is available
)




In [11]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)


Epoch,Training Loss,Validation Loss,Accuracy,Balanced Accuracy,Precision,Recall,F1
1,1.1916,1.068802,0.692399,0.383437,0.612604,0.692399,0.646472
2,0.7786,1.030221,0.698337,0.383157,0.609177,0.698337,0.644318
3,0.737,0.920015,0.71734,0.396027,0.644027,0.71734,0.673844
4,0.8568,0.918623,0.719715,0.436644,0.670367,0.719715,0.678244
5,0.7407,0.82068,0.745843,0.487685,0.728329,0.745843,0.730657
6,0.8505,0.878753,0.748219,0.520808,0.735609,0.748219,0.730262
7,0.5303,0.855161,0.760095,0.548244,0.750444,0.760095,0.745639
8,0.4758,0.849184,0.749406,0.526977,0.745602,0.749406,0.740669
9,0.4754,0.899486,0.744656,0.536913,0.750724,0.744656,0.742184
10,0.4445,0.884398,0.755344,0.585743,0.749519,0.755344,0.750659




{'eval_loss': 1.085618019104004, 'eval_accuracy': 0.7672209026128266, 'eval_balanced_accuracy': 0.6092895255746184, 'eval_precision': 0.7573317704097123, 'eval_recall': 0.7672209026128266, 'eval_f1': 0.761825570253718, 'eval_runtime': 21.4198, 'eval_samples_per_second': 39.309, 'eval_steps_per_second': 4.949, 'epoch': 20.0}


In [13]:
# Function to upload model to Hugging Face Hub
def upload_to_huggingface(trainer, model_name, hf_token):
    """Upload model to Hugging Face Model Hub."""
    login(token=hf_token)
    trainer.model.push_to_hub(model_name)
    tokenizer.push_to_hub(model_name)
    print(f"Model uploaded to Hugging Face Hub: https://huggingface.co/{model_name}")


hf_token = "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"  # Set your HF token here
upload_to_huggingface(trainer, model_name, hf_token)


NameError: name 'login' is not defined