In [11]:
# 3. Model Fine-Tuning
# Objective: Train NER models like XLM-Roberta.
# Libraries/Tools: transformers, datasets, torch, huggingface.

# Ensure transformers, datasets and torch are installed (AutoModelForTokenClassification needs PyTorch)
# %pip install transformers datasets torch

import os

import requests
from datasets import load_dataset, load_from_disk
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# Load dataset
# NOTE(review): the path ends in .conll but is parsed with the 'csv' builder; confirm the
# file really is delimited text whose header provides the columns used later in this cell.
dataset_path = r"../data/labeled_data.conll"  # Update this path to the dataset file path
sample_dataset_path = 'path_to_sample_dataset'  # Optional fallback directory; update or leave as-is
try:
    dataset = load_dataset('csv', data_files=dataset_path, split='train')
except FileNotFoundError as err:
    print(f"File not found at {dataset_path}. Please check the file path and ensure the file exists.")
    # Only fall back when a sample dataset directory actually exists; otherwise stop here
    # with a message naming both paths instead of a second, unrelated-looking failure
    # about the placeholder directory.
    if not os.path.isdir(sample_dataset_path):
        raise FileNotFoundError(
            f"Dataset file {dataset_path!r} is missing and no sample dataset directory "
            f"exists at {sample_dataset_path!r}."
        ) from err
    dataset = load_from_disk(sample_dataset_path)

# Quieten transformers logging and switch off certificate checks / warnings for downloads.
# NOTE(review): the empty CURL_CA_BUNDLE is meant to make `requests` skip TLS certificate
# verification, which leaves model downloads open to tampering — confirm this is really
# required (e.g. behind an intercepting proxy) and that the installed requests version
# still honours it.
os.environ.update({'TRANSFORMERS_VERBOSITY': 'error', 'CURL_CA_BUNDLE': ''})
requests.packages.urllib3.disable_warnings(
    category=requests.packages.urllib3.exceptions.InsecureRequestWarning
)

# Load pre-trained model and tokenizer
model_name = "xlm-roberta-base"
local_model_path = r"../data/xlm-roberta-base"  # Update this path to the local directory containing the model files

# Size of the classification head. label_list is not defined in this cell —
# presumably an earlier cell provides it; verify before running.
num_labels = len(label_list)

try:
    # First attempt is strictly offline (local_files_only=True) against the local directory.
    # trust_remote_code is intentionally not enabled: XLM-RoBERTa ships with transformers,
    # so there is no reason to allow Python code bundled with a checkpoint to execute.
    # The deprecated use_auth_token argument is dropped as well; no token is needed here.
    tokenizer = AutoTokenizer.from_pretrained(local_model_path, local_files_only=True)
    model = AutoModelForTokenClassification.from_pretrained(
        local_model_path,
        num_labels=num_labels,
        local_files_only=True,
    )
except OSError:
    # The local load failed with OSError: fetch the same checkpoint by name instead.
    print(f"Local model not found at {local_model_path}. Falling back to downloading from Hugging Face Hub.")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

# Tokenize dataset
def tokenize_and_align_labels(examples):
    """Tokenize a batch and expand its word-level labels to token level.

    Reads ``examples["text"]`` and ``examples[f"{label_column}_labels"]``; both
    ``tokenizer`` and ``label_column`` come from the enclosing module scope.
    Every token that maps to a word receives that word's label; tokens with no
    word id (``None``) receive -100.

    NOTE(review): the loop indexes the encoding by batch position, so this
    assumes ``examples`` is a batch (lists of rows) — confirm against the
    ``map`` call that uses it.

    Returns the tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(examples["text"], truncation=True, padding=True)

    aligned = []
    for row, word_labels in enumerate(examples[f"{label_column}_labels"]):
        token_labels = []
        for word_id in encoded.word_ids(batch_index=row):
            if word_id is None:
                # No originating word for this token (word id is None).
                token_labels.append(-100)
            else:
                token_labels.append(word_labels[word_id])
        aligned.append(token_labels)

    encoded["labels"] = aligned
    return encoded

# tokenize_and_align_labels iterates over the label column and indexes the encoding by
# batch position, so it must receive batches (lists of rows) rather than single rows.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Training
# Per-epoch evaluation is requested below, so the Trainer needs an evaluation set:
# hold out 10% of the tokenized data (fixed seed so the split is reproducible).
# NOTE(review): requires at least two rows; adjust test_size for very small datasets.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases name this argument `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    # Pads input ids and labels together to the longest sequence of each training batch,
    # so rows that were tokenized to different lengths can still be batched.
    data_collator=DataCollatorForTokenClassification(tokenizer),
)

trainer.train()



Local model not found at ../data/xlm-roberta-base. Falling back to downloading from Hugging Face Hub.


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ImportError: 
AutoModelForTokenClassification requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
