<a href="https://colab.research.google.com/github/BlenSeleshi/LLM/blob/feature%2Ftokenizer/fine_tuning2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
! pip install transformers datasets
!pip install datasets
!pip install pandas




In [None]:
from datasets import Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def read_conll_file(file_path):
    tokens = []
    labels = []
    current_tokens = []
    current_labels = []

    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:

                parts = line.split()
                if len(parts) >= 2:
                    token, label = parts[0], parts[1]
                    current_tokens.append(token)
                    current_labels.append(label)
                else:
                    print(f"Skipping malformed line: {line}")
            else:
                if current_tokens:
                    tokens.append(current_tokens)
                    labels.append(current_labels)
                    current_tokens = []
                    current_labels = []


    if current_tokens:
        tokens.append(current_tokens)
        labels.append(current_labels)

    return tokens, labels


# Read the CONLL file
tokens, labels = read_conll_file("merged_output.conll")

# Verify lengths
print(f"Number of token sequences: {len(tokens)}")
print(f"Number of label sequences: {len(labels)}")


if len(tokens) != len(labels):
    raise ValueError("The number of token sequences does not match the number of label sequences.")

max_length = max(len(label_seq) for label_seq in labels)
formatted_labels = [label_seq + ["O"] * (max_length - len(label_seq)) for label_seq in labels]

df = pd.DataFrame({"tokens": tokens, "labels": formatted_labels})


label_encoder = LabelEncoder()

df['labels'] = df['labels'].apply(lambda x: label_encoder.fit_transform(x))


dataset = Dataset.from_pandas(df)

print(df.head())



Number of token sequences: 53671
Number of label sequences: 53671


In [7]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

dataset = Dataset.from_pandas(df)


tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):

    tokenized_output = tokenizer(examples["tokens"], padding="max_length", truncation=True, is_split_into_words=True)

    tokenized_output["labels"] = examples["labels"]
    return tokenized_output


def split_dataset(dataset, test_size=0.2):
    train_indices, test_indices = train_test_split(range(len(dataset)), test_size=test_size, random_state=42)
    train_dataset = dataset.select(train_indices)
    test_dataset = dataset.select(test_indices)
    return train_dataset, test_dataset


train_dataset, test_dataset = split_dataset(dataset)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/42936 [00:00<?, ? examples/s]

Map:   0%|          | 0/10735 [00:00<?, ? examples/s]

In [8]:
small_train_dataset = tokenized_train_dataset.shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_test_dataset.shuffle(seed=42).select(range(1000))

In [9]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

In [11]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [12]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")



In [25]:
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

label_encoder = LabelEncoder()

df['labels'] = label_encoder.fit_transform(df['labels'])

dataset = Dataset.from_pandas(df)

print(dataset)


Dataset({
    features: ['tokens', 'labels'],
    num_rows: 53671
})


In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()




ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

In [None]:

trainer.save_model('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')