<a href="https://colab.research.google.com/github/BF667/ipynb/blob/main/LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!uv pip install torch transformers datasets tokenizers

In [None]:
import os

# Path to the plain-text training corpus; the file must exist before this cell runs.
# In a real scenario, point this at your actual training data.
text_file_path = "/content/data.txt"

from datasets import load_dataset
from transformers import AutoTokenizer, BertTokenizerFast

# Load the corpus only when no `dataset` object is left over from an earlier
# cell, so this cell can also run on its own.
# A membership test is used instead of a throwaway assignment such as
# `_ = dataset`, because assigning to `_` in a notebook shadows IPython's
# last-result variable.
if "dataset" not in globals():
    print(f"Loading dataset from {text_file_path}")
    dataset = load_dataset("text", data_files={"train": text_file_path})

# Create an iterator over the text data for tokenizer training.
def get_training_corpus(split=None, batch_size=1000):
    """Yield the corpus as successive batches of text.

    Args:
        split: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping with a ``"text"`` entry. When omitted, the
            module-level ``dataset["train"]`` is used.
        batch_size: Maximum number of rows per yielded batch.

    Yields:
        The ``"text"`` entry of each consecutive ``batch_size``-row slice.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if split is None:
        # Resolved lazily so the default follows whatever `dataset` is bound
        # to at iteration time.
        split = dataset["train"]
    for start in range(0, len(split), batch_size):
        yield split[start : start + batch_size]["text"]

# Build a NEW tokenizer from the corpus. `train_new_from_iterator` is called on
# an existing fast tokenizer, so a pretrained `BertTokenizerFast` is loaded
# first to act as the template for the retrained one.

print("Initializing new tokenizer for training...")
# The special tokens are spelled out explicitly on the template tokenizer.
new_tokenizer = BertTokenizerFast.from_pretrained(
    "bert-base-uncased",
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
)

# Learn a fresh vocabulary from the batches produced by `get_training_corpus`.
print("Training new tokenizer from corpus...")
new_tokenizer = new_tokenizer.train_new_from_iterator(
    get_training_corpus(),
    vocab_size=30522,  # target vocabulary size; adjust as needed
    min_frequency=2,  # minimum-frequency threshold forwarded to the trainer
)
print("Tokenizer training complete.")

# Persist the trained tokenizer in a directory next to the notebook.
tokenizer_name = "my_custom_bert_tokenizer"
save_path = f"./{tokenizer_name}"
new_tokenizer.save_pretrained(save_path)
print(f"New tokenizer saved to: {save_path}")

# Example of loading the saved tokenizer back and inspecting it:
# loaded_tokenizer = AutoTokenizer.from_pretrained(save_path)
# print(f"\nTesting new tokenizer: {loaded_tokenizer('Hello world, this is my new custom tokenizer!')}")
# print(f"New tokenizer vocabulary size: {len(loaded_tokenizer)}")

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer


# Load the custom tokenizer written by the tokenizer-training cell. The path is
# relative, matching the "./my_custom_bert_tokenizer" directory that cell saves
# to, so loading also works when the working directory is not /content.
tokenizer = AutoTokenizer.from_pretrained("./my_custom_bert_tokenizer")

def tokenize_function(examples):
    """Tokenize the "text" entries of a batch to a fixed length of 128."""
    texts = examples["text"]
    return tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

# batched=True passes several rows to `tokenize_function` per call.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
import torch.nn as nn
from transformers import AutoModelForSequenceClassification

# Example: Define a simple model (or load a pre-trained base for fine-tuning)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

trainer.train()