# This is the beginning of Cloaky-LM

## Setting up the libraries

1. torch: This is PyTorch, the fundamental deep-learning framework we will use. Think of it as the engine and raw materials (like steel and circuits) for our model.

2. transformers: From Hugging Face, this is the most important library for our project. It provides pre-built architectures (like the Transformer) and high-level tools, including a Trainer class that will manage our training loop for us. It's our master blueprint and toolbox.

3. datasets: Also from Hugging Face, this library makes it incredibly simple to download, load, and process the vast amounts of text data our model needs to learn from.

4. tokenizers: An efficient library for the crucial step of converting our text into numbers that the model can understand.

5. accelerate: A helper library that works with transformers to automatically optimize our training code to run efficiently on whatever hardware we have (like the T4 GPU in Colab).


In [None]:
# Install the libraries described above, then upgrade `datasets` to its latest release.
!pip install transformers datasets tokenizers torch accelerate
!pip install --upgrade datasets

In [None]:


from datasets import load_dataset

# Fetch the WikiText dataset using its "wikitext-2-raw-v1" configuration,
# then print the returned object to inspect what was loaded.
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")
print(dataset)


In [None]:
# Peek at one raw training record (index 9) to see what the text looks like.
sample_row = dataset["train"][9]
print(sample_row["text"])

## Tokenising the dataset (using EOS as the padding token)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the 'gpt2' model.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# The GPT-2 model was trained without a padding token. An end-of-sequence
# token is commonly reused for padding, so the EOS token doubles as the
# padding token here.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize the "text" entries of a batch, truncating each to 128 tokens."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=128)


# Apply the tokenizer to the dataset in batches and drop the raw "text"
# column from the result.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)





*   We loaded a tokenizer that already knows a vocabulary of about 50,000 English tokens.
*   We used `.map()` to apply the tokenizing function across all splits (train, validation, test) of our dataset.

*   We passed `remove_columns=["text"]` because, once we have the `input_ids`, we no longer need the original raw text.


In [None]:
# Show the first 20 token IDs of the training example at index 9 (the 10th).
sample_ids = tokenized_datasets["train"][9]["input_ids"]
print(sample_ids[:20])


# Blueprint

In [None]:
from transformers import AutoConfig, AutoModelForCausalLM

vocab_size = tokenizer.vocab_size

# Settings that replace the corresponding values of the stock GPT-2 config.
architecture_overrides = {
    "vocab_size": vocab_size,
    "n_positions": 128,  # Maximum sequence length (same 128 used when tokenizing).
    "n_embd": 256,       # Dimensionality of the token embeddings.
    "n_layer": 4,        # Number of Transformer layers (model depth).
    "n_head": 4,         # Number of attention heads in each layer.
}

# Use the GPT-2 architecture as a template and apply the overrides to it.
config = AutoConfig.from_pretrained("gpt2", **architecture_overrides)

# Build the model from the config alone (from_config, not from_pretrained).
model = AutoModelForCausalLM.from_config(config)

print(model)

In [None]:
# Count the model's parameters and report the total with thousands separators.
num_params = model.num_parameters()
print(f"model has {num_params:,} parameters.")

# Training

In [None]:
import torch

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Only the GPU case announces itself, naming the first CUDA device.
if device.type == "cuda":
    print(f"Training on {torch.cuda.get_device_name(0)}")

In [None]:
from transformers import DataCollatorForLanguageModeling

# The collator creates the labels for the language-modeling task automatically.
# mlm=False selects causal (next-token) prediction rather than masked
# language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
import torch
from transformers import TrainingArguments

# fp16 mixed precision needs a CUDA GPU. The device check earlier in this
# notebook allows a CPU fallback, so only enable fp16 when CUDA is actually
# available instead of hard-coding it to True.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./cloakylm-1",         # Where checkpoints are written
    overwrite_output_dir=True,
    num_train_epochs=3,                # You can increase for better results
    per_device_train_batch_size=32,    # Adjust if you get out-of-memory errors
    save_steps=500,                    # Save a checkpoint every 500 steps...
    save_total_limit=2,                # ...keeping at most 2 on disk
    logging_steps=100,
    eval_strategy="steps",             # Evaluate on a step schedule...
    eval_steps=500,                    # ...every 500 steps
    report_to="none",                  # Disable wandb/tensorboard for simplicity
    fp16=use_fp16,                     # Mixed precision for faster training, GPU only
)

# Initialize the trainer

In [None]:
from transformers import Trainer

# Bundle the model, the training arguments, the collator and the tokenized
# train/validation splits into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Run training with the Trainer configured above.
trainer.train()

In [None]:
import os

# Google Drive folder that the export below lives under.
drive_root = "/content/drive/MyDrive"

# Fail fast when Drive is not mounted. The save calls are expected to create
# any missing directories, so without this check the files would land on the
# local (ephemeral) Colab disk under the same path with no error.
# NOTE(review): this notebook has no visible cell that mounts Drive - confirm
# it is mounted (for example through the Colab sidebar) before running this.
if not os.path.isdir(drive_root):
    raise FileNotFoundError(
        f"{drive_root} does not exist - mount Google Drive before saving."
    )

# Save the trained model and its tokenizer side by side so that both can be
# reloaded from the same directory.
trainer.save_model("/content/drive/MyDrive/cloakylm-1")
tokenizer.save_pretrained("/content/drive/MyDrive/cloakylm-1")


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Inference in this cell is pinned to the CPU.
device = torch.device("cpu")
print(f"Using device: {device}")

model_path = "/content/drive/MyDrive/cloakylm-1"

# Reload the tokenizer; if it has no padding token, reuse EOS for padding.
tokenizer = AutoTokenizer.from_pretrained(model_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Reload the saved model, move it to the chosen device and switch it to
# evaluation mode.
model = AutoModelForCausalLM.from_pretrained(model_path)
model.to(device)
model.eval()

# Encode the prompt as PyTorch tensors and move every tensor to the device.
prompt = "who are you"
inputs = {
    name: tensor.to(device)
    for name, tensor in tokenizer(prompt, return_tensors="pt").items()
}

# Sampling settings for a single generated sequence.
sampling_options = {
    "max_length": 80,
    "num_return_sequences": 1,
    "pad_token_id": tokenizer.pad_token_id,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "temperature": 0.7,
}

# Try generating text on CPU
output = model.generate(**inputs, **sampling_options)

# Decode the first (only) returned sequence, dropping special tokens.
print(tokenizer.decode(output[0], skip_special_tokens=True))
