<a href="https://colab.research.google.com/github/FloraCompany/ML/blob/main/ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/FloraCompany/ML.git

Cloning into 'ML'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [2]:
%cd ML

/content/ML


In [3]:
!python

python3: can't open file '/content/ML/ML.py': [Errno 2] No such file or directory


In [None]:
# ============================================================
# 1. Install dependencies
# ============================================================
!pip install transformers datasets sentencepiece accelerate




In [None]:
# ============================================================
# 2. Upload your JSONL dataset
# ============================================================
from google.colab import files
uploaded = files.upload()   # upload minidora_dataset.jsonl (8k–10k exchanges)


Saving Final15k.jsonl to Final15k.jsonl


In [None]:
# ============================================================
# 3. Load dataset
# ============================================================
from datasets import load_dataset

dataset = load_dataset("json", data_files="Final15k.jsonl")
dataset = dataset["train"].train_test_split(test_size=0.10)  # 5% validation

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response'],
        num_rows: 13500
    })
    test: Dataset({
        features: ['prompt', 'response'],
        num_rows: 1500
    })
})


In [None]:


# ============================================================
# 4. Tokenizer (use GPT-2 tokenizer, add custom tokens)
# ============================================================
from transformers import GPT2Tokenizer

special_tokens = ["<IDENTITY>", "<MEMORY>", "<DIALOGUE>", "[USER]:", "[BOT]:", "<EOS>", "<username>:", "<father>:", "<mother>:", "<sister>:", "<birthday-Ishu>:", "<birthday-Charan>:", "<birthday-Dora>:", "<Dora_fullname>:", "<reason_for_creation>:"]

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
tokenizer.pad_token = tokenizer.eos_token

# Check vocab size
print("Vocab size:", len(tokenizer))


Vocab size: 50272


In [None]:

# ============================================================
# 5. Preprocess dataset
# ============================================================
def tokenize_function(example):
    input_text = example["prompt"]   # includes USER and BOT:
    target_text = example["response"]  # only BOT reply

    model_inputs = tokenizer(input_text, truncation=True, max_length=256)
    labels = tokenizer(target_text, truncation=True, max_length=256)["input_ids"]
    model_inputs["labels"] = labels
    return model_inputs


tokenized = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "response"])


Map:   0%|          | 0/13500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:

# ============================================================
# 6. Model setup
# ============================================================
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # expand embeddings for new tokens


Embedding(50272, 768)

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

# ============================================================
# 7. Training setup
# ============================================================
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/minidora-checkpoints",
    eval_strategy="epoch",       # run eval each epoch
    save_strategy="epoch",             # save checkpoint per epoch
    learning_rate=2e-5,                # slightly higher, helps small datasets
    per_device_train_batch_size=4,     # keep small for stability
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,     # effective batch size = 8
    num_train_epochs=10,                # more epochs since dataset is small
    weight_decay=0.01,
    warmup_ratio=0.1,                  # gradual warmup
    lr_scheduler_type="cosine",        # smoother decay
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,                # keep only best + last
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",                  # disable wandb popups
    push_to_hub=False
)


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Trainer(


In [None]:
# ============================================================
# 8. Train
# ============================================================
trainer.train()


In [None]:
trainer.train(resume_from_checkpoint="/content/drive/MyDrive/minidora-checkpoints/checkpoint-13504")


In [None]:
import math
# After training: best checkpoint info
best = trainer.state.best_model_checkpoint
print("Best checkpoint:", best)
print("Best metric:", trainer.state.best_metric)

# compute eval loss & perplexity on validation set
metrics = trainer.evaluate()
eval_loss = metrics["eval_loss"]
print("Eval loss:", eval_loss, " Perplexity:", math.exp(eval_loss))

In [None]:

# ============================================================
# 9. Save final model
# ============================================================
trainer.save_model("./minidora-gpt2-final")
tokenizer.save_pretrained("./minidora-gpt2-final")


('./minidora-gpt2-final/tokenizer_config.json',
 './minidora-gpt2-final/special_tokens_map.json',
 './minidora-gpt2-final/vocab.json',
 './minidora-gpt2-final/merges.txt',
 './minidora-gpt2-final/added_tokens.json')

In [None]:
# Zip and download
!zip -r minidora-gpt2-final.zip ./minidora-gpt2-final
from google.colab import files
files.download("minidora-gpt2-final.zip")

In [None]:
#evaluation

from transformers import pipeline

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "<IDENTITY> Dora, A personal healthcare companion specially built to care Ishu \n<MEMORY> \n<username>: Ishu \n<father>: Kannayya \n<mother>: Jyothi \n<sister>: Bindu \n<birthday-Ishu>: 14 March 2025 \n<birthday-Charan>: 1 May 2005 \n<birthday-Dora>: 14 March 2025 \n<Dora_fullname>: Mini version of Doraemon \n<reason_for_creation>: Dora was created by Charan to care for Ishu \n<DIALOGUE> \n[USER]: Good morning \n[BOT]:"
gold = "Ishu.<EOS>"
output = generator(
      prompt,
      max_length=256,
      pad_token_id=tokenizer.eos_token_id,
      eos_token_id=tokenizer.convert_tokens_to_ids("<EOS>")
    )
print("\nPrompt:", prompt)
print("Expected:", gold)
print("Got:", output[0]['generated_text'][len(prompt):])


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Prompt: <IDENTITY> Dora, A personal healthcare companion specially built to care Ishu 
<MEMORY> 
<username>: Ishu 
<father>: Kannayya 
<mother>: Jyothi 
<sister>: Bindu 
<birthday-Ishu>: 14 March 2025 
<birthday-Charan>: 1 May 2005 
<birthday-Dora>: 14 March 2025 
<Dora_fullname>: Mini version of Doraemon 
<reason_for_creation>: Dora was created by Charan to care for Ishu 
<DIALOGUE> 
[USER]: Good morning 
[BOT]:
Expected: Ishu.<EOS>
Got:  Dora! I hope you have a lovely day. 
 I love you, Dora. 
 I love spending time with my friends. 
 I love spending time with my friends, Dora. 
 Dora, I love drawing and colors. 
 I love spending time with my friends, Dora. 
 I want to know more about animals, Dora. 
 I love spending time with my friends, Dora. 
 Dora, I love playing outdoors. 
 I love spending time with my friends, Dora. 
 I love spending time with my friends, Dora. 
 There was something bad about my day. 
 I’m sorry, I made a mistake. 
 I’m sorry, I made a mistake. 
 I’m sorry, I m

In [None]:
checkpoint_path = "/content/drive/MyDrive/minidora-checkpoints/checkpoint-6352"

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(checkpoint_path)
model.eval()
prompt = "<IDENTITY> Dora, A personal healthcare companion specially built to care\n <DIALOGUE> \n[USER]: Who is kannayya \n [BOT]:"

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,  # slightly more creative
    top_p=0.9,        # nucleus sampling to reduce unlikely repeats
    repetition_penalty=1.2,  # avoid repeating same phrases
    pad_token_id=tokenizer.eos_token_id
)

generated_text = tokenizer.decode(outputs[0])[len(prompt):]
if "<EOS>" in generated_text:
    generated_text = generated_text.split("<EOS>")[0].strip()

# Remove empty or repeated lines
lines = [line.strip() for line in generated_text.split("\n") if line.strip()]
cleaned_output = "\n".join(lines)
print(generated_text)



ImportError: cannot import name 'GenerationMixin' from 'transformers.generation' (/usr/local/lib/python3.12/dist-packages/transformers/generation/__init__.py)