In [2]:
pip install torch transformers datasets peft pypdf accelerate sentencepiece

Collecting pypdf
  Downloading pypdf-6.7.1-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.7.1-py3-none-any.whl (331 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m331.0/331.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.7.1


In [3]:
from pypdf import PdfReader
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
import torch

# Load my custom data (from PDF)

In [5]:
# Read the source PDF and build one string from all of its pages, with a
# newline appended after each page's extracted text.
pdf_path = "/content/AI_and_Machine_Learning (1).pdf"
reader = PdfReader(pdf_path)

text = "".join([page.extract_text() + "\n" for page in reader.pages])

print("PDF loaded!")

PDF loaded!


# Split into chunks

In [6]:
# Split the extracted text on blank lines; each piece is a candidate example.
chunks = text.split("\n\n")

# Keep only pieces longer than 100 characters and wrap each one in the
# instruction-style prompt used as the training text.
data = [
    {"text": f"Explain this concept:\n{chunk}"}
    for chunk in chunks
    if len(chunk) > 100
]

dataset = Dataset.from_list(data)


# Load Model

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model

# Checkpoint to fine-tune; the model and its tokenizer are both loaded from
# this same identifier.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Switch off the model's cache flag before training.
model.config.use_cache = False

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

In [12]:
# Build the Dataset from the `data` list of {"text": ...} records.
# NOTE(review): this repeats the identical construction done right after
# chunking, so this cell looks redundant — confirm before removing.
dataset = Dataset.from_list(data)

In [13]:
def tokenize(example):
    """Encode the ``"text"`` field of one dataset record.

    Args:
        example: Mapping with a ``"text"`` entry holding the string to encode.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on that text
        with truncation enabled and max-length padding at 256.
    """
    # Fixed-length settings: every example is requested at max_length=256.
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    raw_text = example["text"]
    return tokenizer(raw_text, **encode_options)

# Run tokenize over every record and drop the raw "text" column, so that
# tokenized_dataset keeps only the fields returned by the tokenizer.
tokenized_dataset = dataset.map(tokenize, remove_columns=["text"])

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [14]:
# Batch collator for language modeling; mlm=False turns off the
# masked-language-modeling mode.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [15]:
# LoRA adapter settings: r=8 with lora_alpha=16 and 5% dropout, attached to the
# q_proj and v_proj modules, declared as a causal-LM task.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
)

# Wrap the loaded model with the adapter. NOTE(review): this rebinds `model`
# to the wrapped result, so re-running this cell would wrap it a second time.
model = get_peft_model(model, lora_config)

In [16]:
# Training hyperparameters: two epochs at per-device batch size 1, learning
# rate 2e-4, logging every 10 steps; outputs are written to ./fine_tuned_model.
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    learning_rate=2e-4,
    logging_steps=10,
)

# Tie together the LoRA-wrapped model, the tokenized examples, the collator
# and the training arguments.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

In [17]:
# Run the fine-tuning loop. As the cell's last expression, the returned
# TrainOutput (global step, training loss, runtime metrics) is displayed.
trainer.train()

Step,Training Loss
10,0.995013
20,0.754319


TrainOutput(global_step=20, training_loss=0.8746656894683837, metrics={'train_runtime': 14.0676, 'train_samples_per_second': 1.422, 'train_steps_per_second': 1.422, 'total_flos': 31814823444480.0, 'train_loss': 0.8746656894683837, 'epoch': 2.0})

In [26]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import os

base_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
adapter_path = "fine_tuned_model/checkpoint-20"   # LOCAL path

# Sanity check: list what the checkpoint folder contains before loading from it.
print("Files in adapter folder:", os.listdir(adapter_path))

# The tokenizer comes from the BASE model name, not from the checkpoint folder.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load the base model and attach the LoRA adapter stored in the LOCAL folder.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(base_model_name),
    adapter_path,
)

# Put the model in evaluation mode before generating.
model.eval()
print("Model loaded successfully!")

prompt = "Answer the following question clearly:\nWhat is Machine learning?"

# Generation below samples (do_sample=True); seed torch's RNG so the printed
# answer is reproducible from one run to the next.
torch.manual_seed(42)

# Tokenize the prompt and move the tensors to the device the model is on, so
# the cell does not fail with a device mismatch when the model sits on a GPU.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Inference only: no gradients are needed while generating.
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )

# The decoded sequence starts with the prompt itself, followed by the
# generated continuation (see the captured output of this cell).
print("\nModel Answer:")
print(tokenizer.decode(output[0], skip_special_tokens=True))

Files in adapter folder: ['training_args.bin', 'chat_template.jinja', 'trainer_state.json', 'README.md', 'rng_state.pth', 'adapter_model.safetensors', 'tokenizer_config.json', 'optimizer.pt', 'tokenizer.json', 'adapter_config.json', 'scheduler.pt']


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Model loaded successfully!

Model Answer:
Answer the following question clearly:
What is Machine learning? How does it work, and how does it improve our computer systems?

Answer: Machine learning is a type of AI that uses algorithms to learn from data without being explicitly programmed. It involves building models that can automatically improve over time based on new data. Machine learning algorithms work by identifying patterns in data and using them to make predictions or recommendations. For example, a machine learning model might be used to predict the likelihood of a user clicking on a specific ad based on their browsing history and other data. By learning from data, machine learning can improve the accuracy and efficiency of computer systems, making them more efficient and effective at performing tasks like image recognition, text analysis, and recommendation systems.


In [27]:
# Mount Google Drive at /content/drive (uses the google.colab package, so this
# cell is specific to Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
# Recursively copy the whole fine_tuned_model directory into the mounted
# Drive's MyDrive folder.
!cp -r fine_tuned_model /content/drive/MyDrive/