In [1]:
# Mount Google Drive at /content/drive so that paths under it (such as the
# model output directory used later in this notebook) are reachable from the
# Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install "transformers==4.35" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.42.0" "trl==0.4.7" "safetensors>=0.3.1" "tiktoken"



# Zero-Shot Text Generation (Pre-trained GPT-2, No Fine-Tuning)

In [3]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Load the pre-trained GPT-2 tokenizer and language-model head.
# `tokenizer` and `model` are notebook-level globals that the generate_text
# helper defined in the next cell reads directly.
model_name = "gpt2" # Model size can be switched accordingly (e.g., "gpt2-medium")
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)



In [4]:
def generate_text(prompt, max_length=40, temperature=0.8, top_k=50):
    """Sample a continuation of ``prompt`` from the notebook-level GPT-2 model.

    Relies on the global ``tokenizer`` and ``model`` objects rather than
    loading its own.

    Args:
        prompt: Text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        temperature: Forwarded to ``model.generate`` as ``temperature``.
        top_k: Forwarded to ``model.generate`` as ``top_k``.

    Returns:
        The first generated sequence decoded to text, with special tokens
        skipped.
    """
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt")
    sampling_options = dict(
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        # The tokenizer's EOS id is reused as the padding id.
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )
    sequences = model.generate(prompt_ids, **sampling_options)
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

In [5]:
# generate_text samples (do_sample=True) and no seed is set in the cells above,
# so the text shown below can differ from run to run.
generate_text('Hi my name is mahmoud')

'Hi my name is mahmoud, and I am the best in the world."\n\nFor more than 30 years, Maud\'s life has been a roller coaster.\n\n"Every day'

# Few-Shot Fine-Tuning (GPT-2 on Text Extracted from a Single PDF)

In [6]:
!pip install PyPDF2



In [7]:
import pandas as pd
import numpy as np
import re
from PyPDF2 import PdfReader
import os


In [8]:
def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Pages are joined with a newline so that the last word of one page is not
    fused with the first word of the next page.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        str: The text of all pages in page order, separated by ``"\\n"``.
        A page for which ``extract_text()`` yields ``None`` contributes an
        empty string.
    """
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        # Extraction happens while the file is still open; `or ""` guards
        # against a page whose extract_text() returns None.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "\n".join(page_texts)

In [9]:
# Extract the raw text of the source PDF (hard-coded Colab path), then normalise it.
text_data=read_pdf(file_path='/content/Egyptian_Museum_Collection.pdf')
text_data = re.sub(r'\n+', '\n', text_data).strip()  # Collapse each run of newlines into one and trim leading/trailing whitespace

In [10]:
# Save the extracted text as the training corpus (no validation file is written).
# The encoding is set explicitly so the file's bytes do not depend on the
# platform's default text encoding.
with open("/content/train.txt", "w", encoding="utf-8") as f:
    f.write(text_data)


In [11]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [12]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over the text file at ``file_path``.

    Args:
        file_path: Path of the text file to wrap.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: ``block_size`` handed to ``TextDataset`` (default 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    dataset_options = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_options)

In [13]:
def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded as the collator's ``mlm`` flag (default ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [14]:
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a GPT-2 checkpoint on a plain-text file and save the result.

    Args:
        train_file_path: Path of the text file used as training data.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer files and the final
            model; also passed to ``TrainingArguments`` as ``output_dir``.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as ``save_steps``.

    Returns:
        None. The tokenizer and the trained model are written to
        ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Only the tokenizer is written up front. Model weights are written once,
    # after training, so output_dir never holds untrained base weights that
    # could be mistaken for a finished fine-tune if the run is interrupted.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Must be forwarded explicitly; otherwise the caller's value is
        # ignored and the library default is used.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()


In [15]:
# Fine-tuning configuration; these values are passed to train() in the next cell.
train_file_path = "/content/train.txt"  # text file written earlier in this notebook
model_name = 'gpt2'  # base checkpoint to fine-tune (rebinds the earlier model_name)
output_dir = '/content/drive/MyDrive/GPT2'  # destination under the mounted Drive
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs =50
save_steps = 50000

In [16]:
# Run fine-tuning with the configuration defined in the previous cell.
# The training-loss table below is the output of this call.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



Step,Training Loss
500,2.9793
1000,2.0366
1500,1.5248
2000,1.1789
2500,0.9509
3000,0.778
3500,0.6487
4000,0.5447
4500,0.4608
5000,0.3997


# Inference

In [17]:
def load_model(model_path):
    """Load and return a ``GPT2LMHeadModel`` from ``model_path``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load and return a ``GPT2Tokenizer`` from ``tokenizer_path``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(model_path, sequence, max_length):
    """Print a sampled continuation of ``sequence`` from a saved model.

    The model and tokenizer are both loaded from ``model_path`` on every call.

    Note: this definition rebinds the name ``generate_text``, which an earlier
    cell of this notebook defines with a different signature
    (``prompt, max_length, temperature, top_k``).

    Args:
        model_path: Directory passed to ``load_model`` and ``load_tokenizer``.
        sequence: Prompt to continue; formatted into a string before encoding.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        None. The decoded first sequence is printed, with special tokens
        skipped.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    prompt_ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    sampling_options = {
        "do_sample": True,
        "max_length": max_length,
        # The model config's EOS id is reused as the padding id.
        "pad_token_id": model.config.eos_token_id,
        "top_k": 50,
        "top_p": 0.95,
    }
    sampled = model.generate(prompt_ids, **sampling_options)
    decoded = tokenizer.decode(sampled[0], skip_special_tokens=True)
    print(decoded)

In [19]:
# Generate from the model directory on Drive (same path as the training
# output_dir). generate_text samples, so the printed text can differ between runs.
model1_path = "/content/drive/MyDrive/GPT2"
prompot = "introduce Amenirdis I ?"  # NOTE(review): identifier looks like a typo for "prompt"; left unchanged here
max_len =200
generate_text(model1_path,prompot, max_len)

introduce Amenirdis I? which is now in the Boston Museum of FineEgyptian Museum Collection
Arts. Khamerernebti is given the title King's Mother on the fragment.
=== Sarcophagus ===
In 1837, English army officer Richard William Howard Vyse, and engineer John Shae Perring began
excavations within the pyramid of Menkaure. In the main burial chamber of the pyramid they found a
large stone sarcophagus 8 feet 0 inches (244 cm) long, 3 feet 0 inches (91 cm) in width, and 2 feet
11 inches (89 cm) in height, made of basalt. The sarcophagus was removed from the pyramid
and was sent by ship to the British Museum in London, but the merchant ship Beatrice carrying it was lost after leaving port at Malta on
October 13, 1838. The other materials were sent by a separate ship, and those materials now reside

