In [1]:
!pip install transformers[torch]



In [2]:
!pip install transformers



In [3]:
!pip install -U PyPDF2
!pip install python-docx



In [4]:
import pandas as pd
import numpy as np
import re
from PyPDF2 import PdfReader
import os
import docx

from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [7]:
def read_pdf(file_path):
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page_num].extract_text()
    return text

def read_word(file_path):
    doc = docx.Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

def read_txt(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    return text

def read_documents_from_directory(directory):
    combined_text = ""
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if filename.endswith(".pdf"):
            combined_text += read_pdf(file_path)
        elif filename.endswith(".docx"):
            combined_text += read_word(file_path)
        elif filename.endswith(".txt"):
            combined_text += read_txt(file_path)
    return combined_text

In [8]:
train_directory = "/content"
text_data = read_documents_from_directory(train_directory)
text_data = re.sub(r'\n+', '\n', text_data).strip()

In [9]:
text_data

"Darshit Prajapati \nJunior Data Scientist \nI'm a motivated data science professional proﬁcient in\nPython and experienced in machine learning and Artiﬁcial\nIntelligence. With a strong foundation in problem-solving,\nI've successfully applied it in various projects.\nUnderstanding the business impact of data science, I excel\nat translating data insights into actionable strategies.\nCommitted to continuous learning, I stay updated on the\nlatest trends, ensuring I remain at the forefront of the ﬁeld. \ndarshitprajapati139@gmail.com \n9054602539 \nAhmedabad, India \nlinkedin.com/in/darshit-prajapati-3b1156247 \ngithub.com/Darshit139/Projects \nEDUCATION \nAdvanced Certiﬁcation Course in Data\nScience and AI \nIntelliPaat \n05/2023 - Present\n, \n \nBsc(IT) \nGLS University \n07/2020 - 05/2023\n, \n \nAhmedabad, Gujrat \nOver all CGPA = 8.8 \nH.S.C \nAnjali Vidhyalaya \n06/2018 - 04/2020\n, \n \nAhmedabad, Gujrat \nH.S.C = 77% \nPROJECTS \nMy Apartment App \nThe main aim of the project

In [10]:
with open("/content/train.txt", "w") as f:
    f.write(text_data)

In [11]:
def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset

In [12]:
def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator

In [13]:
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_dir)

  model = GPT2LMHeadModel.from_pretrained(model_name)

  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )

  trainer.train()
  trainer.save_model()

In [14]:
train_file_path = "/content/train.txt"
model_name = 'gpt2'
#output_dir = '/content/drive/MyDrive/ColabNotebooks/models/chat_models/custom_full_text'
output_dir = '/content/output'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 50000

In [15]:
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Step,Training Loss


In [18]:
example_text = "The quick brown fox jumps over the lazy dog."

# Save example text to a file
with open("example.txt", "w") as file:
    file.write(example_text)
# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [30]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [31]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer

def generate_text(model_path, sequence, max_length):

    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [36]:
model1_path = "/content/output"
sequence1 = "[Q] what is Cricket Match Score Predicator"
max_len = 50
generate_text(model1_path, sequence1, max_len)

[Q] what is Cricket Match Score Predicator (Cricket Match Score Predicator) and why can it be useful for your business?

A: Cricket Match Score Predicator is a project in collaboration with Google that aims at providing an


In [40]:
model1_path = "/content/output"
sequence1 = "[Q] write abcd"
max_len = 50
generate_text(model1_path, sequence1, max_len)

[Q] write abcd to cv-vars
src/test.rs
import os.path.join(/data/net/vars/, ['data', '', '', '', '', '', '']),
