In [None]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.31.0-py3-none-any.whl (7.4 MB)
Installing collected packages: transformers
Successfully installed transformers-4.31.0


In [None]:
!pip uninstall datasets transformers==4.28.0


Found existing installation: datasets 2.14.4
Uninstalling datasets-2.14.4:
  Would remove:
    /usr/local/bin/datasets-cli
    /usr/local/lib/python3.10/dist-packages/datasets-2.14.4.dist-info/*
    /usr/local/lib/python3.10/dist-packages/datasets/*
Proceed (Y/n)? y
  Successfully uninstalled datasets-2.14.4
Found existing installation: transformers 4.28.0
Uninstalling transformers-4.28.0:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.10/dist-packages/transformers-4.28.0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/transformers/*
Proceed (Y/n)? y
  Successfully uninstalled transformers-4.28.0


In [None]:
yy!pip install -U PyPDF2
!pip install python-docx

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/232.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25l[?25hdone
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184487 sha256=bd60a65023

In [None]:
import pandas as pd
import os
import numpy as np
import re
from PyPDF2 import PdfReader
import os
import docx

In [None]:
import transformers
import torch
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer

In [None]:
# Functions to read different file types
def read_pdf(file_path):
    """Return the text extracted from every page of a PDF file.

    The file is opened in binary mode and handed to ``PdfReader``. Page texts
    are concatenated in page order with no separator between pages.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A single string holding the extracted text of all pages.
    """
    with open(file_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # Extract while the file handle is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)

def read_word(file_path):
    """Return the paragraph text of a Word document, one paragraph per line.

    Args:
        file_path: Path handed to ``docx.Document``.

    Returns:
        The text of each entry in ``doc.paragraphs``, each followed by a
        newline, concatenated in document order.
    """
    document = docx.Document(file_path)
    lines = [paragraph.text + "\n" for paragraph in document.paragraphs]
    return "".join(lines)

def read_txt(file_path, encoding="utf-8"):
    """Return the full contents of a text file as a string.

    The encoding is passed explicitly so the result does not depend on the
    platform's default locale encoding.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8.

    Returns:
        The decoded contents of the file.

    Raises:
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file_path, "r", encoding=encoding) as text_file:
        return text_file.read()

def read_documents_from_directory(directory):
    """Concatenate the text of every supported document in a directory.

    Supported files are those ending in ``.pdf``, ``.docx`` or ``.txt``; the
    extension match is case-insensitive. Entries are visited in sorted name
    order so the combined text is the same on every run (``os.listdir`` gives
    no ordering guarantee). Entries that are not regular files, and files with
    any other extension, are skipped. No separator is inserted between
    documents.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        The texts of all supported documents joined into one string; an empty
        string when the directory holds none.
    """
    readers = (
        (".pdf", read_pdf),
        (".docx", read_word),
        (".txt", read_txt),
    )
    parts = []
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # A sub-directory named e.g. "notes.txt" must not reach a file reader.
        if not os.path.isfile(file_path):
            continue
        lowered = filename.lower()
        for extension, reader in readers:
            if lowered.endswith(extension):
                parts.append(reader(file_path))
                break
    return "".join(parts)


In [None]:
# Read the training text from a single text file.
# NOTE: despite its name, train_directory holds a file path here (it is passed
# straight to read_txt), not a directory.
#train_directory = '/content/drive/MyDrive/ColabNotebooks/data/chatbot_docs/training_data/full_text'
train_directory = '/content/filtered_data.txt'
text_data = read_txt(train_directory)


In [None]:
#text_data = read_pdf('/content/drive/MyDrive/ColabNotebooks/data/chatbot_docs/Cell_Biology.pdf')
#text_data = re.sub(r'\n+', '\n', text_data).strip()  # Remove excess newline characters

In [None]:
# Save the training and validation data as text files
#with open("/content/drive/MyDrive/ColabNotebooks/data/chatbot_docs/combined_text/full_text/train.txt", "w") as f:
 #   f.write(text_data)

In [None]:
def load_dataset(file_path, tokenizer, block_size = 1024):
    """Construct a ``TextDataset`` for a text file.

    Args:
        file_path: Path of the text file forwarded to ``TextDataset``.
        tokenizer: Tokenizer forwarded to ``TextDataset``.
        block_size: Block size forwarded to ``TextDataset`` (default 1024).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

In [None]:
def load_data_collator(tokenizer, mlm = False):
    """Construct a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: Tokenizer forwarded to the collator.
        mlm: Forwarded as the collator's ``mlm`` flag (default ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [None]:
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps,
          tokenizer_name="EleutherAI/gpt-neox-20b"):
    """Fine-tune a causal language model on a text file and save the result.

    Steps, in order: load the tokenizer, build the dataset and data collator,
    save the tokenizer to ``output_dir``, load the model config and weights,
    save the freshly loaded model to ``output_dir``, then run ``Trainer.train()``
    followed by ``Trainer.save_model()``.

    Args:
        train_file_path: Text file passed to ``load_dataset``.
        model_name: Checkpoint name/path for ``AutoConfig`` and
            ``AutoModelForCausalLM`` (loaded with ``trust_remote_code=True``).
        output_dir: Directory receiving the tokenizer, the model and the
            Trainer output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.
        tokenizer_name: Checkpoint the tokenizer is loaded from. Defaults to
            the checkpoint this function previously hard-coded.

    Returns:
        None.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    tokenizer.save_pretrained(output_dir)

    config = transformers.AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    # NOTE(review): both settings below presumably need a CUDA device (and the
    # triton attention dependency) to be present; confirm on the target runtime.
    config.attn_config['attn_impl'] = 'triton'
    config.init_device = 'cuda:0' # For fast initialization directly on GPU!

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        torch_dtype=torch.bfloat16, # Load model weights in bfloat16
        trust_remote_code=True
    )

    # Saves the not-yet-trained weights; trainer.save_model() below writes to
    # the same output_dir after training.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by train() but never forwarded.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [None]:

# Inputs for the train() call in the next cell.
# Text file the training dataset is built from.
#train_file_path = "/content/drive/MyDrive/ColabNotebooks/data/chatbot_docs/combined_text/full_text/train.txt"
train_file_path = "/content/filtered_data.txt"
# Checkpoint that train() loads the config and model weights from.
model_name = 'mosaicml/mpt-7b-instruct'
# Directory where train() saves the tokenizer, the model and the Trainer output.
#output_dir = '/content/drive/MyDrive/ColabNotebooks/models/chat_models/custom_full_text'
output_dir = '/content/custom_outline'
# The remaining values are passed to train() under the same names.
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 5
save_steps = 50000

In [None]:
# Fine-tune `model_name` on `train_file_path` using the settings defined in the
# previous cell.
# NOTE(review): the output recorded for this cell is a RuntimeError, so the
# saved run did not finish training — confirm the runtime before re-running.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



RuntimeError: ignored

In [None]:
# NOTE(review): this pins transformers to 4.4.2, whereas the first cell's output
# shows 4.31.0 being installed. The `huggingface` distribution resolved in the
# recorded output is a 2.5 kB 0.0.1 wheel — presumably not the intended
# library; confirm these pins are still wanted.
!pip install huggingface tokenizers==0.10.1 transformers==4.4.2

Collecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl (2.5 kB)
Collecting tokenizers==0.10.1
  Downloading tokenizers-0.10.1.tar.gz (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.7/210.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting transformers==4.4.2
  Downloading transformers-4.4.2-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses (from transformers==4.4.2)
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tokenizers, sacre

Inference

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
def load_model(model_path):
    """Load and return a ``GPT2LMHeadModel`` from ``model_path``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load and return a ``GPT2Tokenizer`` from ``tokenizer_path``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(model_path, sequence, max_length):
    """Sample a continuation of ``sequence`` and print the decoded text.

    The model and the tokenizer are both loaded from ``model_path`` on every
    call. Generation uses sampling with ``top_k=50`` and ``top_p=0.95``, and
    the model's EOS token id is used as the pad token id.

    Args:
        model_path: Path passed to ``load_model`` and ``load_tokenizer``.
        sequence: Prompt to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        None. The first generated sequence is decoded (special tokens
        skipped) and printed.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    prompt_ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    sampling_options = dict(
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    generated = model.generate(prompt_ids, **sampling_options)

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(decoded)

This model was trained on the entire text and took much longer to train, yet it fails to give meaningful results.

In [None]:
# Generate from the model saved under the Drive "custom_full_text" directory,
# using a "[Q] ..." style prompt and max_length=50.
model1_path = "/content/drive/MyDrive/ColabNotebooks/models/chat_models/custom_full_text"
sequence1 = "[Q] What is the Babel fish?"
max_len = 50
generate_text(model1_path, sequence1, max_len)

[Q] What is the Babel fish?  Theyve  never  heard  of  it  They  
make  fish  Theyve  gone  back  into  space


The following model was trained on 100 questions and answers based on the original text, and training took only a few seconds (50 epochs). It gives very meaningful results.

In [None]:
# Generate from the model saved in /content/custom_outline.
# Note: max_len is reassigned here (it was 50 in the earlier generation cell).
model2_path = "/content/custom_outline"
sequence2 = "generate a course outline for a deep learning course and mention the book title"
max_len = 300
generate_text(model2_path, sequence2, max_len)

generate a course outline for a deep learning course and mention the book title in the footnotes.
    Information Theory
    Why Not Use One-hot Learning?
    The Hot Dog Principle
    The Statistical Tools for Deep Learning
    Building the Model
    Defining the Loss Function
    Training and Predicting
  AutoRec: Rating Prediction with Autoencoders
    Model
    Predicting
  Semi-Autoencoders
    Model Architectures
    Implemenation of Autoencoders
    Training and Predicting
  Deep Recurrent Neural Networks
  Gated Recurrent Neural Networks
    Generating the Dataset
    Reading the Dataset
    Defining the Model
    Training and Predicting
  Batch Recurrent Neural Networks
    Model Implementation
    Predicting
  Deep Convolutional Neural Networks
    The Computational Graph
    BPT Graph
    Statistics
    Predicting
  Deep Convolutional Neural Networks: Applications
  Sentiment Analysis and the Dataset
    The Sentiment Analysis Dataset
    Putting All Things Together
  Sentim

In [None]:
# Baseline for comparison: the stock 'gpt2' checkpoint through the
# text-generation pipeline, seeded with 42, returning five samples of
# max_length=300 for a similar course-outline prompt.
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
generator("generate a course outline for a deep learning course", max_length=300, num_return_sequences=5)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "generate a course outline for a deep learning course. Then you get the opportunity to gain information about deep learning principles and design with our experienced, committed instructors. Once I created the course, I was able to get the whole concept out and start working on your course. The first thing to know is that you don't need to be perfect, only able to apply it. This is important in order to gain enough experience with both learning and training.\n\nAs a student, we usually take our curriculum as an opportunity to practice. We teach a course on what we've come to call computational systems, and we teach our students how to do many things. It's important for us to spend time studying the underlying concepts. You should really not have to learn everything by yourself with a course plan. One way of doing this is to take some time and learn from the best as well as our own instructors as they guide you, making sure you're doing everything possible in your sc

In [None]:
# NOTE(review): `output` is not assigned in any cell visible in this notebook —
# presumably left over from a deleted cell; confirm, since this would fail on a
# fresh kernel.
output

BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[-0.1639, -0.2081, -0.3631,  ..., -0.1327, -0.0659, -0.1325],
         [-0.1532,  0.0364, -0.7239,  ..., -0.1237,  0.4602, -0.1088],
         [-0.3179,  0.2230,  0.1059,  ..., -0.2953,  0.0253, -0.1307],
         ...,
         [-0.2223, -0.3087, -0.6501,  ...,  0.1771,  0.5849, -0.5163],
         [-0.2173, -0.4038, -2.9980,  ...,  0.2316, -0.0744,  0.2539],
         [ 0.0853, -0.4034, -0.3844,  ...,  0.1481,  0.0769, -0.0221]]],
       grad_fn=<ViewBackward0>), past_key_values=((tensor([[[[-1.4444,  1.9113,  0.6049,  ..., -1.0575, -0.2438,  1.2718],
          [-2.2031,  2.7234,  1.6833,  ..., -0.9224, -1.2803,  1.6551],
          [-2.3145,  2.7101,  1.5073,  ..., -0.5781, -1.9292,  2.2634],
          ...,
          [-3.0834,  2.5788,  2.3934,  ..., -0.3232, -1.2032,  1.0067],
          [-2.8576,  1.7590,  1.7940,  ...,  0.4602, -2.5942,  1.9817],
          [-2.5236,  2.5848,  2.1570,  ..., -0.1014, -2.0656,  1.6698]],