#Mounting drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Libraries

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

from torch.utils.data import TensorDataset, DataLoader

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer  #Uses subword tokinization, Byte Pair Encoding
from torch.utils.data import Dataset, DataLoader
import torch

#Device checking

In [None]:
import torch

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cuda


#Defining class for custom class, loading pre-trained-GPT2 model, freezing some parameters and fine-tuning process.

In [None]:

# Define Dataset class for loading data from 'data.txt'
class MyDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []

        with open(file_path, 'r', encoding='utf-8') as file:
            temp=file.read()
            self.data+=temp.split('//par//')
            self.data+=temp.split('[topic]')
            self.data+=temp.split('//chap//')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokenized_data = self.tokenizer.encode_plus(
            self.data[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return tokenized_data.input_ids, tokenized_data.attention_mask

# Load pre-trained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token     #To make all the sentence in a batch of same length
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

# Freeze parameters of the model except for the last layer
for param in model.parameters():
    param.requires_grad = False
for param in model.transformer.h[-1].parameters():
    param.requires_grad = True

# Prepare dataset and dataloader
dataset = MyDataset('/content/drive/My Drive/Subjects/NLP/Innovative/Data/feynman-lectures_parts.txt', tokenizer)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Define training configuration
learning_rate = 5e-5
num_epochs = 30
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move model to device
model.to(device)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Fine-tune the model
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for inputs, attention_masks in dataloader:
        inputs = inputs.to(device)
        attention_masks = attention_masks.to(device)

        # Forward pass
        outputs = model(input_ids=inputs, attention_mask=attention_masks, labels=inputs)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}')

# Save the fine-tuned model
model.save_pretrained('fine_tuned_gpt2_model')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Epoch [1/30], Loss: 4.3050
Epoch [2/30], Loss: 3.9661
Epoch [3/30], Loss: 3.6489
Epoch [4/30], Loss: 3.3587
Epoch [5/30], Loss: 3.0668
Epoch [6/30], Loss: 2.8784
Epoch [7/30], Loss: 2.7632
Epoch [8/30], Loss: 2.6977
Epoch [9/30], Loss: 2.6630
Epoch [10/30], Loss: 2.6319
Epoch [11/30], Loss: 2.5956
Epoch [12/30], Loss: 2.5796
Epoch [13/30], Loss: 2.5539
Epoch [14/30], Loss: 2.5262
Epoch [15/30], Loss: 2.5045
Epoch [16/30], Loss: 2.4876
Epoch [17/30], Loss: 2.4568
Epoch [18/30], Loss: 2.4372
Epoch [19/30], Loss: 2.4053
Epoch [20/30], Loss: 2.3837
Epoch [21/30], Loss: 2.3603
Epoch [22/30], Loss: 2.3237
Epoch [23/30], Loss: 2.2973
Epoch [24/30], Loss: 2.2774
Epoch [25/30], Loss: 2.2390
Epoch [26/30], Loss: 2.2192
Epoch [27/30], Loss: 2.1885
Epoch [28/30], Loss: 2.1540
Epoch [29/30], Loss: 2.1244
Epoch [30/30], Loss: 2.0941


#Fine-Tuning part-2

In [None]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for inputs, attention_masks in dataloader:
        inputs = inputs.to(device)
        attention_masks = attention_masks.to(device)

        # Forward pass
        outputs = model(input_ids=inputs, attention_mask=attention_masks, labels=inputs)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}')

# Save the fine-tuned model
model.save_pretrained('/content/drive/My Drive/Subjects/NLP/Innovative/model/fine_tuned_gpt2_model')

Epoch [1/30], Loss: 2.0624
Epoch [2/30], Loss: 2.0255
Epoch [3/30], Loss: 1.9907
Epoch [4/30], Loss: 1.9698
Epoch [5/30], Loss: 1.9441
Epoch [6/30], Loss: 1.8975
Epoch [7/30], Loss: 1.8724
Epoch [8/30], Loss: 1.8328
Epoch [9/30], Loss: 1.7988
Epoch [10/30], Loss: 1.7608
Epoch [11/30], Loss: 1.7320
Epoch [12/30], Loss: 1.6966
Epoch [13/30], Loss: 1.6752
Epoch [14/30], Loss: 1.6480
Epoch [15/30], Loss: 1.6221
Epoch [16/30], Loss: 1.5750
Epoch [17/30], Loss: 1.5470
Epoch [18/30], Loss: 1.5352
Epoch [19/30], Loss: 1.4846
Epoch [20/30], Loss: 1.4670
Epoch [21/30], Loss: 1.4331
Epoch [22/30], Loss: 1.4038
Epoch [23/30], Loss: 1.3879
Epoch [24/30], Loss: 1.3560
Epoch [25/30], Loss: 1.3382
Epoch [26/30], Loss: 1.3113
Epoch [27/30], Loss: 1.2921
Epoch [28/30], Loss: 1.2562
Epoch [29/30], Loss: 1.2303
Epoch [30/30], Loss: 1.2163


#Working demo

In [None]:
def generate_text(prompt, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    attention_mask = torch.ones_like(input_ids).to(device)

    output = model.generate(input_ids, attention_mask=attention_mask, num_beams=5,
                            max_length=max_length, num_return_sequences=1,no_repeat_ngram_size=2,early_stopping=True)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)
    return '.'.join(generated_text.split('.')[:-1])+'.'

# Question answer loop
#  (Please ensure you are on GPU for effective communication)
while True:
  query=input('[User]:')
  print('[Feynman]:',generate_text(query))


KeyboardInterrupt: Interrupted by user

**Checking the model architecture (Not mandatory)**

In [None]:
# for name, param in model.named_parameters():
#   print(name)
#  # or
#     if 'specific_layer' in name:  # Replace 'specific_layer' with the name of the layer you want to fine-tune
#         param.requires_grad = True
#     else:
#         param.requires_grad = False

GPT2Block(
  (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
GPT2Block(
  (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
GPT2Block(
  (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  

#**Comparision between fine-tuned model an Naive GPT2**

#Preparing general GPT2

In [None]:
model2 = GPT2LMHeadModel.from_pretrained('gpt2-medium')

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
model2.to(device)
def generate_text2(prompt, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    attention_mask = torch.ones_like(input_ids).to(device)

    output = model2.generate(input_ids, attention_mask=attention_mask, num_beams=5,
                            max_length=max_length, num_return_sequences=1,no_repeat_ngram_size=2,early_stopping=True)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)
    return '.'.join(generated_text.split('.')[:-1])+'.'

In [None]:
prompt='Explain what is matter according to physics'

#Naive model

In [None]:
generate_text2(prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Explain what is matter according to physics. Explain how matter is created and how it interacts with other matter.\n\nWhat is the difference between matter and energy? Explain what energy is and why it is important to understand it.'

#Fine-Tuned model

In [None]:
generate_text(prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Explain what is matter according to physics.\nThe first thing to do is to understand the difference between a particle and a solid. A solid is made of matter—neither of which can be understood without some kind of being, which is what makes the two different. In the way of the things we saw before, matter is not a substance at all, but only a combination of substances which we call matter.'

#**Save model**

In [None]:
model.save_pretrained('/content/drive/My Drive/Subjects/NLP/Innovative/model/fine_tuned_gpt2_model')

#**Load Trained model**

#Some important libraries and initialization of tokenizers.

In [None]:
from transformers import GPT2LMHeadModel
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

#Loading the model

In [None]:
# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained('/content/drive/My Drive/Subjects/NLP/Innovative/model/fine_tuned_gpt2_model')

#Method for generating text

In [None]:
model.to(device)
def generate_text(prompt, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    attention_mask = torch.ones_like(input_ids).to(device)

    output = model.generate(input_ids, attention_mask=attention_mask, num_beams=5,
                            max_length=max_length, num_return_sequences=1,no_repeat_ngram_size=2,early_stopping=True)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)
    return '.'.join(generated_text.split('.')[:-1])+'.'

#Go on brrrrr.....................

In [None]:
generate_text('What is the motion of molecules of the water in ice? ')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'What is the motion of molecules of the water in ice? \n\nWater molecules move in a very simple way—by passing through a thin membrane of water molecules called a pore. Water molecules have a diameter of about 1/8 of an inch (1/16 of a millimeter). When they are moving through water, they bump up and down a membrane which acts as a sort of transducer.'

#Additional examples

In [None]:
generate_text('What is the theory of relativity? Explain both genera and special theory of relativity')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'What is the theory of relativity? Explain both genera and special theory of relativity.\nIn the early days of space-time, there was no such thing as general relativity—there was only general-relativistic mechanics. General relativity says that all things in the universe are made of the same three-dimensional things, and that there is only one known form of such things.'

In [None]:
generate_text('Explain me the concept of atomic structure')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Explain me the concept of atomic structure.\nIn the early days of physics, there was a great deal of interest in the idea of what we now call atomic mechanics. In the late 19th century, the work of Hermann Heisenberg and other physicists, first demonstrated the laws of heat, and then more recently, of the conservation of energy, were put to the test in a series of high-energy nuclear devices (called fissile material physics devices).'

In [None]:
generate_text('Explain electrons in atom')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Explain electrons in atom- and molecule-level chemistry\nLet us now consider the chemical processes that take place in an object when it is in motion. In the process of chemical reactions, the atoms are jiggled around in a myriad of possible combinations until they find their final form.\nIn a chemical reaction, a substance is combined with a large number of partners, and the product is then broken down by the various partners into smaller and smaller molecules.'

In [None]:
generate_text('how electrons revolves around proton ? ')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'how electrons revolves around proton? \n\nYes, it does. In the early days of physics, the equations of the atom, and indeed all of nature, were written in such a way that the number of electrons in an atom was proportional to the square of its age. This law was known as the "Eureka" law, after the famous experiment in which a beam of high-energy particles was deflected by a thin slab of graphite and back again.'

In [None]:
generate_text('what is biology ? ')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'what is biology??"\n\n"Well, biology is the study of living things. It is all about the processes that take place in the body of an animal, and how they affect its behavior. In this chapter, we shall consider the functions of the nervous system.\nIn the early days of biology, there was a great deal of interest in finding out what made the hairs on the back of our backs stand up.'

In [None]:
generate_text('who is Sahib Parmar ')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'who is Sahib Parmar?"\n\n"Yes, sir."\nLet us now consider what is meant by "sahib" in connection with the meaning of the word "parmar." It has been emphasized earlier that in the sense in which it is ordinarily employed, it has the same meaning as the one we have just used.'

In [None]:
generate_text2('who is Sahib Parmar ')  # A naive approach

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'who is Sahib Parmar \xa0and who is also a member of the BJP\'s national executive committee). He has also written a book on the history of Islam in India. He is the author of several books on Indian history, politics and religion. His latest book is titled "The Indian Muslim: A History of India\'s Muslim Minority" (Harvard University Press).'

In [None]:
generate_text('Explain what is molecules with its example ')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Explain what is molecules with its example \n\nmolecules are made of atoms, which are the building blocks of all living things.'