<a href="https://colab.research.google.com/github/Anjana2002/LLM/blob/main/Small_Language_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
!pip install torch torchtext transformers sentencepiece pandas tqdm datasets



In [32]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
import ast
import datasets
from tqdm import tqdm
import time

In [33]:
#load_data
data_sample = load_dataset("QuyenAnhDE/Diseases_Symptoms")

Repo card metadata block was not found. Setting CardData to empty.


In [34]:
data_sample

DatasetDict({
    train: Dataset({
        features: ['Code', 'Name', 'Symptoms', 'Treatments'],
        num_rows: 400
    })
})

In [35]:
updated_data = [{'Name': item['Name'], 'Symptoms': item['Symptoms']} for item in data_sample['train']]
df = pd.DataFrame(updated_data)
df.head()

Unnamed: 0,Name,Symptoms
0,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o..."
1,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue"
2,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck..."
3,Cryptorchidism,"Absence or undescended testicle(s), empty scro..."
4,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala..."


In [36]:
#extract the symptoms
df['Symptoms'] = df['Symptoms'].apply(lambda x: ','.join(x.split(',')))

In [37]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import numpy as np

In [38]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [39]:
device

device(type='cpu')

In [40]:
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2").to(device)

In [41]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [42]:
BATCH_SIZE = 8
df.describe()

Unnamed: 0,Name,Symptoms
count,400,400
unique,392,395
top,Sciatica,"Swelling, pain, dry mouth, bad taste"
freq,3,3


In [43]:
#dataset prep
class languageDataset(Dataset):
  def __init__(self, df, tokenizer):
    self.labels = df.columns
    self.data = df.to_dict(orient='records')
    self.tokenizer = tokenizer
    x = self.fittest_max_length(df)
    self.max_length = x

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    x = self.data[idx][self.labels[0]]
    y = self.data[idx][self.labels[1]]
    text = f'{x} | {y}'
    tokens = self.tokenizer.encode_plus(text, return_tensors = 'pt', max_length = 128, padding = 'max_length', truncation = True)
    return tokens

  def fittest_max_length(self, df):
    max_length = max(len(max(df[self.labels[0]], key=len)), len(max(df[self.labels[1]], key=len)))
    x = 2
    while x < max_length: x = x * 2
    return x

In [44]:
data_sample = languageDataset(df, tokenizer)
data_sample

<__main__.languageDataset at 0x78f6238c8430>

In [45]:
train_size = int(0.8 * len(data_sample))
val_size = len(data_sample) - train_size
train_data, val_data = random_split(data_sample, [train_size, val_size])

In [46]:
train_loader = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle = True)
val_loader = DataLoader(val_data, batch_size = BATCH_SIZE)

In [47]:
num_epochs = 4
batch_size = BATCH_SIZE
model_name = 'distilgpt2'
gpu = 0

In [48]:
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
tokenizer.pad_token = tokenizer.eos_token

In [49]:
results = pd.DataFrame(columns = ['epoch', 'transformer', 'batch_size', 'gpu','training_loss', 'validation_loss', 'epoch_duration_sec'])

In [50]:
for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    epoch_training_loss = 0
    train_iteration = tqdm(train_loader, desc=f'Training Epoch {epoch + 1}/{num_epochs} Batch Size {batch_size} Transformer: {model_name}')

    for batch in train_iteration:
        optimizer.zero_grad()
        inputs = batch['input_ids'].squeeze(1).to(device)
        targets = inputs.clone()
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_iteration.set_postfix({'Training loss': loss.item()})
        epoch_training_loss += loss.item()

    avg_epoch_training_loss = epoch_training_loss / len(train_loader)

    # Validation
    model.eval()
    epoch_validation_loss = 0
    total_loss = 0
    valid_iterator = tqdm(val_loader, desc=f"Validating Epoch {epoch + 1}/{num_epochs}")

    with torch.no_grad():
        for batch in valid_iterator:
            inputs = batch['input_ids'].squeeze(1).to(device)
            targets = inputs.clone()
            outputs = model(input_ids=inputs, labels=targets)
            loss = outputs.loss
            total_loss += loss.item()
            valid_iterator.set_postfix({'Validation loss': loss.item()})
            epoch_validation_loss += loss.item()

    avg_epoch_validation_loss = epoch_validation_loss / len(val_loader)
    end_time = time.time()
    epoch_duration_sec = end_time - start_time

    new_row = {
        'transformer': model_name,
        'batch_size': batch_size,
        'gpu': gpu,
        'epoch': epoch + 1,
        'training_loss': avg_epoch_training_loss,
        'validation_loss': avg_epoch_validation_loss,
        'epoch_duration_sec': epoch_duration_sec
    }
    results.loc[len(results)] = new_row

    print(f"Epoch: {epoch + 1}, Validation Loss: {total_loss / len(val_loader)}")


Training Epoch 1/4 Batch Size 8 Transformer: distilgpt2: 100%|██████████| 40/40 [08:05<00:00, 12.15s/it, Training loss=0.66]
Validating Epoch 1/4: 100%|██████████| 10/10 [00:38<00:00,  3.84s/it, Validation loss=0.46]


Epoch: 1, Validation Loss: 0.6390852451324462


Training Epoch 2/4 Batch Size 8 Transformer: distilgpt2: 100%|██████████| 40/40 [08:14<00:00, 12.36s/it, Training loss=0.45]
Validating Epoch 2/4: 100%|██████████| 10/10 [00:39<00:00,  3.92s/it, Validation loss=0.43]


Epoch: 2, Validation Loss: 0.6067761927843094


Training Epoch 3/4 Batch Size 8 Transformer: distilgpt2: 100%|██████████| 40/40 [08:07<00:00, 12.19s/it, Training loss=0.385]
Validating Epoch 3/4: 100%|██████████| 10/10 [00:38<00:00,  3.86s/it, Validation loss=0.445]


Epoch: 3, Validation Loss: 0.6319384217262268


Training Epoch 4/4 Batch Size 8 Transformer: distilgpt2: 100%|██████████| 40/40 [08:11<00:00, 12.29s/it, Training loss=0.396]
Validating Epoch 4/4: 100%|██████████| 10/10 [00:39<00:00,  3.99s/it, Validation loss=0.461]

Epoch: 4, Validation Loss: 0.6729913026094436





In [51]:
input_str = "Kidney Failure"
input_ids = tokenizer.encode(input_str, return_tensors="pt").to(device)
input_ids

tensor([[48374,  1681, 25743]])

In [52]:
output = model.generate(
    input_ids,
    max_length=20,
    num_return_sequences=1,
    do_sample = True,
    top_k = 8,
    top_p = 0.95,
    temperature = 0.5,
    repetition_penalty = 1.2,

    )

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [53]:
output

tensor([[48374,  1681, 25743,   930, 12301, 15212,    11,  1790,  1108,   286,
          8033,    11,  5801, 36051, 50256]])

In [54]:
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
decoded_output

'Kidney Failure | Fatigue, shortness of breath, rapid heartbeat'

In [55]:
torch.save(model, 'SmallDiseaseLM.pt')