In [1]:
import torch
import torch.nn as nn
from torch.nn.functional import cross_entropy
from torch.utils.data import DataLoader, Dataset

from gpt_arcitecture import GPTModel, GPTDataset

import tiktoken
import pandas as pd
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

if torch.cuda.is_available():
    torch.set_default_device("cuda")
device = torch.get_default_device()
generator = torch.Generator(device = device)

tokenizer = tiktoken.encoding_for_model("gpt2")

CONFIG = {
    "n_vocab": 50257,    # Vocabulary size
    "n_ctx": 256, # Context length
    "n_embd": 768,         # Embedding dimension
    "n_head": 12,          # Number of attention heads
    "n_layer": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

model = GPTModel(CONFIG)

train = True
epochs = 50

In [2]:
data_df = pd.read_csv("./archive/job_dataset.csv", encoding="utf-8")
data_df = data_df.sample(frac=1).reset_index(drop=True)


data_series = data_df["Title"]+"-" +data_df["ExperienceLevel"] + " : " +  data_df["Responsibilities"]

train_text = ""
val_text = ""

train_ratio = 0.99
split_idx = int(len(data_series)*train_ratio)
print(split_idx)

for row in data_series[:split_idx]:
    train_text += str(row) + "<|endoftext|>"

for row in data_series[split_idx:]:
    val_text += str(row) + "<|endoftext|>"

len(train_text), len(val_text)

1057


(302606, 3494)

In [3]:
batch_size = 5

train_data = GPTDataset(train_text, tokenizer, CONFIG["n_ctx"], CONFIG["n_ctx"])
val_data = GPTDataset(val_text, tokenizer, CONFIG["n_ctx"], CONFIG["n_ctx"])
train_loader = DataLoader(train_data, batch_size, shuffle = True, drop_last = False, generator= generator)
val_loader = DataLoader(val_data, batch_size, shuffle = True, drop_last = False, generator= generator)

for x,y in train_loader:
    print(x.shape, y.shape)
    break
print(len(train_loader))
print(len(val_loader))

torch.Size([5, 256]) torch.Size([5, 256])
43
1


In [4]:
def calculate_loss_batch(input_batch, target_batch, model):
    pred_batch = model(input_batch)
    loss = cross_entropy(pred_batch.flatten(0,1), target_batch.flatten())

    return loss

def calculate_loss_loader(data_loader, model, num_batches=None):
    total_loss = 0

    for i, (input_batch, target_batch) in enumerate(data_loader):
        loss = calculate_loss_batch(input_batch, target_batch, model)
        total_loss += loss.item()

    return total_loss/len(data_loader)


def get_pred(model, inputs, output_tokens = 1, sample = False):

    for i in range(output_tokens):
        with torch.no_grad():
            output = model(inputs)

        last_row = output[:,-1,:]
        probs = torch.softmax(last_row, dim=-1)

        if sample:
            output = torch.multinomial(probs,num_samples = 1)
        else:
            output = probs.argmax(dim=-1,keepdim=True)[0][0]
        inputs.append(output.item())

    return inputs


with torch.no_grad():
    val_loss = calculate_loss_loader(val_loader, model)
    train_loss = calculate_loss_loader(train_loader, model)


print(train_loss, val_loss)

torch.cuda.empty_cache()

11.026251704193825 11.003654479980469


In [5]:
# training loop
history = []
iteration = 0
optimizer = torch.optim.AdamW(model.parameters(), lr=4e-4, weight_decay=0.1)
sample_input = "Python Developer-Fresher :"

In [None]:
if not train:
    epochs = 0
for epoch in tqdm(range(epochs)):
    iteration += 1

    for input_batch, output_batch in train_loader:
        optimizer.zero_grad()

        loss = calculate_loss_batch(input_batch, output_batch, model)
        loss.backward()

        optimizer.step()

    with torch.no_grad():
        perplexity = torch.exp(loss).item()
        val_loss = calculate_loss_loader(val_loader, model)
        train_loss = calculate_loss_loader(train_loader, model)
        sample_output = tokenizer.decode(get_pred(model, tokenizer.encode(sample_input), 50, False))
    
    history.append({
        "Epoch" : iteration,
        "Training Loss": train_loss,
        "Validation Loss": val_loss,
        "Perplexity": perplexity,
        "Sample": sample_output
    })

    if (epoch+1) % 10 == 0:
        print(f"Epoch : {epoch+1} : Train Loss = {train_loss:.4f}, Validation Loss = {val_loss:.4f}, Perplexity = {perplexity:.2f}")
        print(f"Output : {sample_output}\n")

        torch.save(model.state_dict(), "./archive/job_model.pth")
        history_df = pd.DataFrame(history)
        history_df.to_csv("./archive/job_history.csv", index=False)

 18%|█▊        | 9/50 [02:02<09:15, 13.54s/it]

Epoch : 10 : Train Loss = 0.1440, Validation Loss = 2.7502, Perplexity = 1.30
Output : Python Developer-Fresher : Support data ingestion processes; Work with ML models using Python and mentor junior developers; Assist in documenting user stories; Learn cloud deployment and automation tools; Participate in version control with Matplotlib for data analysis; Document workflows and processes; Document research



 38%|███▊      | 19/50 [04:20<07:00, 13.56s/it]

Epoch : 20 : Train Loss = 0.0315, Validation Loss = 3.0962, Perplexity = 1.03
Output : Python Developer-Fresher : Assist in Python application development; Implement basic programming logic; Support web app development with Django/Flask; Perform data analysis using Pandas and NumPy; Create visualizations with Matplotlib; Write unit tests; Document code and processes; Collaborate



 58%|█████▊    | 29/50 [06:32<04:28, 12.76s/it]

Epoch : 30 : Train Loss = 0.0392, Validation Loss = 3.1522, Perplexity = 1.06
Output : Python Developer-Fresher : Assist in Python application development; Implement basic programming logic; Support web app development with Django/Flask; Perform data analysis using Pandas and NumPy; Create visualizations with Matplotlib; Write unit tests; Document code and processes; Collaborate



 78%|███████▊  | 39/50 [08:40<02:18, 12.59s/it]

Epoch : 40 : Train Loss = 0.0247, Validation Loss = 3.2520, Perplexity = 1.06
Output : Python Developer-Fresher : Assist in Python application development; Implement basic programming logic; Support web app development with Django/Flask; Perform data analysis using Pandas and NumPy; Create visualizations with Matplotlib; Write unit tests; Document code and processes; Collaborate



 98%|█████████▊| 49/50 [10:47<00:12, 12.59s/it]

Epoch : 50 : Train Loss = 0.0267, Validation Loss = 3.5261, Perplexity = 1.03
Output : Python Developer-Fresher : Assist in Python application development; Implement basic programming logic; Support web app development with Django/Flask; Perform data analysis using Pandas and NumPy; Create visualizations with Matplotlib; Write unit tests; Document code and processes; Collaborate



100%|██████████| 50/50 [11:02<00:00, 13.25s/it]


In [7]:
import plotly.express as px

fig = px.line(history_df, "Epoch", ["Training Loss", "Validation Loss"])
fig.show()
fig = px.line(history_df, "Epoch", "Perplexity")
fig.show()

In [8]:
sample_output = tokenizer.decode(get_pred(model, tokenizer.encode("AI Engineer-Experienced :"), 250, True))
for sample in sample_output.split("<|endoftext|>"):
    print(sample)

AI Engineer-Experienced : Lead advanced robotics projects; Integrate sensors, control systems, and AI; Simulate and validate robotic applications; Mentor engineering teams; Drive innovation in robotics technology
Cloud Engineer - Fresher-Entry-Level : Deploy cloud resources under guidance; Assist in containerization with Docker; Implement CI/CD pipelines for learning projects; Support cloud infrastructure deployments; Collaborate on academic or personal projects
 board
BI Analyst - Experienced- metrics and CI/CD pipelines; Oversee monitoring; Optimize business data pipelines; Assist with DevOps and optimize dashboards for decision-Fresher : Assist in designing basic datasets and decision- ■ and compliance; requirement gathering and feature engineering; Participate in CI/CD setup tasks; Learn blockchain platforms and review code/CD Drive innovation and learning models; Document workflows and principles and development teams
Web Developer-Fresher : Develop Learn developers and apply fron