In [7]:
from sklearn.model_selection import train_test_split
import pandas as pd


# Location of the raw applicant dataset.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# local copy of the CSV before running the notebook on another machine.
RAW_DATA_PATH = "/Users/florianhaglsperger/Desktop/Coding/application_tracking/job_applicant_dataset.csv"

data = pd.read_csv(RAW_DATA_PATH)

# Fixed random_state keeps the 80/20 split identical across re-runs.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Hand the file names to pandas instead of a handle from open(..., "w").
# A text handle opened that way uses the platform's default encoding and
# newline translation, whereas pandas writes UTF-8 with newline="" when it
# opens the file itself. That matches the defaults pd.read_csv uses when
# these files are loaded again later in the notebook.
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

In [19]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np
from transformers import GPT2Tokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

class JobsDataset(Dataset):
    """Map-style dataset of (job description, resume, age) -> best-match label.

    The CSV is read once at construction time. The two text columns are
    tokenized lazily, every time an item is requested.

    Args:
        tokenizer: Object exposing ``encode(text)`` that returns a list of
            token ids.
        path: CSV file (path or buffer accepted by ``pd.read_csv``) holding
            the columns ``Job Description``, ``Resume``, ``Age`` and
            ``Best Match``.
    """

    def __init__(self, tokenizer, path):
        super().__init__()
        self.tokenizer = tokenizer

        self.data = pd.read_csv(path)
        # Keep each column as a plain Python list so item access is a simple
        # positional lookup that yields native Python values.
        self.job_description = self.data['Job Description'].tolist()
        self.resume = self.data['Resume'].tolist()
        self.age = self.data['Age'].tolist()
        self.best_match = self.data['Best Match'].tolist()

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.job_description)

    def __getitem__(self, idx):
        """Return one sample as ``[[description_ids, resume_ids, age], [best_match]]``.

        ``description_ids`` and ``resume_ids`` are whatever
        ``tokenizer.encode`` returns for the row's text; they are not padded
        or truncated here.
        """
        description = self.job_description[idx]
        resume = self.resume[idx]
        age = self.age[idx]
        best_match = self.best_match[idx]

        description_encoding = self.tokenizer.encode(description)
        resume_encoding = self.tokenizer.encode(resume)

        return [
            [description_encoding, resume_encoding, age],
            [best_match]
        ]
    
def coallate_fn(batch):
    """Collate samples shaped like ``JobsDataset.__getitem__`` output into batch tensors.

    Args:
        batch: Sequence of samples, each
            ``[[description_ids, resume_ids, age], [best_match]]``.

    Returns:
        ``([description, resume, age], best_match)`` where ``description`` and
        ``resume`` are int64 tensors of shape (batch, longest sequence in the
        batch), right-padded with 0, and ``age`` / ``best_match`` are float32
        tensors of shape (batch,).
    """
    inputs, targets = zip(*batch)
    description_encoding, resume_encoding, age = zip(*inputs)
    best_match = [target[0] for target in targets]

    # Force an integer dtype. Without it an empty token list becomes a float32
    # tensor, and pad_sequence may take the output dtype from that sequence,
    # producing a float batch that nn.Embedding rejects.
    description_encoding = [torch.tensor(t, dtype=torch.long) for t in description_encoding]
    resume_encoding = [torch.tensor(t, dtype=torch.long) for t in resume_encoding]
    age = torch.tensor(age, dtype=torch.float32)
    best_match = torch.tensor(best_match, dtype=torch.float32)

    # Right-pad every sequence to the longest one in this batch.
    description_encoding = pad_sequence(description_encoding, batch_first=True, padding_value=0)
    resume_encoding = pad_sequence(resume_encoding, batch_first=True, padding_value=0)
    return [description_encoding, resume_encoding, age], best_match

# Training split: tokenized on access with the pretrained GPT-2 tokenizer and
# served in shuffled, padded mini-batches of 32 samples.
dataset = JobsDataset(
    tokenizer=GPT2Tokenizer.from_pretrained('gpt2'),
    path="train_data.csv",
)
train_loader = DataLoader(
    dataset=dataset,
    collate_fn=coallate_fn,
    shuffle=True,
    batch_size=32,
)

In [27]:
class JobMatching(nn.Module):
    """Classifier over a (job description, resume, age) triple.

    Both texts go through the same embedding, LSTM and three-layer MLP, so the
    two text branches share all of their weights. The age goes through its own
    small linear layer. The three feature vectors are concatenated and mapped
    to ``output_size`` logits.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_size: Dimension of each token embedding.
        hidden_size: LSTM hidden size; the MLP narrows it to
            ``hidden_size // 8`` features per text.
        output_size: Number of output logits.
        pad_token_id: Token id used to right-pad batches. The text feature is
            read from the last timestep whose token differs from this id, so
            it does not depend on how much padding a batch happens to add.
            Pass ``None`` to always read the final timestep.
            NOTE(review): the default 0 matches the ``padding_value`` used by
            the collate function in this notebook, but 0 may also be a genuine
            vocabulary id for the tokenizer in use; a text that really ends in
            that token would be read one step early. Confirm this is
            acceptable or pad with a dedicated id.
    """

    def __init__(self, vocab_size, embed_size, hidden_size = 256, output_size = 2, pad_token_id = 0):
        super(JobMatching, self).__init__()

        self.pad_token_id = pad_token_id

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, hidden_size //2)
        self.fc2 = nn.Linear(hidden_size //2, hidden_size //4)
        self.fc3 = nn.Linear(hidden_size //4, hidden_size //8)

        self.age_fc = nn.Linear(1, 8)

        # Two text feature vectors (hidden_size // 8 each) plus 8 age features.
        self.final_fc = nn.Linear(hidden_size //8 + hidden_size // 8 + 8 , output_size)

        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(0.5)

    def _last_valid_state(self, lstm_out, tokens):
        """Pick, per row, the LSTM output at the last non-padding timestep."""
        if self.pad_token_id is None:
            return lstm_out[:, -1, :]

        positions = torch.arange(tokens.size(1), device=tokens.device)
        is_real = tokens.ne(self.pad_token_id)
        # Largest position holding a real token; rows made only of padding
        # fall back to position 0.
        last_index = (is_real * positions).max(dim=1).values
        rows = torch.arange(tokens.size(0), device=tokens.device)
        return lstm_out[rows, last_index]

    def _encode_text(self, tokens):
        """Turn a (batch, seq) tensor of token ids into (batch, hidden_size // 8) features."""
        embedded = self.embedding(tokens)
        lstm_out, _ = self.lstm(embedded)
        feat = self._last_valid_state(lstm_out, tokens)
        feat = self.dropout(self.gelu(self.fc1(feat)))
        feat = self.dropout(self.gelu(self.fc2(feat)))
        feat = self.dropout(self.gelu(self.fc3(feat)))
        return feat

    def forward(self, description, resume, age):
        """Compute match logits.

        Args:
            description: Integer tensor (batch, seq_desc) of token ids.
            resume: Integer tensor (batch, seq_resume) of token ids.
            age: Float tensor (batch,) of ages.

        Returns:
            Tensor (batch, output_size) of unnormalized logits.
        """
        # Description and resume share the same encoder weights.
        desc_feat = self._encode_text(description)
        resume_feat = self._encode_text(resume)

        # Age branch: (batch,) -> (batch, 1) so it can feed a Linear layer.
        age = age.unsqueeze(1)
        age_feat = self.gelu(self.age_fc(age))
        age_feat = self.dropout(age_feat)

        combined = torch.cat((desc_feat, resume_feat, age_feat), dim=1)
        output = self.final_fc(combined)
        return output
    
# Build the classifier with an embedding table sized to the GPT-2 vocabulary,
# and train it with cross-entropy on the two-class best-match label.
model = JobMatching(vocab_size=GPT2Tokenizer.from_pretrained("gpt2").vocab_size, embed_size=128)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

# The epoch counter and the batch loop no longer share the name `i`; the
# batch index was never used, so the enumerate() wrapper is dropped as well.
for epoch in range(num_epochs):
    model.train()
    for batch_inputs, labels in train_loader:
        description, resume, age = batch_inputs
        optimizer.zero_grad()
        outputs = model(description, resume, age)
        # The collate function yields float labels; CrossEntropyLoss needs
        # integer class indices.
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()

    print("Epoch finished")
        

Epoch finished
Epoch finished
Epoch finished
Epoch finished
Epoch finished
Epoch finished
Epoch finished
Epoch finished
Epoch finished
Epoch finished


In [None]:
# Score the trained model on the held-out split. eval() switches dropout off
# and no_grad() skips gradient bookkeeping during inference.
model.eval()
test_dataset = JobsDataset(
    tokenizer=GPT2Tokenizer.from_pretrained('gpt2'),
    path="test_data.csv",
)
test_loader = DataLoader(
    dataset=test_dataset,
    collate_fn=coallate_fn,
    shuffle=False,
    batch_size=32,
)

with torch.no_grad():
    total = 0
    correct = 0
    for (description, resume, age), labels in test_loader:
        outputs = model(description, resume, age)
        # Predicted class is the index of the largest logit in each row.
        predicted = outputs.argmax(dim=1)
        correct += predicted.eq(labels).sum().item()
        total += labels.shape[0]

    print(f'Test Accuracy: {100 * correct / total}%')

TypeError: JobsDataset.__init__() got an unexpected keyword argument 'embed_size'