In [11]:
!pip install transformers sentencepiece




In [12]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device:", device)

# One checkpoint name drives both the tokenizer and the seq2seq model so the
# two always come from the same pretrained release.
model_name = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Place the weights on the selected device; inputs must be moved there too.
model = model.to(device)


Using device: cuda


In [13]:
# TESTING

import time

job_ad = "We are looking for a software developer to join our team in our downtown office in New York. Must be available to work in person 5 days a week."

# Format the prompt
prompt = f"Classify the work arrangement of the following job ad: {job_ad}"

# CUDA kernels are launched asynchronously: drain any pending GPU work before
# starting the clock so earlier cells are not billed to this measurement.
if device.type == 'cuda':
    torch.cuda.synchronize()
# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Tokenize and move input_ids to the same device as the model
input_ids = tokenizer.encode(prompt, return_tensors='pt', truncation=True).to(device)

# Generate output with beam search
job_ids = model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)

# Wait for the GPU to actually finish before stopping the clock; otherwise the
# interval may only cover the time needed to enqueue the work.
if device.type == 'cuda':
    torch.cuda.synchronize()
stop_time = time.perf_counter()
execution_time = stop_time - start_time
print("Execution time:", execution_time, "seconds")

# Decode the first returned sequence and print it
print("Summary:", tokenizer.decode(job_ids[0], skip_special_tokens=True))

Execution time: 0.18985843658447266 seconds
Summary: in person


In [14]:
import os
from google.colab import drive

# Mount Google Drive so the dataset folder is reachable from the Colab VM.
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/AB_job_data_files'

# Map every entry name in the data folder to its absolute path.
file_location = {}
# isdir (rather than exists) so a regular file at this path is reported as
# missing instead of making os.chdir raise.
if os.path.isdir(folder_path):
  # Switch the working directory so relative paths used after this cell
  # resolve inside the data folder.
  os.chdir(folder_path)
  # List the folder explicitly instead of relying on the current directory.
  for f in os.listdir(folder_path):
    file_location[f] = os.path.join(folder_path, f)

else:
  print(f"Folder not found: {folder_path}")

print(file_location)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
{'seniority_labelled_test_set.csv': '/content/drive/MyDrive/AB_job_data_files/seniority_labelled_test_set.csv', 'unlabelled_development_set.csv': '/content/drive/MyDrive/AB_job_data_files/unlabelled_development_set.csv', 'salary_labelled_development_set.csv': '/content/drive/MyDrive/AB_job_data_files/salary_labelled_development_set.csv', 'seniority_labelled_development_set.csv': '/content/drive/MyDrive/AB_job_data_files/seniority_labelled_development_set.csv', 'work_arrangements_test_set.csv': '/content/drive/MyDrive/AB_job_data_files/work_arrangements_test_set.csv', 'salary_labelled_test_set.csv': '/content/drive/MyDrive/AB_job_data_files/salary_labelled_test_set.csv', 'work_arrangements_development_set.csv': '/content/drive/MyDrive/AB_job_data_files/work_arrangements_development_set.csv', 'results': '/content/drive/MyDrive/AB_job_data_files/results', 'wandb

In [15]:
def work_arrang(job_ad):
    """Ask the model for the work arrangement described by a job ad.

    The ad text is embedded in an instruction prompt, encoded with the
    notebook-level ``tokenizer``, and passed to the notebook-level ``model``
    for beam-search generation on ``device``.

    Args:
        job_ad: Job advertisement text to classify.

    Returns:
        The first generated sequence decoded to text, with special tokens
        removed. The text is returned as generated; this function does not
        normalise it.
    """
    query = f"Classify the work arrangement of the following job ad as one of the following: on-site, Remote, or Hybrid: {job_ad}"

    # Encode to a PyTorch tensor (with truncation enabled) and move it to the model's device.
    encoded = tokenizer.encode(query, return_tensors='pt', truncation=True)
    encoded = encoded.to(device)

    generated = model.generate(
        encoded,
        max_length=50,
        num_beams=4,
        early_stopping=True,
    )

    # Only the first returned sequence is decoded.
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

In [16]:
# DEV set

import pandas as pd

# Load the labelled development split from the path discovered earlier
work_arrangement_dev = pd.read_csv(file_location['work_arrangements_development_set.csv'])

# Parallel lists of predictions and gold labels, plus a running hit counter
y_pred, y_true = [], []
correct = 0

# Classify every ad and echo each prediction next to its gold label
for ad_text, gold_label in zip(work_arrangement_dev['job_ad'], work_arrangement_dev['y_true']):
    guess = work_arrang(ad_text)
    # The prompt offers "on-site"; rewrite it to the "OnSite" spelling before comparing
    guess = "OnSite" if guess == "on-site" else guess
    is_match = guess == gold_label

    y_pred.append(guess)
    y_true.append(gold_label)

    print(guess, '----', gold_label)
    print(is_match)

    if is_match:
        correct += 1

# Overall accuracy across the development set
total = len(work_arrangement_dev)
accuracy = correct / total
print(f"count: {total}")
print(f"correctly identified: {correct}")
print(f"accuracy: {accuracy}")

Remote ---- Remote
True
Remote ---- Remote
True
Remote ---- Hybrid
False
OnSite ---- OnSite
True
OnSite ---- OnSite
True
Remote ---- OnSite
False
OnSite ---- Hybrid
False
OnSite ---- OnSite
True
OnSite ---- Hybrid
False
OnSite ---- Remote
False
OnSite ---- OnSite
True
Remote ---- OnSite
False
OnSite ---- Hybrid
False
Remote ---- Remote
True
OnSite ---- Hybrid
False
OnSite ---- OnSite
True
OnSite ---- Remote
False
OnSite ---- OnSite
True
OnSite ---- Remote
False
OnSite ---- OnSite
True
Remote ---- Remote
True
OnSite ---- Remote
False
Remote ---- Remote
True
OnSite ---- OnSite
True
OnSite ---- Remote
False
Remote ---- Remote
True
OnSite ---- OnSite
True
Remote ---- Remote
True
OnSite ---- OnSite
True
Remote ---- Remote
True
OnSite ---- Remote
False
Remote ---- Remote
True
OnSite ---- Hybrid
False
OnSite ---- Hybrid
False
OnSite ---- Remote
False
OnSite ---- OnSite
True
OnSite ---- OnSite
True
OnSite ---- OnSite
True
Remote ---- Hybrid
False
Remote ---- OnSite
False
OnSite ---- Remote
Fal

In [17]:
# TEST set


# Read the test set
work_arrangement_test = pd.read_csv(file_location['work_arrangements_test_set.csv'])

# Lists to hold predictions and true labels
y_pred = []
y_true = []

correct = 0

# Prediction loop
for index, row in work_arrangement_test.iterrows():
    predicted_label = work_arrang(row['job_ad'])

    # Convert the prompt's "on-site" wording to the "OnSite" spelling that is
    # compared against y_true.
    if predicted_label == "on-site":
        predicted_label = "OnSite"

    true_label = row['y_true']
    match = predicted_label == true_label

    # Append the prediction and true label
    y_pred.append(predicted_label)
    y_true.append(true_label)

    if match:
        correct += 1

# Calculate and print overall accuracy.
# The denominator must be the size of the test set being evaluated here, not
# the development set, or the count and accuracy are wrong whenever the two
# sets differ in size.
total = len(work_arrangement_test)
accuracy = correct / total
print(f"count: {total}")
print(f"correctly identified: {correct}")
print(f"accuracy: {accuracy}")

count: 99
correctly identified: 54
accuracy: 0.5454545454545454
