# First attempt for concurrent API calls

In [3]:
# Import packages
import asyncio
import aiohttp
import ssl
import certifi
import pandas as pd
import json
import tiktoken
import requests
import matplotlib.pyplot as plt

In [4]:
# Read data (job advertisements)
# A relative path is resolved against the current working directory, so this
# loads the same file as f"{os.getcwd()}/wi_dataset.csv" would, without
# depending on `os`, which is not imported until a later cell (NameError on a
# fresh Restart & Run All).
df = pd.read_csv("wi_dataset.csv",
                 index_col=None,
                 header=0,
                 engine='python',
                 encoding='utf-8')

In [5]:
# Read taxonomy
tax = pd.read_excel("ISCO-08 EN Structure and definitions.xlsx")

In [6]:
# Clean data a bit to tokenize
df = df[df.description.notna()].copy()
df.description = df.description.astype(str)

In [7]:
# Count tokens using tiktoken (as ChatGPT does)

# Initialize the tokenizer for the GPT-3.5-turbo model
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Define function to count tokens in a description
def count_tokens(description):
    """Return how many tokens the module-level `tokenizer` produces for `description`."""
    return len(tokenizer.encode(description))

# Apply the function to the 'description' column
df['description_token_count'] = df['description'].apply(count_tokens)

In [8]:
# Get Level 1 taxonomies
level_1_titles = "\n".join([f"{row['ISCO 08 Code']}. {row['Title EN']}" for idx, row in tax[tax["Level"] == 1].iterrows()])
print(level_1_titles)

1. Managers
2. Professionals
3. Technicians and Associate Professionals
4. Clerical Support Workers
5. Service and Sales Workers
6. Skilled Agricultural, Forestry and Fishery Workers
7. Craft and Related Trades Workers
8. Plant and Machine Operators, and Assemblers
9. Elementary Occupations
0. Armed Forces Occupations


In [9]:
# Get dataframe sample (n=10)
# Fixed seed so every run draws the same rows; without it the sample changes
# on each execution and classification results cannot be compared across runs.
df_sample = df.sample(10, random_state=42)
print(df_sample.description)

110111    Oferta pracy Pracuj.pl Wygodniej z aplikacją P...
109498    Το ξενοδοχείο 5* Fodele Beach & Water Park Hol...
92354     Humniska, powiat: brzozowski, woj: podkarpacki...
32505     Voltar à listagem Reportar anúncio Se tem dúvi...
103477    Descriere Caut bona pe termen lung pentru un b...
14623     Key solutions växer så det knakar. Vi söker nu...
18012     Lavoro - Annunci di lavoro - Offerte di lavoro...
77975     Kunniga och engagerade medarbetare med goda fö...
35286     Vloeiende kennis Nederlands en Engels (kennis ...
1018      Το εστιατόριο Dos Hermanos στο κέντρο της Κηφι...
Name: description, dtype: object


In [40]:
# Function to create the prompt
def create_prompt(desc):
    """Build the classification prompt for one job description.

    The prompt lists the job types from the module-level `level_1_titles`,
    embeds `desc` in double quotes, and ends with the required answer format.
    """
    sections = [
        # Intro line followed directly by the list of job types
        "We have the following 10 job types based on the ISCO-08 classification:\n"
        + f"{level_1_titles}",
        "Given the following job description, please classify it into one of the above job types by providing the corresponding number and job type:",
        f"Job Description: \"{desc}\"",
        "Your answer needs to have strictly this format, and no additional output: '<classnumber>. <jobclass>'",
    ]
    # Sections are separated by one blank line
    return "\n\n".join(sections)

# Apply the function to create a new column 'prompt'
df_sample['prompt'] = df_sample['description'].apply(create_prompt)

In [48]:
# Check
print(df_sample['prompt'].iloc[0])

We have the following 10 job types based on the ISCO-08 classification:
1. Managers
2. Professionals
3. Technicians and Associate Professionals
4. Clerical Support Workers
5. Service and Sales Workers
6. Skilled Agricultural, Forestry and Fishery Workers
7. Craft and Related Trades Workers
8. Plant and Machine Operators, and Assemblers
9. Elementary Occupations
0. Armed Forces Occupations

Given the following job description, please classify it into one of the above job types by providing the corresponding number and job type:

Job Description: "Oferta pracy Pracuj.pl Wygodniej z aplikacją Pracuj.pl Przeglądaj oferty i łatwo aplikuj Oferty pracy Porady Oferty pracy IT Profile pracodawców Kreator CV Zarobki Dla firm Pracuj.pl Zaloguj się Załóż konto Dla firm Str. główna Szukaj ofert Zapisane Konto Praca Warszawa Radom Płock Siedlce Pruszków Ostrołęka Kraków Łódź Wrocław IT - Administracja IT - Rozwój oprogramowania Administrowanie systemami Architektura Programowanie Aplikuj Aplikuj Zap

In [41]:
import os
import openai
import pandas as pd
import concurrent.futures
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Set your OpenAI API key
# os.environ is a mapping, not a function: calling it raises TypeError.
# .get() returns None when the variable is missing, so the check below can
# report the problem with a clear message.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

if not OPENAI_API_KEY:
    raise ValueError("OpenAI API key not found. Make sure it's defined in your .env file.")

openai.api_key = OPENAI_API_KEY


In [42]:
# Function to get job class from OpenAI
def get_job_class(prompt):
    """Send `prompt` to gpt-3.5-turbo and return the reply text, stripped.

    Any exception raised while calling the API or reading the response is
    printed and swallowed; in that case None is returned.
    """
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=chat_messages,
            temperature=0,
        )
        return completion.choices[0].message['content'].strip()
    except Exception as err:
        print(f"Failed to get response from OpenAI: {err}")
        return None

# Function to process a batch of prompts
def process_batch(prompts):
    """Run get_job_class on every prompt in `prompts`, returning the answers as a list in input order."""
    return list(map(get_job_class, prompts))

In [46]:
# Set number of workers for concurrent processing
num_workers = 5

# Get prompts
prompts = df_sample['prompt'].tolist()

# Splitting the prompts into batches
# Clamp to 1: with fewer prompts than workers the floor division yields 0,
# and range() rejects a zero step.
chunk_size = max(1, len(prompts) // num_workers)
chunks = [prompts[i:i + chunk_size] for i in range(0, len(prompts), chunk_size)]

# executor.map yields the batch results in the order the batches were passed
# in. Collecting futures with as_completed would append them in completion
# order, so the answers would no longer line up with the rows of df_sample
# when assigned positionally below.
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    for chunk_results in executor.map(process_batch, chunks):
        results.extend(chunk_results)

# Update the DataFrame with the job classifications
df_sample['jobclass'] = results

In [44]:
pd.options.display.max_colwidth = 100
from IPython.display import display, HTML

In [47]:
# Render an HTML table with two columns per row: the description (cut to its
# first 50 characters plus "..." when longer than 50) and the returned jobclass.
display(HTML(df_sample.apply(lambda x: pd.Series({'description': f"{x['description'][:50]}..." if len(x['description']) > 50 else x['description'], 'jobclass': x['jobclass']}), axis=1).to_html()))

Unnamed: 0,description,jobclass
110111,Oferta pracy Pracuj.pl Wygodniej z aplikacją Pracu...,9. Elementary Occupations
109498,Το ξενοδοχείο 5* Fodele Beach & Water Park Holiday...,5. Service and Sales Workers
92354,"Humniska, powiat: brzozowski, woj: podkarpackie\nPo...",This job description falls under job type 4. Clerical Support Workers.
32505,Voltar à listagem Reportar anúncio Se tem dúvidas ...,3. Technicians and Associate Professionals
103477,Descriere Caut bona pe termen lung pentru un baiet...,3. Technicians and Associate Professionals
14623,Key solutions växer så det knakar. Vi söker nu en ...,3. Technicians and Associate Professionals
18012,Lavoro - Annunci di lavoro - Offerte di lavoro - M...,3. Technicians and Associate Professionals
77975,Kunniga och engagerade medarbetare med goda föruts...,2. Professionals
35286,Vloeiende kennis Nederlands en Engels (kennis Fran...,4. Clerical Support Workers
1018,Το εστιατόριο Dos Hermanos στο κέντρο της Κηφισιάς...,4. Clerical Support Workers


#

# Conclusions

- Concurrent requests work with openai==0.28
- Results seem highly non-deterministic across runs
- Maybe translate the descriptions to English first

# With Google API translations

In [71]:
%pip install -U deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting beautifulsoup4<5.0.0,>=4.9.1 (from deep-translator)
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4<5.0.0,>=4.9.1->deep-translator)
  Downloading soupsieve-2.5-py3-none-any.whl.metadata (4.7 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.9/147.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading soupsieve-2.5-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, deep-translator
Successfully installed beautifulsoup4-4.12.3 deep-translator-1.11.4 soupsieve-2.5
Note: you may need to restart the kernel to use u

In [76]:
from deep_translator import GoogleTranslator

def func_t(txt):
    """Translate `txt` to English with GoogleTranslator, using source='auto'."""
    translator = GoogleTranslator(source='auto', target='en')
    return translator.translate(txt)

In [78]:
# Just whatever to test
df_sample["description_translated"] = df_sample.description.apply(lambda x: func_t(x))

In [79]:
df_sample["description_translated"]

110111    Job offer Pracuj.pl More convenient with the Pracuj.pl application Browse offers and apply easil...
109498    The 5* hotel Fodele Beach & Water Park Holiday Resort in Fodele, Heraklion, Crete wishes to hire...
92354     Humniska, Brzozowski district, Podkarpackie province\nOther office staff\nOTHER OFFICE SERVICE E...
32505     Return to list Report ad If you have doubts about the veracity of this ad, tell us the reason fo...
103477    Description I am looking for a long-term nanny for a 1.3-month-old boy. Location: Iulius Mall ar...
14623     Key solutions is growing like crazy. We are now looking for a new field salesperson who loves to...
18012     Work - Job adverts - Job offers - CV Market To continue browsing our portal, contact +370 5 219 ...
77975     Knowledgeable and committed employees with good conditions are one of the most important success...
35286     Fluent knowledge of Dutch and English (knowledge of French is a strong asset!) Analytical and pr...
1018      

In [86]:
df_sample['prompt'] = df_sample['description_translated'].apply(create_prompt)

In [87]:
# Set number of workers for concurrent processing
num_workers = 5

# Get prompts
prompts = df_sample['prompt'].tolist()

# Splitting the prompts into batches
# Clamp to 1: with fewer prompts than workers the floor division yields 0,
# and range() rejects a zero step.
chunk_size = max(1, len(prompts) // num_workers)
chunks = [prompts[i:i + chunk_size] for i in range(0, len(prompts), chunk_size)]

# executor.map yields the batch results in the order the batches were passed
# in. Collecting futures with as_completed would append them in completion
# order, so the answers would no longer line up with the rows of df_sample
# when assigned positionally below.
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    for chunk_results in executor.map(process_batch, chunks):
        results.extend(chunk_results)

# Update the DataFrame with the job classifications
df_sample['jobclass'] = results

In [88]:
# Render an HTML table with two columns per row: the translated description
# (cut to its first 50 characters plus "..." when longer than 50) and jobclass.
display(HTML(df_sample.apply(lambda x: pd.Series({'description_translated': f"{x['description_translated'][:50]}..." if len(x['description_translated']) > 50 else x['description_translated'], 'jobclass': x['jobclass']}), axis=1).to_html()))

Unnamed: 0,description_translated,jobclass
110111,Job offer Pracuj.pl More convenient with the Pracu...,3. Technicians and Associate Professionals
109498,The 5* hotel Fodele Beach & Water Park Holiday Res...,5. Service and Sales Workers
92354,"Humniska, Brzozowski district, Podkarpackie provin...",4. Clerical Support Workers
32505,Return to list Report ad If you have doubts about ...,3. Technicians and Associate Professionals
103477,Description I am looking for a long-term nanny for...,3. Technicians and Associate Professionals
14623,Key solutions is growing like crazy. We are now lo...,4. Clerical Support Workers
18012,Work - Job adverts - Job offers - CV Market To con...,5. Service and Sales Workers
77975,Knowledgeable and committed employees with good co...,5. Service and Sales Workers
35286,Fluent knowledge of Dutch and English (knowledge o...,4. Clerical Support Workers
1018,The Dos Hermanos restaurant in the center of Kifis...,2. Professionals


try again

In [89]:
# Set number of workers for concurrent processing
num_workers = 5

# Get prompts
prompts = df_sample['prompt'].tolist()

# Splitting the prompts into batches
# Clamp to 1: with fewer prompts than workers the floor division yields 0,
# and range() rejects a zero step.
chunk_size = max(1, len(prompts) // num_workers)
chunks = [prompts[i:i + chunk_size] for i in range(0, len(prompts), chunk_size)]

# executor.map yields the batch results in the order the batches were passed
# in. Collecting futures with as_completed would append them in completion
# order, so the answers would no longer line up with the rows of df_sample
# when assigned positionally below.
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    for chunk_results in executor.map(process_batch, chunks):
        results.extend(chunk_results)

# Update the DataFrame with the job classifications
df_sample['jobclass'] = results

In [90]:
# Render an HTML table with two columns per row: the translated description
# (cut to its first 50 characters plus "..." when longer than 50) and jobclass.
display(HTML(df_sample.apply(lambda x: pd.Series({'description_translated': f"{x['description_translated'][:50]}..." if len(x['description_translated']) > 50 else x['description_translated'], 'jobclass': x['jobclass']}), axis=1).to_html()))

Unnamed: 0,description_translated,jobclass
110111,Job offer Pracuj.pl More convenient with the Pracu...,3. Technicians and Associate Professionals
109498,The 5* hotel Fodele Beach & Water Park Holiday Res...,5. Service and Sales Workers
92354,"Humniska, Brzozowski district, Podkarpackie provin...",4. Clerical Support Workers
32505,Return to list Report ad If you have doubts about ...,3. Technicians and Associate Professionals
103477,Description I am looking for a long-term nanny for...,4. Clerical Support Workers
14623,Key solutions is growing like crazy. We are now lo...,3. Technicians and Associate Professionals
18012,Work - Job adverts - Job offers - CV Market To con...,5. Service and Sales Workers
77975,Knowledgeable and committed employees with good co...,5. Service and Sales Workers
35286,Fluent knowledge of Dutch and English (knowledge o...,3. Technicians and Associate Professionals
1018,The Dos Hermanos restaurant in the center of Kifis...,4. Clerical Support Workers


# Slightly better, but the results still differ between runs... Going from Level 1 to Level 4 does not work!!!