# Read subsector occupations CSV

# For each occupation, read the job_description of every job post, extract its competences with the LLM, add them as a new column, and save the result as <occupation>_competences_llm.csv

In [7]:
import os
import pandas as pd
import openai
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.globals import set_debug
import tiktoken

In [3]:
# OpenAI API key
#
# SECURITY: the key must never be hard-coded in the notebook. Any key that was
# ever saved in this file has to be treated as leaked and revoked from the
# OpenAI dashboard.
#
# The key is taken from the OPENAI_API_KEY environment variable. When the
# variable is missing (or empty) it is requested interactively with getpass,
# so the value is neither echoed nor stored in the saved notebook.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [4]:
# Snapshot of the key currently exported in the environment.
# NOTE: the value is None when OPENAI_API_KEY is not set.
OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY")

In [12]:
# Extract competences from every job post of every occupation in the subsector.
#
# Input : `occ_csv` (column "OccupationQS") and, for each occupation,
#         ./infocomm/<batch_date>-raw/<occupation>.csv with a
#         "job_description" column.
# Output: ./infocomm/<batch_date>-processed/<occupation>_competences_llm.csv,
#         i.e. the raw rows plus a "competences_llm" column holding the LLM
#         answer (or "Job description not found" for unusable descriptions).
occ_csv = "./infocomm/infocomm_occupations.csv"
batch_date = "7-junio-sfco"

# Maximum number of tokens of a job description sent to the model. It is kept
# below the 4096-token context mentioned for the model so that the question
# and the completion still fit in the prompt.
MAX_DOCUMENT_TOKENS = 3700

#question = "List all the competences, skills, abilities and knowledge found in the text."
question = "Provide a concise list of competences, skills, abilities, and knowledge mentioned in the text. Split lengthy items into smaller components. Your response should solely consist of a list of competences. If an item contains commas, break it down into separate entities."

# The prompt text is kept exactly as it was originally sent to the model
# (including the leading spaces of the second line).
template = (
    " From the given document: {document}\n"
    "        Question: {question}"
)


def _truncate_to_token_budget(text, tokenizer, max_tokens):
    """Return *text* cut down to at most *max_tokens* tokens of *tokenizer*."""
    tokens = tokenizer.encode(text)
    if len(tokens) <= max_tokens:
        return text
    print("Truncating jobpost")
    # Decode the kept tokens back to a string.
    return tokenizer.decode(tokens[:max_tokens])


def _clean_description(raw):
    """Return the stripped text of a job_description cell ('' when missing)."""
    # Cells may be NaN (missing) or non-string values; never call .strip()
    # on them directly.
    if pd.isna(raw):
        return ""
    return str(raw).strip()


# Everything below is independent of the occupation/job post, so it is built
# once instead of on every iteration.
set_debug(False)
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature = 0)
prompt = PromptTemplate(template=template, input_variables=["question", "document"])
llm_chain = LLMChain(prompt=prompt, llm=llm)
tokenizer = tiktoken.get_encoding("cl100k_base")

occs_df = pd.read_csv(occ_csv)

# Make sure the destination folder exists before the (slow, paid) LLM calls,
# otherwise to_csv would fail after all the work has been done.
output_dir = f"./infocomm/{batch_date}-processed"
os.makedirs(output_dir, exist_ok=True)

for occ in occs_df["OccupationQS"]:
    csv_file_string = f"./infocomm/{batch_date}-raw/{occ}.csv"

    print(f"Trying with: {csv_file_string}")

    # Any failure to load the raw file skips this occupation. Previously only
    # the "empty file" case was skipped; a missing or unreadable file fell
    # through and crashed with a KeyError on the empty DataFrame.
    try:
        # Read the CSV file into a DataFrame
        occ_df = pd.read_csv(csv_file_string)
    except pd.errors.EmptyDataError:
        print("The CSV file is empty.")
        continue
    except FileNotFoundError:
        print("File not found.")
        continue
    except Exception as e:
        print("An error occurred:", e)
        continue

    if "job_description" not in occ_df.columns:
        print("Column 'job_description' not found.")
        continue

    print("Working")
    competences = []

    for x in occ_df["job_description"]:
        clean_x = _clean_description(x)

        # Descriptions of 3 characters or fewer are considered absent.
        if len(clean_x) <= 3:
            print("Job description not found")
            competences.append("Job description not found")
            continue

        clean_x = _truncate_to_token_budget(clean_x, tokenizer, MAX_DOCUMENT_TOKENS)
        output_competences = llm_chain.run(question = question, document = clean_x)
        competences.append(output_competences)

    occ_df['competences_llm'] = competences
    occ_df.to_csv(f"{output_dir}/{occ}_competences_llm.csv")

Trying with: ./infocomm/7-junio-sfco-raw/IT%20security%20operations.csv
Working
Truncating jobpost
Trying with: ./infocomm/7-junio-sfco-raw/Information%20Security%20Analysts.csv
Working
Trying with: ./infocomm/7-junio-sfco-raw/Product%20Security%20and%20IT%20Security%20Integration%20Specialist.csv
The CSV file is empty.
Trying with: ./infocomm/7-junio-sfco-raw/Product%20risk%20specialist.csv
Working
Trying with: ./infocomm/7-junio-sfco-raw/Security%20architect.csv
Working
Trying with: ./infocomm/7-junio-sfco-raw/Database%20support%20engineer.csv
Working
Trying with: ./infocomm/7-junio-sfco-raw/Data%20center%20operations%20engineer.csv
Working
Trying with: ./infocomm/7-junio-sfco-raw/Support%20systems%20engineer.csv
Working
Trying with: ./infocomm/7-junio-sfco-raw/Computer%20Network%20Support%20Specialists.csv
Working
