In [1]:
from groq import Groq
import pandas as pd
import re
import os

# Import the data

In [2]:
df = pd.read_csv('../../data_acquisition/data/df_with_salary.csv')

In [3]:
df.head()

Unnamed: 0,title,company,contract_type,description,min_workload,max_workload,min_salary,max_salary
0,Software Engineer .Net,ELCA Informatik AG,Festanstellung,"Über ELCA Wir sind ELCA, einer der grössten Sc...",80,100,100000,130000
1,Digital Analyst / Web Analyst,Unic AG,Festanstellung,Deine Aufgaben als Digital Analyst bei Unic ...,80,100,90000,113000
2,ICT Supporter 100% (a),Spitex Zürich,Festanstellung,Möchtest du auch etwas bewegen und deine Arbei...,100,100,78000,96000
3,KAUFMÄNNISCHES PRAKTIKUM (100%) - MIT FOKUS KU...,gebana AG,Praktikum,DEINE AUFGABEN IM KUND:INNENSERVICE – EINSATZ ...,100,100,28300,28300
4,System Engineer für Microsoft-Technologien und...,konekkt GmbH,Festanstellung,Wir präsentieren eine faszinierende Karrieremö...,80,100,95000,125000


# Prompt blocks definition
Block 0: Few shot 5 examples with salary
Block 1: Job title
Block 2: company
Block 3: workload (min workload and max workload)
Block 4: Contract type
Block 5: description

# Set up for ussage

In [4]:
client = Groq(
    api_key="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",
)


def extract_salaries(response):
    # General regex pattern - covers most cases
    general_pattern = re.compile(r"(\d{1,3}(?:[',\s]\d{3})*)\s*-\s*(\d{1,3}(?:[',\s]\d{3})*)\s*CHF")
    # Special regex pattern - handles "CHF" before the numbers and potential markdown
    special_pattern = re.compile(r"CHF\s*(\d{1,3}(?:[',\s]\d{3})*)\s*[-–]\s*CHF\s*(\d{1,3}(?:[',\s]\d{3})*)")
    special_pattern2 = re.compile(r"CHF\s*(\d{1,3}(?:['\s]\d{3})*)\s*[-–—]\s*(\d{1,3}(?:['\s]\d{3})*)")

    # Try the general pattern first
    matches = general_pattern.findall(response)
    if matches:
        min_salary, max_salary = [int(salary.replace("'", "").replace(",", "").replace(" ", "")) for salary in matches[0]]
        return min_salary, max_salary
    
    # If no matches, try the special pattern
    matches = special_pattern.findall(response)
    if matches:
        min_salary, max_salary = [int(salary.replace("'", "").replace(",", "").replace(" ", "")) for salary in matches[0]]
        return min_salary, max_salary
    
    matches = special_pattern2.findall(response)
    if matches:
        min_salary, max_salary = [int(salary.replace("'", "").replace(" ", "")) for salary in matches[0]]
        return min_salary, max_salary
    
    return None, None

## LLM request with Block 1

In [5]:
# change here for new prompt and message below ------------
results_file = 'data/llm_block1.csv'
min = 'block1_min_salary'
max = 'block1_max_salary'
answer = 'block1_answer'
# Setup and load initial data
if os.path.exists(results_file):
    df = pd.read_csv(results_file)
else:
    df[min] = None  # Initialize columns if starting fresh
    df[max] = None
    df[answer] = None 

# change content here for new prompt and message below ------------
def generate_query(row):
    return {
        "role": "user",
        "content": f"Gib mir nur das Gehalt im Format (\d{1,3}(?:'\d{3})*) und (\d{1,3}(?:'\d{3})*) CHF für die Position {row['title']} in Zürich, Schweiz. Wenn du kein exaktes Gehalt hast gib eine Schätzung an. Der angegebene Gehaltsbereich sollte dabei eine maximale Spanne von 20000 CHF nicht überschreiten."
    }

def run_model(limit):
    start_index = df[min].last_valid_index() + 1 if df[max].last_valid_index() != None else 0

    
    for index, row in df.iloc[start_index:start_index + limit].iterrows():
        query_message = generate_query(row)
        chat_completion = client.chat.completions.create(
            messages= [query_message], # change message for new prompt ------------
            model="llama3-70b-8192",
            temperature=0.0
        )
        min_salary, max_salary = extract_salaries(chat_completion.choices[0].message.content)
        print(index)
        print(min_salary, max_salary)
        
        
        # Update DataFrame
        df.at[index, min] = min_salary
        df.at[index, max] = max_salary
        df.at[index, answer] = chat_completion.choices[0].message.content
        
    # Save updated DataFrame in chunks or fully, depending on your preference
    df.to_csv(results_file, index=False)

# Testing the function with a limited number of entries
run_model(10000)

0
90000 110000
1
80000 100000
2
80000 100000
3
45000 65000
4
90000 110000
5
120000 140000
6
80000 100000
7
90000 110000
8
80000 95000
9
80000 100000
10
110000 130000
11
80000 100000
12
90000 110000
13
80000 100000
14
90000 110000
15
120000 140000
16
80000 100000
17
80000 100000
18
60000 80000
19
80000 100000
20
90000 110000
21
80000 100000
22
80000 100000
23
90000 110000
24
80000 100000
25
80000 100000
26
90000 110000
27
90000 110000
28
90000 110000
29
90000 110000
30
80000 100000
31
90000 110000
32
90000 110000
33
90000 110000
34
90000 110000
35
110000 130000
36
90000 110000
37
90000 110000
38
90000 110000
39
90000 110000
40
90000 110000
41
90000 110000
42
90000 110000
43
90000 110000
44
120000 140000
45
90000 110000
46
90000 110000
47
90000 110000
48
120000 140000
49
90000 110000
50
120000 140000
51
90000 110000
52
80000 100000
53
90000 110000
54
90000 110000
55
90000 110000
56
90000 110000
57
90000 110000
58
90000 110000
59
90000 110000
60
90000 110000
61
80000 100000
62
90000 11000

In [7]:
df_block1 = pd.read_csv('data/llm_block1.csv')
df_block1.head()

Unnamed: 0,title,company,contract_type,description,min_workload,max_workload,min_salary,max_salary,block1_min_salary,block1_max_salary,block1_answer
0,Software Engineer .Net,ELCA Informatik AG,Festanstellung,"Über ELCA Wir sind ELCA, einer der grössten Sc...",80,100,100000,130000,90000,110000,"Based on national averages and online sources,..."
1,Digital Analyst / Web Analyst,Unic AG,Festanstellung,Deine Aufgaben als Digital Analyst bei Unic ...,80,100,90000,113000,80000,100000,"Based on national averages and online sources,..."
2,ICT Supporter 100% (a),Spitex Zürich,Festanstellung,Möchtest du auch etwas bewegen und deine Arbei...,100,100,78000,96000,80000,100000,"Based on national averages and online sources,..."
3,KAUFMÄNNISCHES PRAKTIKUM (100%) - MIT FOKUS KU...,gebana AG,Praktikum,DEINE AUFGABEN IM KUND:INNENSERVICE – EINSATZ ...,100,100,28300,28300,45000,65000,"Based on national averages and online sources,..."
4,System Engineer für Microsoft-Technologien und...,konekkt GmbH,Festanstellung,Wir präsentieren eine faszinierende Karrieremö...,80,100,95000,125000,90000,110000,"Based on national averages and online sources,..."


In [8]:
# all block1_answers where block1_max_salary is null and block1_min_salary is null
df_block1[df_block1['block1_max_salary'].isnull() & df_block1['block1_min_salary'].isnull()]


Unnamed: 0,title,company,contract_type,description,min_workload,max_workload,min_salary,max_salary,block1_min_salary,block1_max_salary,block1_answer


# Block 1 and Block 2 

In [17]:
# change here for new prompt and message below ------------
results_file = 'data/llm_block1_2.csv'
min = 'block1_2_min_salary'
max = 'block1_2_max_salary'
answer = 'block1_2_answer'
# Setup and load initial data
if os.path.exists(results_file):
    df = pd.read_csv(results_file)
else:
    df[min] = None  # Initialize columns if starting fresh
    df[max] = None
    df[answer] = None 

# change content here for new prompt and message below ------------
def generate_query(row):
    return {
        "role": "user",
        "content": f"Gib mir nur das Gehalt im Format (\d{1,3}(?:'\d{3})) und (\d{1,3}(?:'\d{3})) CHF für die Position {row['title']} bei der Firma {row['company']} in Zürich, Schweiz. Wenn du kein exaktes Gehalt hast gib eine Schätzung an. Der angegebene Gehaltsbereich sollte dabei eine maximale Spanne von 20000 CHF nicht überschreiten."
    }

def run_model(limit):
    start_index = df[min].last_valid_index() + 1 if df[max].last_valid_index() != None else 0

    
    for index, row in df.iloc[start_index:start_index + limit].iterrows():
        query_message = generate_query(row)
        chat_completion = client.chat.completions.create(
            messages= [query_message], # change message for new prompt ------------
            model="llama3-70b-8192",
            temperature=0.0
        )
        min_salary, max_salary = extract_salaries(chat_completion.choices[0].message.content)
        print(index)
        print(min_salary, max_salary)
        
        
        # Update DataFrame
        df.at[index, min] = min_salary
        df.at[index, max] = max_salary
        df.at[index, answer] = chat_completion.choices[0].message.content
        
    # Save updated DataFrame in chunks or fully, depending on your preference
    df.to_csv(results_file, index=False)

# Testing the function with a limited number of entries
run_model(10000)

0
90000 110000
1
80000 100000
2
80000 100000
3
58000 68000
4
90000 110000
5
120000 140000
6
80000 100000
7
110000 130000
8
80000 100000
9
80000 100000
10
120000 140000
11
90000 110000
12
120000 140000
13
80000 100000
14
90000 110000
15
120000 140000
16
80000 100000
17
80000 100000
18
80000 100000
19
80000 100000
20
90000 110000
21
80000 100000
22
80000 100000
23
90000 110000
24
80000 100000
25
80000 100000
26
80000 100000
27
90000 110000
28
90000 110000
29
80000 100000
30
80000 100000
31
90000 110000
32
90000 110000
33
90000 110000
34
80000 100000
35
110000 130000
36
90000 110000
37
90000 110000
38
90000 110000
39
90000 110000
40
80000 100000
41
90000 110000
42
90000 110000
43
90000 110000
44
120000 140000
45
90000 110000
46
90000 110000
47
90000 110000
48
120000 140000
49
90000 110000
50
120000 140000
51
90000 110000
52
80000 100000
53
90000 110000
54
90000 110000
55
90000 110000
56
90000 110000
57
90000 110000
58
90000 110000
59
90000 110000
60
90000 110000
61
80000 100000
62
90000 1

In [18]:
df_block1_2 = pd.read_csv('data/llm_block1_2.csv')
# all block1_answers where block1_max_salary is null and block1_min_salary is null
df_block1_2[df_block1_2['block1_2_max_salary'].isnull() & df_block1_2['block1_2_min_salary'].isnull()]

Unnamed: 0,title,company,contract_type,description,min_workload,max_workload,min_salary,max_salary,block1_min_salary,block1_max_salary,block1_answer,block1_2_min_salary,block1_2_max_salary,block1_2_answer


# LLM request with Block 1 2 3 

In [19]:
# change here for new prompt and message below ------------
results_file = 'data/llm_block1_2_3.csv'
min = 'block1_2_3_min_salary'
max = 'block1_2_3_max_salary'
answer = 'block1_2_3_answer'
# Setup and load initial data
if os.path.exists(results_file):
    df = pd.read_csv(results_file)
else:
    df[min] = None  # Initialize columns if starting fresh
    df[max] = None
    df[answer] = None 

# change content here for new prompt and message below ------------
def generate_query(row):
    return {
        "role": "user",
        "content": f"Gib mir nur das Gehalt im Format (\d{1,3}(?:'\d{3})*) und (\d{1,3}(?:'\d{3})*) CHF für die Position {row['title']} bei der Firma {row['company']} in Zürich, Schweiz. Mit einer Arbeitszeit von {row['min_workload']} bis {row['max_workload']} Prozent. Wenn du kein exaktes Gehalt hast gib eine Schätzung an. Der angegebene Gehaltsbereich sollte dabei eine maximale Spanne von 20000 CHF nicht überschreiten."
    }

def run_model(limit):
    start_index = df[min].last_valid_index() + 1 if df[max].last_valid_index() != None else 0

    
    for index, row in df.iloc[start_index:start_index + limit].iterrows():
        query_message = generate_query(row)
        chat_completion = client.chat.completions.create(
            messages= [query_message], # change message for new prompt ------------
            model="llama3-70b-8192",
            temperature=0.0
        )
        min_salary, max_salary = extract_salaries(chat_completion.choices[0].message.content)
        print(index)
        print(min_salary, max_salary)
        
        
        # Update DataFrame
        df.at[index, min] = min_salary
        df.at[index, max] = max_salary
        df.at[index, answer] = chat_completion.choices[0].message.content
        
    # Save updated DataFrame in chunks or fully, depending on your preference
    df.to_csv(results_file, index=False)

# Testing the function with a limited number of entries
run_model(10000)

0
90000 110000
1
70000 85000
2
60000 80000
3
45000 65000
4
90000 110000
5
90000 110000
6
80000 100000
7
90000 110000
8
60000 80000
9
80000 100000
10
90000 110000
11
80000 100000
12
90000 110000
13
80000 100000
14
90000 110000
15
120000 140000
16
80000 100000
17
80000 100000
18
80000 100000
19
90000 110000
20
90000 110000
21
90000 110000
22
90000 110000
23
80000 100000
24
80000 100000
25
80000 100000
26
80000 100000
27
90000 110000
28
90000 110000
29
90000 110000
30
80000 100000
31
90000 110000
32
90000 110000
33
90000 110000
34
80000 100000
35
90000 110000
36
90000 110000
37
90000 110000
38
90000 110000
39
80000 100000
40
80000 100000
41
80000 100000
42
90000 110000
43
80000 100000
44
90000 110000
45
90000 110000
46
90000 110000
47
90000 110000
48
120000 140000
49
90000 110000
50
90000 110000
51
80000 100000
52
80000 100000
53
80000 100000
54
90000 110000
55
80000 100000
56
80000 100000
57
90000 110000
58
90000 110000
59
90000 110000
60
90000 110000
61
80000 100000
62
90000 110000
63
9

In [20]:
df_block1_2_3 = pd.read_csv('data/llm_block1_2_3.csv')
df_block1_2_3.head()

Unnamed: 0,title,company,contract_type,description,min_workload,max_workload,min_salary,max_salary,block1_min_salary,block1_max_salary,block1_answer,block1_2_min_salary,block1_2_max_salary,block1_2_answer,block1_2_3_min_salary,block1_2_3_max_salary,block1_2_3_answer
0,Software Engineer .Net,ELCA Informatik AG,Festanstellung,"Über ELCA Wir sind ELCA, einer der grössten Sc...",80,100,100000,130000,90000,110000,"Based on national averages and online sources,...",90000,110000,"Based on national averages and online sources,...",90000,110000,"Based on national averages and online sources,..."
1,Digital Analyst / Web Analyst,Unic AG,Festanstellung,Deine Aufgaben als Digital Analyst bei Unic ...,80,100,90000,113000,80000,100000,"Based on national averages and online sources,...",80000,100000,"Based on national averages and online sources,...",70000,85000,"Based on national averages and online sources,..."
2,ICT Supporter 100% (a),Spitex Zürich,Festanstellung,Möchtest du auch etwas bewegen und deine Arbei...,100,100,78000,96000,80000,100000,"Based on national averages and online sources,...",80000,100000,"Based on national averages and online sources,...",60000,80000,"Based on national averages and online sources,..."
3,KAUFMÄNNISCHES PRAKTIKUM (100%) - MIT FOKUS KU...,gebana AG,Praktikum,DEINE AUFGABEN IM KUND:INNENSERVICE – EINSATZ ...,100,100,28300,28300,45000,65000,"Based on national averages and online sources,...",58000,68000,"Based on national averages and online sources,...",45000,65000,"Based on national averages and online sources,..."
4,System Engineer für Microsoft-Technologien und...,konekkt GmbH,Festanstellung,Wir präsentieren eine faszinierende Karrieremö...,80,100,95000,125000,90000,110000,"Based on national averages and online sources,...",90000,110000,"Based on national averages and online sources,...",90000,110000,"Based on national averages and online sources,..."


In [21]:
# all block1_answers where block1_max_salary is null and block1_min_salary is null
df_block1_2_3[df_block1_2_3['block1_2_3_max_salary'].isnull() & df_block1_2_3['block1_2_3_min_salary'].isnull()]


Unnamed: 0,title,company,contract_type,description,min_workload,max_workload,min_salary,max_salary,block1_min_salary,block1_max_salary,block1_answer,block1_2_min_salary,block1_2_max_salary,block1_2_answer,block1_2_3_min_salary,block1_2_3_max_salary,block1_2_3_answer


# Block 1,2,3 and 4

In [22]:
# change here for new prompt and message below ------------
results_file = 'data/llm_block1_2_3_4.csv'
min = 'block1_2_3_4_min_salary'
max = 'block1_2_3_4_max_salary'
answer = 'block1_2_3_4_answer'
# Setup and load initial data
if os.path.exists(results_file):
    df = pd.read_csv(results_file)
else:
    df[min] = None  # Initialize columns if starting fresh
    df[max] = None
    df[answer] = None 

# change content here for new prompt and message below ------------
def generate_query(row):
    return {
        "role": "user",
        "content": f"Gib mir nur das Gehalt im Format (\d{1,3}(?:'\d{3})) und (\d{1,3}(?:'\d{3})) CHF für die Position {row['title']} bei der Firma {row['company']} in Zürich, Schweiz. Mit einer Arbeitszeit von {row['min_workload']} bis {row['max_workload']} Prozent und dem Vertragsart {row['contract_type']}. Wenn du kein exaktes Gehalt hast gib eine Schätzung an. Der angegebene Gehaltsbereich sollte dabei eine maximale Spanne von 20000 CHF nicht überschreiten."
    }

def run_model(limit):
    start_index = df[min].last_valid_index() + 1 if df[max].last_valid_index() != None else 0

    
    for index, row in df.iloc[start_index:start_index + limit].iterrows():
        query_message = generate_query(row)
        chat_completion = client.chat.completions.create(
            messages= [query_message], # change message for new prompt ------------
            model="llama3-70b-8192",
            temperature=0.0
        )
        min_salary, max_salary = extract_salaries(chat_completion.choices[0].message.content)
        print(index)
        print(min_salary, max_salary)
        
        
        # Update DataFrame
        df.at[index, min] = min_salary
        df.at[index, max] = max_salary
        df.at[index, answer] = chat_completion.choices[0].message.content
        
    # Save updated DataFrame in chunks or fully, depending on your preference
    df.to_csv(results_file, index=False)

# Testing the function with a limited number of entries
run_model(10000)

0
90000 110000
1
90000 110000
2
80000 100000
3
45000 55000
4
90000 110000
5
90000 110000
6
80000 100000
7
90000 110000
8
80000 100000
9
90000 110000
10
114000 134000
11
80000 100000
12
110000 130000
13
80000 100000
14
90000 110000
15
120000 140000
16
80000 100000
17
80000 100000
18
80000 100000
19
80000 100000
20
90000 110000
21
80000 100000
22
80000 100000
23
90000 110000
24
80000 100000
25
80000 100000
26
80000 100000
27
90000 110000
28
90000 110000
29
80000 100000
30
80000 100000
31
90000 110000
32
90000 110000
33
90000 110000
34
80000 100000
35
95000 115000
36
90000 110000
37
90000 110000
38
90000 110000
39
90000 110000
40
90000 110000
41
90000 110000
42
90000 110000
43
90000 110000
44
90000 110000
45
90000 110000
46
90000 110000
47
90000 110000
48
120000 140000
49
90000 110000
50
90000 110000
51
80000 100000
52
80000 100000
53
80000 100000
54
90000 110000
55
90000 110000
56
80000 100000
57
90000 110000
58
95000 115000
59
90000 110000
60
90000 110000
61
80000 100000
62
90000 110000

In [23]:
df_block1_2_3_4 = pd.read_csv('data/llm_block1_2_3_4.csv')
df_block1_2_3_4.head()

Unnamed: 0,title,company,contract_type,description,min_workload,max_workload,min_salary,max_salary,block1_min_salary,block1_max_salary,block1_answer,block1_2_min_salary,block1_2_max_salary,block1_2_answer,block1_2_3_min_salary,block1_2_3_max_salary,block1_2_3_answer,block1_2_3_4_min_salary,block1_2_3_4_max_salary,block1_2_3_4_answer
0,Software Engineer .Net,ELCA Informatik AG,Festanstellung,"Über ELCA Wir sind ELCA, einer der grössten Sc...",80,100,100000,130000,90000,110000,"Based on national averages and online sources,...",90000,110000,"Based on national averages and online sources,...",90000,110000,"Based on national averages and online sources,...",90000.0,110000.0,"Based on national averages and online sources,..."
1,Digital Analyst / Web Analyst,Unic AG,Festanstellung,Deine Aufgaben als Digital Analyst bei Unic ...,80,100,90000,113000,80000,100000,"Based on national averages and online sources,...",80000,100000,"Based on national averages and online sources,...",70000,85000,"Based on national averages and online sources,...",90000.0,110000.0,"Based on national averages and online sources,..."
2,ICT Supporter 100% (a),Spitex Zürich,Festanstellung,Möchtest du auch etwas bewegen und deine Arbei...,100,100,78000,96000,80000,100000,"Based on national averages and online sources,...",80000,100000,"Based on national averages and online sources,...",60000,80000,"Based on national averages and online sources,...",80000.0,100000.0,"Based on national averages and online sources,..."
3,KAUFMÄNNISCHES PRAKTIKUM (100%) - MIT FOKUS KU...,gebana AG,Praktikum,DEINE AUFGABEN IM KUND:INNENSERVICE – EINSATZ ...,100,100,28300,28300,45000,65000,"Based on national averages and online sources,...",58000,68000,"Based on national averages and online sources,...",45000,65000,"Based on national averages and online sources,...",45000.0,55000.0,"Based on national averages and online sources,..."
4,System Engineer für Microsoft-Technologien und...,konekkt GmbH,Festanstellung,Wir präsentieren eine faszinierende Karrieremö...,80,100,95000,125000,90000,110000,"Based on national averages and online sources,...",90000,110000,"Based on national averages and online sources,...",90000,110000,"Based on national averages and online sources,...",90000.0,110000.0,"Based on national averages and online sources,..."


In [24]:
# all block1_answers where block1_max_salary is null and block1_min_salary is null
df_block1_2_3_4[df_block1_2_3_4['block1_2_3_4_max_salary'].isnull() & df_block1_2_3_4['block1_2_3_4_min_salary'].isnull()]


Unnamed: 0,title,company,contract_type,description,min_workload,max_workload,min_salary,max_salary,block1_min_salary,block1_max_salary,block1_answer,block1_2_min_salary,block1_2_max_salary,block1_2_answer,block1_2_3_min_salary,block1_2_3_max_salary,block1_2_3_answer,block1_2_3_4_min_salary,block1_2_3_4_max_salary,block1_2_3_4_answer
87,Application Manager (m/w/d) 80 – 100%,konekkt GmbH,Festanstellung,Unser Partner ist ein Pionier in seiner Nische...,80,100,90000,115000,90000,110000,"Based on national averages and online sources,...",90000,110000,"Based on national averages and online sources,...",80000,100000,"Based on national averages and online sources,...",,,"Based on national averages and online sources,..."


### manually enter the data for index 87

In [25]:
# Manually set the salary values for specific indices
indices_to_update = {
    87: (90000, 110000)
}

# Update the DataFrame
for index, (min_salary, max_salary) in indices_to_update.items():
    df_block1_2_3_4.at[index, 'block1_2_3_4_min_salary'] = min_salary
    df_block1_2_3_4.at[index, 'block1_2_3_4_max_salary'] = max_salary

# Save the updated DataFrame to CSV
df_block1_2_3_4.to_csv('data/llm_block1_2_3_4.csv', index=False)

# Optionally print the updated rows to verify
print(df_block1_2_3_4.loc[[87]])


                                    title       company   contract_type  \
87  Application Manager (m/w/d) 80 – 100%  konekkt GmbH  Festanstellung   

                                          description  min_workload  \
87  Unser Partner ist ein Pionier in seiner Nische...            80   

    max_workload  min_salary  max_salary  block1_min_salary  \
87           100       90000      115000              90000   

    block1_max_salary                                      block1_answer  \
87             110000  Based on national averages and online sources,...   

    block1_2_min_salary  block1_2_max_salary  \
87                90000               110000   

                                      block1_2_answer  block1_2_3_min_salary  \
87  Based on national averages and online sources,...                  80000   

    block1_2_3_max_salary                                  block1_2_3_answer  \
87                 100000  Based on national averages and online sources,...   

    blo

In [26]:
df_block1_2_3_4[df_block1_2_3_4['block1_2_3_4_max_salary'].isnull() & df_block1_2_3_4['block1_2_3_4_min_salary'].isnull()]

Unnamed: 0,title,company,contract_type,description,min_workload,max_workload,min_salary,max_salary,block1_min_salary,block1_max_salary,block1_answer,block1_2_min_salary,block1_2_max_salary,block1_2_answer,block1_2_3_min_salary,block1_2_3_max_salary,block1_2_3_answer,block1_2_3_4_min_salary,block1_2_3_4_max_salary,block1_2_3_4_answer


# Block 1,2,3,4 and 5

In [27]:
# change here for new prompt and message below ------------
results_file = 'data/llm_block1_2_3_4_5.csv'
min = 'block1_2_3_4_5_min_salary'
max = 'block1_2_3_4_5_max_salary'
answer = 'block1_2_3_4_5_answer'
# Setup and load initial data
if os.path.exists(results_file):
    df = pd.read_csv(results_file)
else:
    df[min] = None  # Initialize columns if starting fresh
    df[max] = None
    df[answer] = None 

# change content here for new prompt and message below ------------
def generate_query(row):
    return {
        "role": "user",
        "content": f"Gib mir nur das Gehalt im Format (\d{1,3}(?:'\d{3})) und (\d{1,3}(?:'\d{3})) CHF für die Position {row['title']} bei der Firma {row['company']} in Zürich, Schweiz. Mit einer Arbeitszeit von {row['min_workload']} bis {row['max_workload']} Prozent und dem Vertragsart {row['contract_type']}. Wenn du kein exaktes Gehalt hast gib eine Schätzung an. Der angegebene Gehaltsbereich sollte dabei eine maximale Spanne von 20000 CHF nicht überschreiten."
    }

def run_model(limit):
    start_index = df[min].last_valid_index() + 1 if df[max].last_valid_index() != None else 0

    
    for index, row in df.iloc[start_index:start_index + limit].iterrows():
        query_message = generate_query(row)
        chat_completion = client.chat.completions.create(
            messages= [query_message], # change message for new prompt ------------
            model="llama3-70b-8192",
            temperature=0.0
        )
        min_salary, max_salary = extract_salaries(chat_completion.choices[0].message.content)
        print(index)
        print(min_salary, max_salary)
        
        
        # Update DataFrame
        df.at[index, min] = min_salary
        df.at[index, max] = max_salary
        df.at[index, answer] = chat_completion.choices[0].message.content
        
    # Save updated DataFrame in chunks or fully, depending on your preference
    df.to_csv(results_file, index=False)

# Testing the function with a limited number of entries
run_model(10000)

0
90000 110000
1
80000 100000
2
80000 100000
3
45000 55000
4
90000 110000
5
90000 110000
6
80000 100000
7
90000 110000
8
80000 100000
9
90000 110000
10
114000 134000
11
80000 100000
12
90000 110000
13
80000 100000
14
90000 110000
15
120000 140000
16
80000 100000
17
80000 100000
18
80000 100000
19
80000 100000
20
90000 110000
21
80000 100000
22
80000 100000
23
90000 110000
24
80000 100000
25
80000 100000
26
80000 100000
27
90000 110000
28
90000 110000
29
90000 110000
30
80000 100000
31
90000 110000
32
90000 110000
33
90000 110000
34
80000 100000
35
95000 115000
36
90000 110000
37
90000 110000
38
90000 110000
39
90000 110000
40
90000 110000
41
90000 110000
42
90000 110000
43
90000 110000
44
90000 110000
45
90000 110000
46
90000 110000
47
90000 110000
48
120000 140000
49
90000 110000
50
90000 110000
51
80000 100000
52
80000 100000
53
80000 100000
54
90000 110000
55
90000 110000
56
90000 110000
57
90000 110000
58
95000 115000
59
90000 110000
60
90000 110000
61
80000 100000
62
90000 110000


In [28]:
df_block1_2_3_4 = pd.read_csv('data/llm_block1_2_3_4.csv')
df_block1_2_3_4[df_block1_2_3_4['block1_2_3_4_max_salary'].isnull() & df_block1_2_3_4['block1_2_3_4_min_salary'].isnull()]

Unnamed: 0,title,company,contract_type,description,min_workload,max_workload,min_salary,max_salary,block1_min_salary,block1_max_salary,block1_answer,block1_2_min_salary,block1_2_max_salary,block1_2_answer,block1_2_3_min_salary,block1_2_3_max_salary,block1_2_3_answer,block1_2_3_4_min_salary,block1_2_3_4_max_salary,block1_2_3_4_answer
