# Evaluating the LLM on the dataset without fewshot learning

In [8]:
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv('data/llm_block1_2_3_4_5.csv')

# Ensure data types are correct; convert if necessary
salary_columns = ['min_salary', 'max_salary', 'block1_min_salary', 'block1_max_salary', 'block1_2_min_salary', 'block1_2_max_salary', 'block1_2_3_4_5_min_salary', 'block1_2_3_4_5_max_salary']
for col in salary_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [12]:
print(df)

                                                 title  \
0                               Software Engineer .Net   
1                        Digital Analyst / Web Analyst   
2                               ICT Supporter 100% (a)   
3    KAUFMÄNNISCHES PRAKTIKUM (100%) - MIT FOKUS KU...   
4    System Engineer für Microsoft-Technologien und...   
..                                                 ...   
105                                Mainframe Developer   
106                                      SAP Architect   
107                             Office & Event Support   
108                     IT-Test Engineer (m/w) 80-100%   
109                    Automatisierungstechniker (m/w)   

                    company   contract_type  \
0        ELCA Informatik AG  Festanstellung   
1                   Unic AG  Festanstellung   
2             Spitex Zürich  Festanstellung   
3                 gebana AG       Praktikum   
4              konekkt GmbH  Festanstellung   
..                   

In [19]:
def calculate_rmse(row, min_col, max_col, pred_min_col, pred_max_col):
    actual_avg = (row[min_col] + row[max_col]) / 2
    predicted_avg = (row[pred_min_col] + row[pred_max_col]) / 2
    return np.sqrt((actual_avg - predicted_avg) ** 2)

def calculate_overlap(row, min_col, max_col, pred_min_col, pred_max_col):
    actual_min, actual_max = row[min_col], row[max_col]
    predicted_min, predicted_max = row[pred_min_col], row[pred_max_col]
    intersection_min = max(actual_min, predicted_min)
    intersection_max = min(actual_max, predicted_max)
    if intersection_max > intersection_min:
        intersection = intersection_max - intersection_min
        smaller_range = min(actual_max - actual_min, predicted_max - predicted_min)
        return intersection / smaller_range
    return 0

In [20]:
blocks = [
    ('block1_min_salary', 'block1_max_salary'),
    ('block1_2_min_salary', 'block1_2_max_salary'),
    ('block1_2_3_min_salary', 'block1_2_3_max_salary'),
        ('block1_2_3_4_min_salary', 'block1_2_3_4_max_salary'),
    ('block1_2_3_4_5_min_salary', 'block1_2_3_4_5_max_salary')
]

# Iterate over each block and calculate RMSE and overlap
for min_col, max_col in blocks:
    df[min_col + '_rmse'] = df.apply(calculate_rmse, axis=1, min_col='min_salary', max_col='max_salary', pred_min_col=min_col, pred_max_col=max_col)
    df[min_col + '_overlap'] = df.apply(calculate_overlap, axis=1, min_col='min_salary', max_col='max_salary', pred_min_col=min_col, pred_max_col=max_col)

# Calculate average RMSE and overlap for each block
for min_col, _ in blocks:
    average_rmse = df[min_col + '_rmse'].mean()
    average_overlap = df[min_col + '_overlap'].mean()
    print(f"Average RMSE for {min_col}: {average_rmse}")
    print(f"Average Overlap Coefficient for {min_col}: {average_overlap}")


Average RMSE for block1_min_salary: 20453.563636363637
Average Overlap Coefficient for block1_min_salary: 0.425
Average RMSE for block1_2_min_salary: 20800.8
Average Overlap Coefficient for block1_2_min_salary: 0.4067777777777778
Average RMSE for block1_2_3_min_salary: 23080.836363636365
Average Overlap Coefficient for block1_2_3_min_salary: 0.29556565656565653
Average RMSE for block1_2_3_4_min_salary: 21448.550458715596
Average Overlap Coefficient for block1_2_3_4_min_salary: 0.3883939393939394
Average RMSE for block1_2_3_4_5_min_salary: 21540.29357798165
Average Overlap Coefficient for block1_2_3_4_5_min_salary: 0.3838484848484849
