# Loading

In [1]:
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
import string
import json
import copy
import math 
import re
import time

import os
from dotenv import load_dotenv


In [44]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Model identifier shared by every chat-completion call in this notebook.
model = "gemini-1.5-flash"

gemini_api_key = os.getenv("GEMINI_API_KEY_1_5_FIVE")
if not gemini_api_key:
    # Fail fast: with api_key=None the OpenAI client falls back to the
    # OPENAI_API_KEY environment variable, which would send an unrelated
    # credential to the Gemini endpoint (or fail later with a vaguer error).
    raise RuntimeError(
        "GEMINI_API_KEY_1_5_FIVE is not set; add it to the environment or .env file."
    )

# Gemini is reached through its OpenAI-compatible endpoint.
client = OpenAI(
    api_key=gemini_api_key,
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

# Smoke test: one tiny request proves the key, endpoint and model name work.
# It uses `model` (not a repeated literal) so the check always exercises the
# same model as the rest of the notebook.
try:
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "Ping"}]
    )
    print("Success:", resp.choices[0].message.content)
except Exception as e:
    # Report instead of raising so the notebook cell still completes.
    print("Request failed:", type(e).__name__, e)

Success: Pong!



# Salary

In [3]:
# Labelled development set: source of the few-shot examples in fewshot()
# and of the qualitative spot-check row.
train_df = pd.read_csv('../data/salary_labelled_development_set_cleaned.csv')
# Labelled test set: the rows iterated by the quantitative prediction loops.
test_df = pd.read_csv('../data/salary_labelled_test_set_cleaned.csv')

# RBIC Functions

### RB


In [4]:
#== step 1: role explanation
def step_1():
    """Start the conversation: assign the annotator role and ask for definitions.

    Sends a system prompt describing the salary-extraction task plus a user
    question about what a salary range, currency and frequency are.

    Returns:
        A ``(reply, messages)`` tuple: the model's reply text and the
        conversation so far (system, user and assistant messages).
    """
    system_msg = {
        "role": "system",
        "content": "You are an expert job ad annotator. Your task is to extract structured salary information from job descriptions in the format: min-max-currency-frequency. If salary is not found, return: 0-0-None-None.",
    }
    question_msg = {
        "role": "user",
        "content": "Based on your role, can you briefly explain what constitutes a salary range, currency and frequency in a job listing?",
    }
    messages = [system_msg, question_msg]

    completion = client.chat.completions.create(model=model, messages=messages)
    reply = completion.choices[0].message.content

    # Keep the model's own explanation in the history for the later steps.
    messages.append({"role": "assistant", "content": reply})
    return reply, messages

In [5]:
#== few-shot examples
def fewshot(messages, few_shot_indices=None, df=None):
    """Append labelled examples to the conversation as user/assistant turns.

    For every selected row, a user message holding the stringified job
    description dict and an assistant message holding the row's ``y_true``
    label are appended, in that order.

    Args:
        messages: Conversation list. It is extended in place and returned.
        few_shot_indices: Positional row indices (``iloc``) of the examples.
            Defaults to the five rows this notebook has always used.
        df: DataFrame providing the columns ``job_title``,
            ``job_ad_details``, ``nation_short_desc``,
            ``salary_additional_text`` and ``y_true``. Defaults to the
            module-level ``train_df``.

    Returns:
        The same ``messages`` list, now including the examples.
    """
    if few_shot_indices is None:
        few_shot_indices = [1030, 1041, 1025, 1022, 1011]
    if df is None:
        df = train_df

    for i in few_shot_indices:
        # Look the row up once instead of once per field.
        row = df.iloc[i]
        desc = {
            "job_title": row.job_title,
            "job_ad_details": row.job_ad_details,
            "nation_short_desc": row.nation_short_desc,
            "salary_additional_text": row.salary_additional_text,
        }

        # The description is the "question" ...
        messages.append({"role": "user", "content": str(desc)})
        # ... and the gold label is the "answer" the model should imitate.
        messages.append({"role": "assistant", "content": str(row.y_true)})

    return messages

In [6]:
#== step 2: setting sub-task --> ask for salary patterns
def step_2(messages):
    """Ask the model which phrases and numeric patterns signal salary details.

    Args:
        messages: Conversation so far. It is extended in place with the
            question and the model's answer.

    Returns:
        A ``(reply, messages)`` tuple: the model's answer text and the
        updated conversation.
    """
    question = {
        "role": "user",
        "content": "As a salary extractor, what are some common phrases or numeric patterns that indicate a salary range, currency and frequency in a job description?",
    }
    messages.append(question)

    completion = client.chat.completions.create(model=model, messages=messages)
    reply = completion.choices[0].message.content

    messages.append({"role": "assistant", "content": reply})
    return reply, messages

### IC

In [7]:
#== step 3: ask if salary info is in the description
def step_3(messages_static, desc_str):
    """Ask whether the job description contains any salary information.

    Args:
        messages_static: Shared conversation prefix. It is deep-copied, so
            the caller's list is never modified.
        desc_str: Stringified job description placed at the start of the
            question.

    Returns:
        A ``(response_p3, messages)`` tuple. ``response_p3`` is ``"yes"`` or
        ``"no"`` when the reply starts with that word; otherwise it is the
        first three characters of the cleaned reply, which ``step_4``
        rejects. ``messages`` is the per-description conversation copy
        including this question and the model's reply.
    """
    messages = copy.deepcopy(messages_static)

    messages.append({
        "role": "user",
        "content": f"{desc_str} does this job description include any salary-related information? Just respond with 'Yes' or 'No'."
    })

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    # Treat a missing (None) message body as an empty reply so the string
    # handling below cannot raise.
    p3_content = response.choices[0].message.content or ""
    messages.append({"role": "assistant", "content": p3_content})

    # Strip punctuation and case, then judge by the first word. Slicing the
    # first three characters alone turned "No, ..." or "No\n..." into "no "
    # or "no\n", which matched neither expected answer.
    cleaned = p3_content.translate(str.maketrans('', '', string.punctuation))
    cleaned = cleaned.strip().lower()
    words = cleaned.split()
    first_word = words[0] if words else ""

    if first_word in ("yes", "no"):
        response_p3 = first_word
    else:
        # Same fallback as before for anything else (e.g. "yesterday").
        response_p3 = cleaned[:3]

    return response_p3, messages

In [8]:
#== step 4: iterative coaching/finding clues to prevent hallucination
def step_4(response_p3, messages):
    """Ask the model for the evidence ("clue") behind its yes/no answer.

    Args:
        response_p3: Normalised answer from ``step_3``; must be ``"yes"``
            or ``"no"``.
        messages: Conversation so far. It is extended in place with the
            follow-up request and the model's reply.

    Returns:
        A ``(reply, messages)`` tuple: the model's reply text and the
        updated conversation.

    Raises:
        Exception: If ``response_p3`` is neither ``"yes"`` nor ``"no"``.
            Nothing is appended and no request is sent in that case.
    """
    # Choose the follow-up first so there is a single append/request path.
    if response_p3 == "yes":
        follow_up = 'Extract the salary range or salary-related phrases from the text verbatim. Respond in JSON: {"Clue": ""}.'
    elif response_p3 == "no":
        follow_up = "Briefly explain why there is no salary information (e.g., 'No mention of pay or compensation'). Respond in JSON: {\"Clue\": \"\"}."
    else:
        raise Exception(f"Unexpected model output: {response_p3}")

    messages.append({"role": "user", "content": follow_up})

    completion = client.chat.completions.create(model=model, messages=messages)
    reply = completion.choices[0].message.content

    messages.append({"role": "assistant", "content": reply})
    return reply, messages

In [9]:
#== step 5: use the clue to generate the final output
def step_5(messages):
    """Ask for the final structured salary and format it as a label string.

    Args:
        messages: Conversation so far. The final instruction is appended in
            place; the model's answer is not appended.

    Returns:
        A ``(label, answer_str)`` tuple. ``label`` is
        ``"<min>-<max>-<CURRENCY>-<FREQUENCY>"`` on success, or
        ``"ERROR " + answer_str`` when the answer is not a usable JSON
        object. ``answer_str`` is the raw model output (an empty string if
        the model returned no text).
    """
    messages.append({
        "role": "user",
        "content": (
            "Based on the extracted salary clue, return a structured salary in JSON. "
            "Use 'nation_short_desc' to determine the correct currency. "
            "Use this format: {\"MinSalary\": \"\", \"MaxSalary\": \"\", \"Currency\": \"\", \"Frequency\": \"\"}. "
            "If not provided explicitly, output 0 for \"MinSalary\" and \"MaxSalary\", and \"None\" for \"Currency\" and \"Frequency\". "
            "If the salary is mentioned, always output a range, where MinSalary and MaxSalary can be equal. "
            "Output the currency as 3 letters. Use adverb to output frequency (annual, monthly, daily or hourly)."
            "Perform floor function on the float type salary"
        )
    })

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    # Treat a missing (None) message body as empty so both the parsing and
    # the error label below work on a string.
    answer_str = response.choices[0].message.content or ""

    try:
        # Typographic quotes are not valid JSON delimiters; normalise them.
        normalised = answer_str.replace('“', '"').replace('”', '"')
        start = normalised.find('{')
        if start == -1:
            raise json.JSONDecodeError("no JSON object in model output", normalised, 0)

        # raw_decode parses exactly one JSON value starting at `start` and
        # ignores whatever follows (e.g. a closing code fence). Unlike
        # slicing up to the first '}', it also copes with nested braces.
        answer, _ = json.JSONDecoder().raw_decode(normalised, start)

        label = f"{answer['MinSalary']}-{answer['MaxSalary']}-{answer['Currency'].upper()}-{answer['Frequency'].upper()}"

    except (json.JSONDecodeError, KeyError, AttributeError):
        # KeyError: an expected field is missing. AttributeError: Currency
        # or Frequency is not a string (e.g. null). Both are reported the
        # same way as malformed JSON instead of escaping to the caller.
        print(f"Failed to parse model output as JSON: {answer_str}")
        label = "ERROR " + answer_str

    return label, answer_str

In [10]:
def RBIC_static_messages(verbose=False, add_fewshot=True):
    """Build the conversation prefix that is shared by every job description.

    Runs ``step_1`` and ``step_2`` and, optionally, appends the few-shot
    examples via ``fewshot``.

    Args:
        verbose: When true, print each step's reply as it arrives.
        add_fewshot: When true, append the few-shot examples.

    Returns:
        The list of messages forming the shared prefix.
    """
    def report(text):
        # Single place that honours the verbose switch.
        if verbose:
            print(text)

    role_reply, messages = step_1()
    report(f"RB step 1: {role_reply}\n")

    pattern_reply, messages = step_2(messages)
    report(f"RB step 2: {pattern_reply}\n")

    if not add_fewshot:
        return messages

    messages = fewshot(messages)
    report("Fewshot examples added\n")
    return messages

In [12]:
def RBIC(messages, desc_str, verbose=False):
    """Run steps 3-5 for one job description and return its predicted label.

    Args:
        messages: Shared conversation prefix; ``step_3`` works on a deep
            copy of it.
        desc_str: Stringified job description to annotate.
        verbose: When true, print the intermediate and final outputs.

    Returns:
        The label string produced by ``step_5``.
    """
    has_salary, conversation = step_3(messages, desc_str)
    if verbose:
        print(f"IC step 1: {has_salary}\n")

    clue, conversation = step_4(has_salary, conversation)
    if verbose:
        print(f"IC step 2: {clue}\n")

    label, raw_answer = step_5(conversation)
    if verbose:
        print(f"IC step 3 (Final): {label}\n")
        print(f"IC step 3 (Final Raw): {raw_answer}\n")

    return label

# Testing

### Qualitative Tests

In [13]:
messages_static = RBIC_static_messages(verbose=True, add_fewshot=True)

RB step 1: A salary range in a job listing specifies the minimum and maximum compensation an employer is willing to pay for a particular role.  It's typically expressed as a numerical range (e.g., 60000-80000).

The currency indicates the unit of the salary (e.g., USD, EUR, GBP).  This clarifies whether the figures are in US Dollars, Euros, British Pounds, etc.

The frequency specifies how often the salary is paid (e.g., annually, monthly, bi-weekly).  This is crucial because a yearly salary is very different from a monthly salary.


RB step 2: To reliably extract salary information, I look for several key phrases and patterns, categorized as follows:

**Salary Range Indicators:**

* **Explicit Ranges:**  The most straightforward: "£30,000 - £40,000", "$60,000-$80,000", "€50,000 - 60,000".  I pay close attention to the delimiters used (hyphens, "to", "and").  Sometimes ranges are written as "between $X and $Y".

* **"Up to" or "To":** Phrases like "$70,000 per year up to $90,000", indi

In [14]:
# Spot-check a single development-set row end to end.
ind = 345

# Fetch the row once and build the same four-field description the prompts use.
row = train_df.iloc[ind]
desc = {
    field: row[field]
    for field in (
        "job_title",
        "job_ad_details",
        "nation_short_desc",
        "salary_additional_text",
    )
}
desc_str = str(desc)

# Ground-truth label for comparison with the prediction.
label = row.y_true
label_str = str(label)

print(f"len of messages_static {len(messages_static)}")
label_pred = RBIC(messages_static, desc_str, verbose=True)


print(f"pred = {label_pred}")
print(f"truth = {label_str}")

len of messages_static 15
IC step 1: yes

IC step 2: ```json
{"Clue": "$70049.73 - $98351.39 + Super"}
```


IC step 3 (Final): 70049-98351-AUD-ANNUAL

IC step 3 (Final Raw): ```json
{
  "MinSalary": 70049,
  "MaxSalary": 98351,
  "Currency": "AUD",
  "Frequency": "annual"
}
```


pred = 70049-98351-AUD-ANNUAL
truth = 70049-98351-AUD-ANNUAL


### Quantitative Tests

In [15]:
# df to store model predictions
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [16]:
test_pred_df

Unnamed: 0,y_pred


In [17]:
print(len(test_df))

567


In [21]:
test_pred_df2 = pd.DataFrame(columns=["y_pred"])

In [26]:
# Empty single-column frame that collects the RBIC predictions; the
# prediction loop appends one row per processed test row.
test_pred_df3 = pd.DataFrame(columns=["y_pred"])
# Display the (still empty) frame as the cell output.
test_pred_df3

Unnamed: 0,y_pred


In [27]:
# Run RBIC over test rows 522..end, storing one prediction per row.
for i in tqdm(range(522,len(test_df))):
    # Fetch the row once; the description carries the four prompt fields.
    row = test_df.iloc[i]
    desc = {
        "job_title": row.job_title,
        "job_ad_details": row.job_ad_details,
        "nation_short_desc": row.nation_short_desc,
        "salary_additional_text": row.salary_additional_text,
    }
    desc_str = str(desc)

    # Retry the same row until a prediction comes back. Any error
    # (rate limit, network blip, parsing error, ...) triggers a 60 s pause.
    while True:
        try:
            label_pred = RBIC(messages_static, desc_str)
        except Exception as err:
            print(f"Error on row {i} ({err.__class__.__name__}): {err}. "
                  "Sleeping 60 s before retry…")
            time.sleep(60)
        else:
            break

    # Append the prediction as the next row of the results frame.
    test_pred_df3.loc[len(test_pred_df3)] = label_pred

# Write the collected predictions to disk.
test_pred_df3.to_csv(
    'salary_labelled_test_set_gemini_predictions.csv',
    index=False
)



 13%|█▎        | 6/45 [00:16<01:41,  2.61s/it]

Error on row 528 (RateLimitError): Error code: 429 - [{'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-1.5-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '19s'}]}}]. Sleeping 60 s before retry…


 24%|██▍       | 11/45 [01:33<04:09,  7.33s/it]

Error on row 533 (RateLimitError): Error code: 429 - [{'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'model': 'gemini-1.5-flash', 'location': 'global'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '1s'}]}}]. Sleeping 60 s before retry…


 36%|███▌      | 16/45 [02:50<03:47,  7.84s/it]

Error on row 538 (RateLimitError): Error code: 429 - [{'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-1.5-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '44s'}]}}]. Sleeping 60 s before retry…


 53%|█████▎    | 24/45 [04:17<01:41,  4.85s/it]

Error on row 546 (RateLimitError): Error code: 429 - [{'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-1.5-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '18s'}]}}]. Sleeping 60 s before retry…


 64%|██████▍   | 29/45 [05:35<02:02,  7.67s/it]

Error on row 551 (RateLimitError): Error code: 429 - [{'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-1.5-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '58s'}]}}]. Sleeping 60 s before retry…


 89%|████████▉ | 40/45 [07:17<00:17,  3.48s/it]

Error on row 562 (RateLimitError): Error code: 429 - [{'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-1.5-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '19s'}]}}]. Sleeping 60 s before retry…


100%|██████████| 45/45 [08:35<00:00, 11.45s/it]


In [28]:
# export the dataframe to a new csv file
# NOTE(review): the prediction loop already writes this same frame to
# 'salary_labelled_test_set_gemini_predictions.csv'; this is a second copy
# under a different name.
test_pred_df3.to_csv('salary_labelled_test_set_gemini_predictions3.csv', index=False)

In [30]:
# Save the ground-truth labels on their own; the evaluation cells read this
# file back as 'salary_set_ans.csv'.
test_df[['y_true']].to_csv('salary_set_ans.csv', index=False)

In [3]:
from sklearn.metrics import classification_report, f1_score

# 1. Load the predictions and the ground-truth labels
preds = pd.read_csv('salary_labelled_test_set_gemini_predictions.csv')
true  = pd.read_csv('salary_set_ans.csv')

# 2. Put both label columns side by side (aligned on the row index)
df = pd.DataFrame({
    'y_pred': preds['y_pred'],
    'y_true': true['y_true']
})

# 3. Split both prediction and truth into 4 columns.
#    n=3 caps the split at four parts, so a label with extra hyphens (for
#    example an "ERROR ..." prediction) cannot create a fifth column and
#    break the column assignment below.
pred_cols = df['y_pred'].str.split('-', n=3, expand=True)
pred_cols.columns = ['min_pred','max_pred','curr_pred','freq_pred']

true_cols = df['y_true'].str.split('-', n=3, expand=True)
true_cols.columns = ['min_true','max_true','curr_true','freq_true']

df = pd.concat([df, pred_cols, true_cols], axis=1)

# 4. Convert salary columns to numeric for a fair comparison
#    (unparseable values become NaN and therefore never match)
for col in ['min_pred','max_pred','min_true','max_true']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 5. Compute per-field matches and accuracies
n = len(df)
field_matches = {
    'min_salary': df['min_pred'] == df['min_true'],
    'max_salary': df['max_pred'] == df['max_true'],
    'currency':   df['curr_pred'] == df['curr_true'],
    'frequency':  df['freq_pred'] == df['freq_true'],
}
results = {field: hits.sum() / n for field, hits in field_matches.items()}

# 6. Print them out. The count comes straight from the boolean mask:
#    rebuilding it as int(acc * n) can land one below the true count
#    because of floating-point truncation.
for field, acc in results.items():
    correct = int(field_matches[field].sum())
    print(f"{field:12} accuracy: {acc:.2%} ({correct}/{n})")

# 7. A row counts as fully correct only when all four fields match.
#    Computed once, after the loop (it used to be nested inside it).
all_match = (
    field_matches['min_salary'] &
    field_matches['max_salary'] &
    field_matches['currency'] &
    field_matches['frequency'])

overall_acc = all_match.sum() / n

print(f"overall accuracy: {overall_acc:.2%} ({all_match.sum()}/{n})")

min_salary   accuracy: 88.36% (501/567)
max_salary   accuracy: 88.54% (502/567)
currency     accuracy: 96.65% (548/567)
frequency    accuracy: 92.42% (524/567)
overall accuracy: 82.19% (466/567)


# Regular prompting

In [36]:
# df to store model predictions
test_pred_df4 = pd.DataFrame(columns=["y_pred"])

In [46]:
# Empty single-column frame that collects the baseline (single-prompt)
# predictions; the baseline loop appends one row per processed test row.
test_pred_df5 = pd.DataFrame(columns=["y_pred"])
# Display the (still empty) frame as the cell output.
test_pred_df5

Unnamed: 0,y_pred


In [37]:
# Baseline context: only the system prompt, without the role explanation,
# salary-pattern turn or few-shot examples that RBIC uses.
# NOTE(review): this rebinds `messages_static`, the same name that holds the
# context returned by RBIC_static_messages(); rebuild that context before
# re-running any RBIC cell.
messages_static = [
    {"role": "system", "content": "You are an expert job ad annotator. Your task is to extract structured salary information from job descriptions in the format: min-max-currency-frequency. If salary is not found, return: 0-0-None-None."},
]

In [47]:
# Baseline: one prompt per test row (rows 487..end), no intermediate steps.
for i in tqdm(range(487,len(test_df))):
    # Fetch the row once; the description carries the four prompt fields.
    row = test_df.iloc[i]
    desc = {
        "job_title":              row.job_title,
        "job_ad_details":         row.job_ad_details,
        "nation_short_desc":      row.nation_short_desc,
        "salary_additional_text": row.salary_additional_text,
    }
    desc_str = str(desc)

    # rebuild the prompt messages for this row (the shared list is copied,
    # never modified)
    messages = copy.deepcopy(messages_static)
    messages.append({
        "role": "user",
        "content": (
            f"{desc_str} Extract structured salary information from this job description in the format: "
            "min-max-currency-frequency. Respond in JSON: "
            "{\"MinSalary\": \"\", \"MaxSalary\": \"\", \"Currency\": \"\", \"Frequency\": \"\"}. "
            "If not provided explicitly, output 0 for MinSalary and MaxSalary, and \"None\" for Currency and Frequency. "
            "If the salary is mentioned, always output a range, where MinSalary and MaxSalary can be equal. "
            "Use nation_short_desc to determine the correct currency as 3 letters, and use an adverb for frequency "
            "(annual, monthly, daily or hourly)."
        )
    })

    # retry the API call on any error, pausing 60s each time
    while True:
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages
            )
            break
        except Exception as e:
            print(f"Error on row {i} ({e.__class__.__name__}): {e}. Sleeping 60s before retry…")
            time.sleep(60)

    # Treat a missing (None) message body as empty so the parsing and the
    # error label below always work on a string.
    answer_str = response.choices[0].message.content or ""

    # extract the JSON snippet and parse it
    try:
        # take the span from the first '{' to the last '}'
        answer_json = answer_str[answer_str.find('{'): answer_str.rfind('}')+1]
        # replace any fancy quotes
        answer_json = answer_json.replace('“', '"').replace('”', '"')
        data = json.loads(answer_json)

        label = (
            f"{data['MinSalary']}-"
            f"{data['MaxSalary']}-"
            f"{data['Currency'].upper()}-"
            f"{data['Frequency'].upper()}"
        )
    except (ValueError, KeyError, AttributeError):
        # ValueError covers json.JSONDecodeError; KeyError a missing field;
        # AttributeError a Currency/Frequency that is not a string (e.g.
        # null). All become an ERROR label so one bad answer cannot abort
        # the whole run.
        print(f"Failed to parse JSON on row {i}: {answer_str}")
        label = "ERROR " + answer_str

    # append to the next empty row
    test_pred_df5.loc[len(test_pred_df5)] = label



 20%|██        | 16/80 [00:13<01:01,  1.03it/s]

Error on row 503 (RateLimitError): Error code: 429 - [{'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-1.5-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '39s'}]}}]. Sleeping 60s before retry…


 49%|████▉     | 39/80 [01:32<00:37,  1.10it/s]

Error on row 526 (RateLimitError): Error code: 429 - [{'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-1.5-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '19s'}]}}]. Sleeping 60s before retry…


 69%|██████▉   | 55/80 [02:47<00:20,  1.21it/s]

Error on row 542 (RateLimitError): Error code: 429 - [{'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-1.5-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '5s'}]}}]. Sleeping 60s before retry…


 89%|████████▉ | 71/80 [04:03<00:07,  1.13it/s]

Error on row 558 (RateLimitError): Error code: 429 - [{'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-1.5-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '49s'}]}}]. Sleeping 60s before retry…


100%|██████████| 80/80 [05:13<00:00,  3.92s/it]


In [48]:
# Persist the baseline (single-prompt) predictions collected in
# test_pred_df5, without the index column.
test_pred_df5.to_csv(
    'salary_labelled_test_set_gemini_preds_with_simple_prompt2.csv',
    index=False)

In [52]:
import math

# 1. Load your CSV (adjust filename as needed)
# NOTE(review): the baseline loop's export cell writes
# 'salary_labelled_test_set_gemini_preds_with_simple_prompt2.csv', whereas
# this cell reads the un-suffixed file; confirm it holds the full set of
# baseline predictions.
df = pd.read_csv('salary_labelled_test_set_gemini_preds_with_simple_prompt.csv')

# 2. Define a function to floor the two salary values
def floor_salary_range(s: str) -> str:
    """Round both salary figures of a label down to whole numbers.

    The label is split from the right into min, max, currency and frequency.
    The two numeric parts are floored and the label is put back together.

    Args:
        s: Label of the form ``"<min>-<max>-<currency>-<frequency>"``.

    Returns:
        The label with floored salaries, or ``s`` unchanged when it does not
        have that shape or the numbers cannot be floored.
    """
    try:
        low, high, currency, freq = s.rsplit('-', 3)
        pieces = [
            str(math.floor(float(low))),
            str(math.floor(float(high))),
            currency,
            freq,
        ]
        return "-".join(pieces)
    except Exception:
        # Anything unexpected (non-string input, wrong number of parts,
        # non-numeric salary) leaves the value untouched.
        return s

# 3. Apply to the column (values floor_salary_range cannot parse are
#    returned unchanged)
df['y_pred'] = df['y_pred'].apply(floor_salary_range)

# 4. Save out; 'y_pred_floored.csv' is the file the next evaluation cell loads
df.to_csv('y_pred_floored.csv', index=False)


In [2]:
import pandas as pd
import numpy as np

# 1. Load the floored predictions and the ground-truth labels
preds = pd.read_csv('y_pred_floored.csv')
true  = pd.read_csv('salary_set_ans.csv')

# 2. Build a single DataFrame (aligned on the row index)
df = pd.DataFrame({
    'y_pred': preds['y_pred'],
    'y_true': true['y_true']
})

# 3. Split both prediction and truth into 4 columns.
#    n=3 caps the split at four parts, so a label with extra hyphens (for
#    example an "ERROR ..." prediction) cannot create a fifth column and
#    break the column assignment below.
pred_cols = df['y_pred'].str.split('-', n=3, expand=True)
pred_cols.columns = ['min_pred','max_pred','curr_pred','freq_pred']

true_cols = df['y_true'].str.split('-', n=3, expand=True)
true_cols.columns = ['min_true','max_true','curr_true','freq_true']

df = pd.concat([df, pred_cols, true_cols], axis=1)

# 4. Convert salary columns to numeric for a fair comparison
#    (unparseable values become NaN and therefore never match)
for col in ['min_pred','max_pred','min_true','max_true']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 5. Compute per-field matches and accuracies
n = len(df)
field_matches = {
    'min_salary': df['min_pred'] == df['min_true'],
    'max_salary': df['max_pred'] == df['max_true'],
    'currency':   df['curr_pred'] == df['curr_true'],
    'frequency':  df['freq_pred'] == df['freq_true'],
}
results = {field: hits.sum() / n for field, hits in field_matches.items()}

# 6. Print them out. The count comes straight from the boolean mask:
#    rebuilding it as int(acc * n) can land one below the true count
#    because of floating-point truncation.
for field, acc in results.items():
    correct = int(field_matches[field].sum())
    print(f"{field:12} accuracy: {acc:.2%} ({correct}/{n})")


# 7. A row counts as fully correct only when all four fields match.
all_match = (
    field_matches['min_salary'] &
    field_matches['max_salary'] &
    field_matches['currency'] &
    field_matches['frequency'])

overall_acc = all_match.sum() / n

print(f"overall accuracy: {overall_acc:.2%} ({all_match.sum()}/{n})")

min_salary   accuracy: 88.18% (500/567)
max_salary   accuracy: 87.83% (498/567)
currency     accuracy: 96.83% (549/567)
frequency    accuracy: 96.65% (548/567)
overall accuracy: 83.42% (473/567)
