In [1]:
# Run the blobfuse shell script before anything else in the notebook
# (presumably mounts the blob storage that holds the prediction files — TODO confirm).
# NOTE(review): absolute, user-specific path; this cell only works on the machine that has this script.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [2]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

# import my_secrets as sc
# import settings as st

# Load the configuration module that matches the selected environment.
# Fail fast on an unknown value: otherwise `cf` would stay undefined and the
# notebook would only break later with a confusing NameError.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')


import os
if my_run == "azure":
    # Point the Hugging Face cache at the configured directory.
    # makedirs(exist_ok=True) also creates missing parent directories and is
    # safe to re-run, unlike a bare os.mkdir behind an existence check.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

import pandas as pd

## Notebook Overview
Goal: repair responses for fine-tuned Mistral. The ideal response is: {'categorie': *category of the doc*}. However, after training for 2 or 3 epochs, Mistral's responses repeatedly make the same mistake: they often miss the opening curly bracket.

*Previous notebook: GetPredictions*

*Next notebook: baseline*

#### Check for ALL prediction files if predictions were correctly extracted
The regex pattern used to extract the prediction from the response was updated after running the experiments. The original pattern, which simply matched everything within {}, turned out not to be strict enough, so it was adjusted to match only when 'categorie' is named within the curly brackets. This means all predictions need to be checked for errors introduced by extracting with the previous regex pattern.

In [3]:
import sys
sys.path.append('../src/') 
import prediction_helperfunctions as ph
import prompt_template as pt
import re
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from collections import Counter

def check_predictions_using_new_regex(list_of_dfs):
    """Report which prediction dataframes were not correctly extracted by the old regex.

    For every dataframe the prediction is re-extracted from the 'response' column
    with ph.get_prediction_from_response (the new regex pattern) and compared with
    the stored 'prediction' column.

    Args:
        list_of_dfs: iterable of pandas DataFrames that contain the columns
            'response', 'prediction' and 'run_id'.

    Side effects:
        Adds (or overwrites) a 'new_prediction' column on every dataframe in place
        and prints a report. Nothing is returned.
    """
    errors = 0
    for df in list_of_dfs:
        df['new_prediction'] = df['response'].apply(ph.get_prediction_from_response)
        # Any row where old and new extraction disagree marks the whole run as affected.
        if (df['new_prediction'] != df['prediction']).any():
            print(f"Following run_id has responses of which the prediction was not correctly extracted using the old regex pattern: {set(df['run_id'])}.")
            print("ACTION REQUIRED: the predictions need to be re-extracted from the response, this time using the new regex pattern. Don't forget to update the evaluation scores in the overview file.")
            errors += 1

    if errors == 0:
        print('All predictions from all dataframes were already correctly extracted!')
    else:
        print(f"There are {errors} dataframes of which not all predictions were already correctly extracted.")

def extract_predictions_using_new_regex(file_path):
    """Re-extract the predictions of a pickled dataframe and save it back in place.

    The dataframe stored at `file_path` is loaded, its 'prediction' column is
    replaced by ph.get_prediction_from_response applied to every 'response',
    and the result is written back to the same path (the file is overwritten).
    """
    repaired = pd.read_pickle(file_path).assign(
        prediction=lambda frame: frame['response'].apply(ph.get_prediction_from_response)
    )
    repaired.to_pickle(file_path)
       
def check_multi_predictions(df):
    """Print the responses whose prediction is 'MultiplePredictionErrorInFormatting'.

    First prints how many such rows `df` contains, then prints each of their
    'response' values under a "NEW RESPONSE:" header.
    """
    is_multi_format_error = df['prediction'] == 'MultiplePredictionErrorInFormatting'
    flagged_responses = df.loc[is_multi_format_error, 'response']
    print(len(flagged_responses))
    for response in flagged_responses:
        print("NEW RESPONSE:")
        print(response, '\n')

def match_complete_regex(response):
    """Count the complete '{...categorie...}' fragments in a response.

    A fragment counts when it has an opening and a closing curly bracket with
    the word 'categorie' between them and no other curly bracket inside.
    Returns the number of non-overlapping matches (0 when there are none).
    """
    complete_format = re.compile(r'\{[^{}]*categorie[^{}]*\}')
    return sum(1 for _ in complete_format.finditer(response))

def match_adjusted_regex(response):
    """Count the '...categorie...}' fragments in a response.

    Unlike the complete pattern, no opening curly bracket is required: a
    fragment counts when 'categorie' is followed by a closing curly bracket
    with no other curly bracket in between.
    Returns the number of non-overlapping matches (0 when there are none).
    """
    adjusted_format = re.compile(r'[^{}]*categorie[^{}]*\}')
    return sum(1 for _ in adjusted_format.finditer(response))


def get_prediction_from_response(response):
    """Extract the predicted class from a response using the adjusted regex.

    The adjusted pattern does not require the opening curly bracket: it matches
    'categorie' followed by a closing curly bracket.

    Returns:
        - the lower-cased class name when exactly one fragment matches and it
          names exactly one class from pt.get_class_list();
        - 'MultiplePredictionErrorInOutput' when that fragment names several classes;
        - 'NoPredictionInOutput' when that fragment names no class;
        - 'MultiplePredictionErrorInFormatting' when more than one fragment matches;
        - 'NoPredictionFormat' when nothing matches.
    """
    # the classes the model is allowed to answer with
    known_classes = pt.get_class_list()

    # fragments of the response that follow the (bracket-tolerant) output format
    fragments = re.findall(r'[^{}]*categorie[^{}]*\}', response)

    if not fragments:
        return 'NoPredictionFormat'
    if len(fragments) > 1:
        return 'MultiplePredictionErrorInFormatting'

    # case-insensitive substring search for every known class in the single fragment
    fragment_lower = fragments[0].lower()
    named_classes = [name.lower() for name in known_classes if name.lower() in fragment_lower]

    if len(named_classes) > 1:
        return "MultiplePredictionErrorInOutput"
    if len(named_classes) == 1:
        return named_classes[0]
    return 'NoPredictionInOutput'
    
def calculate_evaluation_metrics(predictions, prediction_col, avrg, print_statement=False):
    """Calculate evaluation scores for a dataframe of predictions.

    Args:
        predictions: dataframe with the true classes in the 'label' column.
        prediction_col: name of the column that holds the predicted classes.
        avrg: averaging mode passed to precision/recall/f1, either 'weighted' or 'macro'.
        print_statement: when True, the classification report is printed as well.

    Returns:
        Tuple (accuracy, precision, recall, f1, report), where report is the
        output of classification_report.
    """
    y_true, y_pred = predictions['label'], predictions[prediction_col]

    report = classification_report(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    # precision, recall and f1 share the same averaging mode
    precision, recall, f1 = [
        scorer(y_true, y_pred, average=avrg)
        for scorer in (precision_score, recall_score, f1_score)
    ]

    if print_statement == True:
        print(report)

    return accuracy, precision, recall, f1, report

def update_scores_in_overview(overview_path, predictions, run_id):
    """Overwrite the evaluation scores of one run in the pickled overview file.

    The scores are recomputed from the 'prediction' column of `predictions`,
    once with weighted and once with macro averaging. The overview dataframe at
    `overview_path` is then loaded, the score columns of the rows whose
    'run_id' equals `run_id` are replaced, and the file is written back.
    """
    _, weighted_precision, weighted_recall, weighted_f1, _ = calculate_evaluation_metrics(predictions, 'prediction', 'weighted')
    accuracy, macro_precision, macro_recall, macro_f1, report = calculate_evaluation_metrics(predictions, 'prediction', 'macro')

    # overview column -> freshly computed value
    refreshed_scores = {
        'accuracy': accuracy,
        'macro_avg_precision': macro_precision,
        'macro_avg_recall': macro_recall,
        'macro_avg_f1': macro_f1,
        'weighted_avg_precision': weighted_precision,
        'weighted_avg_recall': weighted_recall,
        'weighted_avg_f1': weighted_f1,
        'classification_report': report,
    }

    overview = pd.read_pickle(overview_path)
    is_target_run = overview['run_id'] == run_id
    for column, value in refreshed_scores.items():
        overview.loc[is_target_run, column] = value

    overview.to_pickle(overview_path)


##### GEITje

In [4]:
# IC: load the in-context GEITje prediction files and compare old vs. new extraction
g_fewshot, g_ic_100, g_ic_200, g_ic_100_100 = [
    pd.read_pickle(prediction_file)
    for prediction_file in (
        f"{cf.output_path}/predictionsFinal/in_context/GEITje/fewshot_prompt_no_template/First200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/in_context/GEITje/zeroshot_prompt_geitje/First100Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/in_context/GEITje/zeroshot_prompt_geitje/First200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/in_context/GEITje/zeroshot_prompt_geitje/First100Last100Predictions.pkl",
    )
]
check_predictions_using_new_regex([g_fewshot, g_ic_100, g_ic_200, g_ic_100_100])

# FT: same check for the fine-tuned GEITje prediction files (1, 2 and 3 epochs)
g_1ep, g_2ep, g_3ep = [
    pd.read_pickle(prediction_file)
    for prediction_file in (
        f"{cf.output_path}/predictionsFinal/finetuning/1epochs/GEITjeFirst200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/finetuning/2epochs/GEITjeFirst200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/finetuning/3epochs/GEITjeFirst200Last0Predictions.pkl",
    )
]
check_predictions_using_new_regex([g_1ep, g_2ep, g_3ep])



All predictions from all dataframes were already correctly extracted!
All predictions from all dataframes were already correctly extracted!


##### Llama

In [5]:
# IC: load the in-context Llama prediction files and compare old vs. new extraction
l_fewshot, l_ic_100, l_ic_200, l_ic_100_100 = [
    pd.read_pickle(prediction_file)
    for prediction_file in (
        f"{cf.output_path}/predictionsFinal/in_context/Llama/fewshot_prompt_with_template/First200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/in_context/Llama/zeroshot_prompt_mistral_llama/First100Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/in_context/Llama/zeroshot_prompt_mistral_llama/First200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/in_context/Llama/zeroshot_prompt_mistral_llama/First100Last100Predictions.pkl",
    )
]
check_predictions_using_new_regex([l_fewshot, l_ic_100, l_ic_200, l_ic_100_100])

# FT: same check for the fine-tuned Llama prediction files (1, 2 and 3 epochs)
l_1ep, l_2ep, l_3ep = [
    pd.read_pickle(prediction_file)
    for prediction_file in (
        f"{cf.output_path}/predictionsFinal/finetuning/1epochs/LlamaFirst200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/finetuning/2epochs/LlamaFirst200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/finetuning/3epochs/LlamaFirst200Last0Predictions.pkl",
    )
]
check_predictions_using_new_regex([l_1ep, l_2ep, l_3ep])



All predictions from all dataframes were already correctly extracted!
All predictions from all dataframes were already correctly extracted!


##### Mistral

In [6]:
# IC: load the in-context Mistral prediction files and compare old vs. new extraction
m_fewshot, m_ic_100, m_ic_200, m_ic_100_100 = [
    pd.read_pickle(prediction_file)
    for prediction_file in (
        f"{cf.output_path}/predictionsFinal/in_context/Mistral/fewshot_prompt_with_template/First200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/in_context/Mistral/zeroshot_prompt_mistral_llama/First100Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/in_context/Mistral/zeroshot_prompt_mistral_llama/First200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/in_context/Mistral/zeroshot_prompt_mistral_llama/First100Last100Predictions.pkl",
    )
]
check_predictions_using_new_regex([m_fewshot, m_ic_100, m_ic_200, m_ic_100_100])

# FT: same check for the fine-tuned Mistral prediction files (1, 2 and 3 epochs)
m_1ep, m_2ep, m_3ep = [
    pd.read_pickle(prediction_file)
    for prediction_file in (
        f"{cf.output_path}/predictionsFinal/finetuning/1epochs/MistralFirst200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/finetuning/2epochs/MistralFirst200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/finetuning/3epochs/MistralFirst200Last0Predictions.pkl",
    )
]
check_predictions_using_new_regex([m_1ep, m_2ep, m_3ep])

# We originally found that m_2ep had incorrectly extracted predictions, below is the code to replace the original predictions column, with the new predictions extracted using the new/correct regex pattern
# extract_predictions_using_new_regex(f"{cf.output_path}/predictionsFinal/finetuning/2epochs/MistralFirst200Last0Predictions.pkl")

All predictions from all dataframes were already correctly extracted!
Following run_id has responses of which the prediction was not correctly extracted using the old regex pattern: {'FT_AmsterdamDocClassificationMistral200T2Epochszeroshot_prompt_mistral_llamaLlamaTokens200_0traintest_numEx0'}.
ACTION REQUIRED: the predictions need to be re-extracted from the response, this time using the new regex pattern. Don't forget to update the evaluation scores in the overview file.
Following run_id has responses of which the prediction was not correctly extracted using the old regex pattern: {'FT_AmsterdamDocClassificationMistral200T3Epochszeroshot_prompt_mistral_llamaLlamaTokens200_0traintest_numEx0'}.
ACTION REQUIRED: the predictions need to be re-extracted from the response, this time using the new regex pattern. Don't forget to update the evaluation scores in the overview file.
There are 2 dataframes of which not all predictions were not alrady completely correctly extracted.


### Check responses without opening bracket
Fine-tuned Mistral has problems with returning the output format: it returns the format without the opening bracket, resulting in many prediction errors. 

Below we can see that the responses for 1epoch were correctly formatted and thus extracted.

However, epochs 2 and 3 are very problematic. With the adjusted pattern, responses that only miss the { will also be extracted. 

In [7]:
# Reload the fine-tuned Mistral predictions; for 2 and 3 epochs the files prefixed
# with ORIGINAL are read here instead of the plain file names used earlier.
m_1ep, m_2ep, m_3ep = [
    pd.read_pickle(prediction_file)
    for prediction_file in (
        f"{cf.output_path}/predictionsFinal/finetuning/1epochs/MistralFirst200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/finetuning/2epochs/ORIGINALMistralFirst200Last0Predictions.pkl",
        f"{cf.output_path}/predictionsFinal/finetuning/3epochs/ORIGINALMistralFirst200Last0Predictions.pkl",
    )
]

In [8]:
# Per epoch: how many responses lack the leading '{', and how many stored
# predictions are NoPredictionFormat errors. Counts are taken by summing boolean masks.
print(f" Epoch 1 has {(~m_1ep['response'].str.startswith('{')).sum()} responses that do not start with {'{'}")
print(f" Epoch 1 has {(m_1ep['prediction'] == 'NoPredictionFormat').sum()} predictions that are NoPredictionFormat errors.")
print(f" Epoch 2 has {(~m_2ep['response'].str.startswith('{')).sum()} responses that do not start with {'{'}")
print(f" Epoch 2 has {(m_2ep['prediction'] == 'NoPredictionFormat').sum()} predictions that are NoPredictionFormat errors.")
# NOTE(review): pred3 is loaded here but not used by the prints below.
pred3 = pd.read_pickle(f"{cf.output_path}/predictionsFinal/finetuning/3epochs/MistralFirst200Last0Predictions.pkl")
print(f" Epoch 3 has {(~m_3ep['response'].str.startswith('{')).sum()} responses that do not start with {'{'}")
print(f" Epoch 3 has {(m_3ep['prediction'] == 'NoPredictionFormat').sum()} predictions that are NoPredictionFormat errors.")


 Epoch 1 has 0 responses that do not start with {
 Epoch 1 has 0 predictions that are NoPredictionFormat errors.
 Epoch 2 has 200 responses that do not start with {
 Epoch 2 has 200 predictions that are NoPredictionFormat errors.
 Epoch 3 has 1100 responses that do not start with {
 Epoch 3 has 1097 predictions that are NoPredictionFormat errors.


In [9]:
# For each fine-tuned Mistral epoch (1, 2, 3 in that order): count the matches of
# both regex patterns per response, store them as columns, and print a summary.
for epoch_df in (m_1ep, m_2ep, m_3ep):
    epoch_df['matches_complete_regex'] = epoch_df['response'].apply(match_complete_regex)
    epoch_df['matches_adjusted_regex'] = epoch_df['response'].apply(match_adjusted_regex)

    # responses that match only once the opening bracket is no longer required
    only_missing_open = (epoch_df['matches_complete_regex']==0) & (epoch_df['matches_adjusted_regex']>0)
    already_complete = epoch_df['matches_complete_regex']>0

    print("Amount of responses that include format, but format misses {:",len(epoch_df.loc[only_missing_open]))
    print(f"Amount of formats in responses (adjusted_regex): {Counter(epoch_df['matches_adjusted_regex'])}")
    print("Amount of responses that already followed format (complete_regex):", len(epoch_df.loc[already_complete]))


Amount of responses that include format, but format misses {: 0
Amount of formats in responses (adjusted_regex): Counter({1: 1100})
Amount of responses that already followed format (complete_regex): 1100
Amount of responses that include format, but format misses {: 199
Amount of formats in responses (adjusted_regex): Counter({1: 199, 0: 1})
Amount of responses that already followed format (complete_regex): 0
Amount of responses that include format, but format misses {: 981
Amount of formats in responses (adjusted_regex): Counter({1: 982, 0: 116, 2: 2})
Amount of responses that already followed format (complete_regex): 3


- Complete regex = {categorie}  -> match opening and closing brackets that enclose the word categorie.
- Adjusted regex = categorie}  -> match everything between categorie and the closing bracket.

All models were checked for NoPredictionFormat errors. We found that GEITje and Llama did not have them for any of the epochs. Mistral, however, does for epochs 2 and 3.
Above we can see that the original predictions (extracted using the complete regex) result in many NoPredictionFormat errors. Next, we see that many of those responses do match the adjusted regex but not the complete regex, meaning that only the opening bracket is missing. Thus we create a new column with new predictions, which are extracted using the adjusted regex.

In [10]:
# Repair predictions. For this we use the new regex pattern with a minor adjustment:
# instead of '{categorie }' we match 'categorie }', so the opening bracket is not required.
for repaired_df in (m_2ep, m_3ep):
    # Keep the originally extracted predictions under 'Original_Prediction'.
    # The guard makes this cell safe to re-run: without it, a second run would rename
    # the repaired 'prediction' column as well and create a duplicate column name.
    if 'Original_Prediction' not in repaired_df.columns:
        repaired_df.rename(columns={'prediction':'Original_Prediction'}, inplace=True)
    repaired_df['prediction'] = repaired_df['response'].apply(get_prediction_from_response)

# save repaired predictions
# m_2ep.to_pickle(f"{cf.output_path}/predictionsFinal/finetuning/2epochs/MistralFirst200Last0Predictions.pkl")
# m_3ep.to_pickle(f"{cf.output_path}/predictionsFinal/finetuning/3epochs/MistralFirst200Last0Predictions.pkl")

Next, we need to adjust the evaluation scores in the overview file to match the new predictions. We save the old scores. 

In [11]:
# Inputs for refreshing the stored evaluation scores of the 3-epoch fine-tuned Mistral run.
run_id = 'FT_AmsterdamDocClassificationMistral200T3Epochszeroshot_prompt_mistral_llamaLlamaTokens200_0traintest_numEx0'
overview_path = f"{cf.output_path}/predictionsFinal/finetuning/3epochs/overview.pkl"
predictions = pd.read_pickle(f"{cf.output_path}/predictionsFinal/finetuning/3epochs/MistralFirst200Last0Predictions.pkl")
# Uncomment to overwrite the scores of this run in the overview file.
# update_scores_in_overview(overview_path, predictions, run_id)
