In [1]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [1]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

# import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

import pandas as pd


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avaoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 
# pip install bitsandbytes
# pip install bnb
# pip install wandb==0.13.3 --upgrade


## Notebook Overview
Goal: get predictions of the finetuned models

In [9]:
import pandas as pd
import sys
sys.path.append('../scripts/') 
import prompt_template as pt
import prediction_helperfunctions as ph
import truncation as tf


In [3]:
# Load model
from transformers import pipeline, Conversation

chatbot = pipeline(task='conversational', model='FemkeBakker/GEITjeSmallData200Tokens',
                   device_map='auto', model_kwargs={'offload_buffers':True})

2024-04-29 11:27:58.375087: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [14]:
import time
import os
import pandas as pd
from bm25 import BM25


""" Given a dataframe with txt, return a df with predictions """
# docs_df = dataframe with the documents that need to be predicted
# text_column = name of the column that includes the input_text. Can be different based on the text representation method. 
# prompt_function = prompt template 
# train_df = dataframe with docs, which can be used as examples/training data/context data
# num_examples = number of examples in the prompt

def predictions_incontextlearning(chatbot, docs_df, text_column, prompt_function, train_df, num_examples):
    results_df = pd.DataFrame(columns = ['id', 'path', 'text_column', 'prompt_function', 'response', 'prediction', 'label', 'runtime', 'date', 'prompt'])


    if prompt_function == pt.fewshot_prompt_bm25:
        BM25_model = BM25()
        BM25_model.fit(train_df[text_column])

    
    # prompt each document
    for index, row in docs_df.iterrows():
        if (index + 1) % 200 == 0:
            print(f"Iteration {index +1}/{len(docs_df)} completed.")

        start_time = time.time()

        # get the prompt, with the doc filled in
        txt = row[text_column]

        # each prompt function takes different arguments
        # simple function is zeroshot+simple instruction
        if prompt_function == pt.simple_prompt:
            prompt = prompt_function(txt)
      
        # select fewshot examples using bm25
        elif prompt_function == pt.fewshot_prompt_bm25:
            prompt = prompt_function(txt, train_df, num_examples, text_column, BM25_model)

        else:
            raise ValueError("Prompt function not recognised. Check if prompt function is in prompt_template.py and included in the options above.")

        # prompt and get the response
        converse = chatbot(Conversation(prompt))
        response = converse[1]['content']
        print("label: ", row['label'].lower())
        print("response: ", response)

        # extract prediction from response
        prediction = ph.get_prediction_from_response(response)
        print("prediction:", prediction)

        # save results in dataframe
        results_df.loc[len(results_df)] = {
            'id': row['id'],
            'path' : row['path'],
            'text_column' : docs_df.iloc[0]['trunc_col'],
            'prompt_function': ph.get_promptfunction_name(prompt_function),
            'response':response,
            'prediction':prediction,
            'label':row['label'].lower(),
            'runtime':time.time()-start_time,
            'date': ph.get_datetime(),
            'prompt':prompt
        }
    return results_df



In [15]:
import os
import time
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

"""
Function to run GEITje In-Context Learning experiment. 
The function allows to resume experiment, if run_id matches.
"""
# df = dataframe with all docs that need to have a prediction (docs still need to be predict + already predicted)
# run_id = unqiue for each experiment. 
# prompt_function = which prompt from prompt_template.py to use
# text_col = colum in df where the text is. (Needs to be already truncated)
# split_col = column with the dataset split. Either '2split' (train and test)or '4split'(train, test, dev and val)
# subset_train = indicates which subset to use as training. either 'train' or 'dev'
# subset_test = indicates which subset to use for testing. either 'test' or 'val'
# label_col = column with the true label
# prediction_path = path to file where predictions need to be saved.
# overview_path = path to file where results of each run need to be saved.
# model_name = name of the model. string.
# num_exmples = number of exaples given to prompt. zero in case of zeroshot. 

def run_experiment(chatbot, df, run_id, prompt_function, text_col, split_col, subset_train, subset_test, label_col, prediction_path, overview_path, model_name, num_examples=0):
    print(num_examples)
    start_time = time.time()
    test_df = df.loc[df[split_col]==subset_test]
    train_df = df.loc[df[split_col]==subset_train]
    
    # get rows of df that still need to be predicted for the specific run_id
    to_predict, previous_predictions = ph.get_rows_to_predict(test_df, prediction_path, run_id)

    # devide to_predict into subsection of 50 predictions at a time. 
    # Allows to rerun without problem. And save subsections of 50 predictions.
    step_range = list(range(0, len(to_predict), 25))

    for i in range(len(step_range)):
        try:
            sub_to_predict = to_predict.iloc[step_range[i]:step_range[i+1]]
            print(f'Starting...{step_range[i]}:{step_range[i+1]} out of {len(to_predict)}')
        except Exception as e:
            sub_to_predict = to_predict[step_range[i]:]
            print(f'Starting...last {len(sub_to_predict)} docs')

        # prompt geitje
        predictions = predictions_incontextlearning(chatbot, sub_to_predict, text_col, prompt_function, train_df, num_examples)

        # save info
        predictions['run_id'] = run_id
        predictions['train_set'] = subset_train
        predictions['test_set'] = subset_test
        predictions['shots'] = num_examples

        # save new combinations in file
        ph.combine_and_save_df(predictions, prediction_path)

        # if previous predictions, combine previous with new predictions, to get update classification report
        try:
            predictions = pd.concat([predictions, previous_predictions])

            # set previous predictions to all predictions made until now. Necessary for next loop
            previous_predictions = predictions
        except Exception as e:
            # set previous predictions to all predictions made until now. Necessary for next loop
            previous_predictions = predictions

        # save results in overview file
        date = ph.get_datetime()
        y_test = predictions['label']
        y_pred = predictions['prediction']
        report = classification_report(y_test, y_pred)

        overview = pd.DataFrame(
            [{
                'model':model_name,
                'run_id':run_id,
                'date': date,
                'train_set': subset_train,
                'test_set': subset_test,
                'train_set_support':len(df.loc[df[split_col]==subset_train]),
                'test_set_support':len(predictions),
                'split_col':split_col,
                'text_col':df.iloc[0]['trunc_col'],
                'runtime':sum(predictions['runtime']),
                'accuracy': accuracy_score(y_test, y_pred),
                'macro_avg_precision': precision_score(y_test, y_pred, average='macro'),
                'macro_avg_recall': recall_score(y_test, y_pred, average='macro'),
                'macro_avg_f1': f1_score(y_test, y_pred, average='macro'),
                'classification_report':report
            }   ]
        )
        # remove previous results of run_id, replace with new/updated results
        ph.replace_and_save_df(overview, overview_path, run_id)



In [16]:
#set  variables, same for each model
TRAIN_SET = 'dev' # must be dev or train
TEST_SET = 'val' # must be val or test
SPLIT_COLUMN = '4split' #must be either 2split or 4split. 2split = data split into train and test. 4split = data split into train, test, dev and val. 
LABEL_COLUMN = 'label'
TEXT_COLUMN = 'trunc_txt'

In [17]:
txt = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")

In [18]:

PREDICTION_PATH = f"{cf.output_path}/predictions/GeitjeFinetuningPredictions.pkl"
OVERVIEW_PATH = f"{cf.output_path}/overview/GeitjeFinetuningPredictions.pkl"


MODEL_NAME = 'FemkeBakker/GEITjeSmallData200Tokens'
PROMPT = pt.simple_prompt
PROMPT_NAME = ph.get_promptfunction_name(PROMPT)
TOKENS_COL = 'LlamaTokens' # column with text split using tokenizer of either mistral (MistralTokens) or Llama (LlamaTokens). Using Llama, because Llama split into more tokens. 
FRONT_THRESHOLD = 200
BACK_THRESHOLD = 0
NUMBER_EXAMPLES = 0


In [19]:
# ----- EXPERIMENT --------

# add new column with truncated text -> new dataframe with column + new column name
trunc_df = tf.add_truncation_column(txt,'text', TOKENS_COL, FRONT_THRESHOLD, BACK_THRESHOLD)

# if new run MAKE SURE RUN_ID IS UNIQUE, if want to resume run, pass in that run_id
run_experiment(chatbot, trunc_df, f'{PROMPT_NAME}{TOKENS_COL}{FRONT_THRESHOLD}_{BACK_THRESHOLD}{TRAIN_SET}{TEST_SET}_numEx{NUMBER_EXAMPLES}', PROMPT, TEXT_COLUMN, SPLIT_COLUMN, TRAIN_SET, TEST_SET, LABEL_COLUMN, PREDICTION_PATH, OVERVIEW_PATH, MODEL_NAME, NUMBER_EXAMPLES)


0
Starting...0:25 out of 209
label:  agenda
response:  {'categorie': Agenda}
prediction: agenda
label:  brief
response:  {'categorie': Brief}
prediction: brief
label:  agenda
response:  {'categorie': Agenda}
prediction: agenda
label:  onderzoeksrapport
response:  {'categorie': Onderzoeksrapport}
prediction: onderzoeksrapport
label:  factsheet
response:  {'categorie': Factsheet}
prediction: factsheet
label:  voordracht
response:  {'categorie': Voordracht}
prediction: voordracht
label:  motie
response:  {'categorie': Raadsadres}
prediction: raadsadres
label:  motie
response:  {'categorie': Motie}
prediction: motie
label:  schriftelijke vraag
response:  {'categorie': Schriftelijke Vraag}
prediction: schriftelijke vraag
label:  motie
response:  {'categorie': Motie}
prediction: motie


--- Logging error ---
Traceback (most recent call last):
  File "/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/logging/__init__.py", line 1083, in emit
    msg = self.format(record)
  File "/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/logging/__init__.py", line 927, in format
    return fmt.format(record)
  File "/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/logging/__init__.py", line 663, in format
    record.message = record.getMessage()
  File "/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/logging/__init__.py", line 367, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/anaconda/envs/AmsterdamI

label:  raadsadres
response:  {'categorie': Raadsadres}
prediction: raadsadres
label:  raadsadres
response:  {'categorie': Raadsadres}
prediction: raadsadres
label:  schriftelijke vraag
response:  {'categorie': Schriftelijke Vraag}
prediction: schriftelijke vraag
label:  brief
response:  {'categorie': Brief}
prediction: brief
label:  agenda
response:  {'categorie': Agenda}
prediction: agenda
label:  agenda
response:  {'categorie': Agenda}
prediction: agenda
label:  schriftelijke vraag
response:  {'categorie': Schriftelijke Vraag}
prediction: schriftelijke vraag
label:  voordracht
response:  {'categorie': Voordracht}
prediction: voordracht
label:  schriftelijke vraag
response:  {'categorie': Schriftelijke Vraag}
prediction: schriftelijke vraag
label:  motie
response:  {'categorie': Motie}
prediction: motie
label:  agenda
response:  {'categorie': Agenda}
prediction: agenda
label:  schriftelijke vraag
response:  {'categorie': Motie}
prediction: motie
label:  actualiteit
response:  {'categ

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Starting...50:75 out of 209
label:  schriftelijke vraag
response:  {'categorie': Schriftelijke Vraag}
prediction: schriftelijke vraag
label:  schriftelijke vraag
response:  {'categorie': Schriftelijke Vraag}
prediction: schriftelijke vraag
label:  raadsadres
response:  {'categorie': Aanbiedingsformulier}
prediction: NoPredictionInOutput
label:  agenda
response:  {'categorie': Agenda}
prediction: agenda
label:  agenda
response:  {'categorie': Agenda}
prediction: agenda
label:  motie
response:  {'categorie': Motie}
prediction: motie
label:  motie
response:  {'categorie': Motie}
prediction: motie
label:  agenda
response:  {'categorie': Agenda}
prediction: agenda
label:  motie
response:  {'categorie': Motie}
prediction: motie
label:  motie
response:  {'categorie': Motie}
prediction: motie
label:  brief
response:  {'categorie': Brief}
prediction: brief
label:  brief
response:  {'categorie': Brief}
prediction: brief
label:  schriftelijke vraag
response:  {'categorie': Schriftelijke Vraag}
pr

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Starting...75:100 out of 209
label:  motie
response:  {'categorie': Motie}
prediction: motie
label:  motie
response:  {'categorie': Motie}
prediction: motie
label:  voordracht
response:  {'categorie': Voordracht}
prediction: voordracht
label:  onderzoeksrapport
response:  {'categorie': Onderzoeksrapport}
prediction: onderzoeksrapport
label:  voordracht
response:  {'categorie': Voordracht}
prediction: voordracht
