In [4]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [5]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf

- Load data 
- text truncation
- load file with prompts


In [6]:
def text_truncation(tokens_list, maximum=500):
    selected_tokens = tokens_list[:maximum]  # Select the first 300 tokens
    return ' '.join(selected_tokens)  # Convert the list back to text

In [7]:
# import pandas as pd
# df = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")

# df = df.loc[df['set']=='val'].head(2)
# df['text_trunc_100'] = df['tokens'].apply(text_truncation,100)
# df['text_trunc_1000'] = df['tokens'].apply(text_truncation,1000)

# # import sys
# # sys.path.append('../scripts/') 
# # import prompt_template as pt


### Experiment set-up
Prompt GEITje for each document and save the prediction, return response, response time and the prompt version

In [9]:
from transformers import pipeline, Conversation

chatbot = pipeline(task='conversational', model='Rijgersberg/GEITje-7B-chat-v2',
                   device_map='auto')

2024-04-05 12:36:01.793166: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-04-05 12:36:01.793199: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Function to prompt the model. This is code for one complete run.

In [10]:
import time
import sys
sys.path.append('../scripts/') 
import prompt_template as pt
import prediction_helperfunctions as ph
import os
import pandas as pd


""" Given a dataframe with txt, return a df with predictions """
# docs_df = dataframe with the documents that need to be predicted
# text_column = name of the column that includes the input_text. Can be different based on the text representation method. 
# prompt_function = prompt template -> ONLY prompt templates that take doc as input (ZERO SHOT)

def zero_shot_predictions_incontextlearning(docs_df, text_column, prompt_function):
    results_df = pd.DataFrame(columns = ['id', 'path', 'text_column', 'prompt_function', 'response', 'prediction', 'label', 'runtime', 'date'])
    
    # prompt each document
    for index, row in docs_df.iterrows():
        if (index + 1) % 200 == 0:
            print(f"Iteration {index +1}/{len(docs_df)} completed.")

        start_time = time.time()

        # get the prompt, with the doc filled in
        txt = row[text_column]
        prompt = prompt_function(txt)

        # prompt and get the response
        converse = chatbot(Conversation(prompt))
        response = converse[1]['content']

        # extract prediction from response
        prediction = ph.get_prediction_from_response(response)

        # save results in dataframe
        results_df.loc[len(results_df)] = {
            'id': row['id'],
            'path' : row['path'],
            'text_column' : text_column,
            'prompt_function': ph.get_promptfunction_name(prompt_function),
            'response':response,
            'prediction':prediction,
            'label':row['label'].lower(),
            'runtime':time.time()-start_time,
            'date': ph.get_datetime()
        }
    return results_df

""" Run a prediction function -> can be ZeroShot or FewShot """
def run_prediction(docs_df, text_column, prompt_function, set_run_id, save_predictions_path, subset, learning='ZeroShot'):
    # check input
    ph.check_input_set_run_id(set_run_id)
    ph.check_input_subset(subset)

    # set run id, based on whether part of the documents of the run are already predicted
    # if set_run_id == new, then this is a new run
    # if set_run_id == previous, resume previous run. The df should only contain docs that have not been predicted before in the run. 
    # this is a safe fail incase the predictions gets cut off halfway through.
    run_id = ph.get_runid(set_run_id)


    if learning == 'ZeroShot':
        # get the predictions
        res = zero_shot_predictions_incontextlearning(docs_df, text_column, prompt_function)
        res['run_id'] = run_id

        # INSERT ELSE STATEMENT HERE FOR FEWSHOT

        # combine earlier predictions with new ones
        # path = f"{cf.output_path}/predictions/ICgeitje_predictions.pkl"
        all_predictions = ph.combine_current_with_previous_predictions(save_predictions_path, res)

        # save predictions
        all_predictions.to_pickle(save_predictions_path)

        # select all predictions of current run
        predictions = all_predictions.loc[all_predictions['run_id']==run_id]

        # save the evaluation metrics for each run
        ph.update_overview_results(predictions, 'Rijgersberg/GEITje-7B-chat-v2', save_predictions_path, subset)
        display(all_predictions)
        return all_predictions
    

# gestart om 10.15
# path = f"{cf.output_path}/predictions/ICgeitje_predictions.pkl"
# res = run_prediction(df, 'text_trunc_100', pt.simple_prompt, 'new',path, 'val')
# display(res)


The dataset is very big, thus we make subsections of the df, so that it will save in between runs. 

In [11]:
def run_in_subsections(df, path, set_run_id):
    # df = all the docs that need to be predicted. Make sure that it does not include docs that already have been predicted for the run. That would result in duplicates.
    # path = file where previous predictions of the run are saved, and where the new predictions will be saved.
    # set_run_id = 'new', 'previous' or an integer. 


    # devide df into sections of 50 docs
    iterations = list(range(0, len(df)+50, 50))

    # for each sections, get the predictions
    for i in range(len(iterations)):
        try:
            print("\n", "iterations", iterations[i], iterations[i+1], "\n")
            subdf = df.iloc[iterations[i]:iterations[i+1]]
            

        except IndexError:
            print("\n", "iterations", iterations[i], "\n")
            subdf = df.iloc[iterations[i]:]
            print(len(subdf))
        
        if len(subdf) != 0:
            if set_run_id == 'new' and iterations[i]==0:
                run_prediction(subdf, 'text_trunc_100', pt.simple_prompt, 'new', path, 'val')
            else:
                run_prediction(subdf, 'text_trunc_100', pt.simple_prompt, set_run_id, path, 'val')



# path = f"{cf.output_path}/predictions/ICgeitje_predictions_tryout.pkl"
# run_in_subsections(df.head(2), path, 11)

Since the running can take very long, this function allows to pick-up the run where it left off

In [12]:
def resume_predictions(df, save_predictions_path, run_id):
    # df = all the docs that need to be included in a run. Should inculde ALL docs, also the ones that have already been predicted. 
    # path = file where previous predictions of the run are saved, and where the new predictions will be saved.
    # set_run_id = should be an integer. 

    # get previous predictions for this specific run
    previous_predictions = pd.read_pickle(save_predictions_path)
    predictions_of_run = previous_predictions.loc[previous_predictions['run_id']==run_id]

    # if no earlier predictions are made for the run, get new_id
    if len(previous_predictions) == 0:
        run_id='new'

    # select the docs that have not been predicted yet. 
    not_predicted = df.loc[~df['path'].isin(predictions_of_run['path'])]

    # predict the not predicted docs in subsections
    run_in_subsections(not_predicted,save_predictions_path,run_id)

    

# path = f"{cf.output_path}/predictions/ICgeitje_predictions.pkl"
# resume_predictions(df.tail(5), path, 2)

In [13]:
import pandas as pd
df = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")
df = df.loc[df['set']=='val']
df['text_trunc_100'] = df['tokens'].apply(text_truncation,100)
df['text_trunc_1000'] = df['tokens'].apply(text_truncation,1000)

  df['text_trunc_100'] = df['tokens'].apply(text_truncation,100)
  df['text_trunc_1000'] = df['tokens'].apply(text_truncation,1000)


In [14]:
path = f"{cf.output_path}/predictions/ICgeitje_predictions.pkl"
resume_predictions(df, path, 2)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



 iterations 0 50 



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

# dummy code

In [None]:
def run_in_subsections(df, path, set_run_id):

    iterations = list(range(0, len(df)+50, 50))
    for i in range(len(iterations)):
        try:
            subdf = df.iloc[iterations[i]:iterations[i+1]]

        except IndexError:
            subdf = df.iloc[iterations[i]:]

        # if set_run_id == 'new' and iterations[i]==0:
        #     run_prediction(subdf, 'text_trunc_100', pt.simple_prompt, 'new', path, 'val')
        # else:
        #     run_prediction(subdf, 'text_trunc_100', pt.simple_prompt, 'previous', path, 'val')



path = f"{cf.output_path}/predictions/ICgeitje_predictions_tryout.pkl"
run_in_subsections(df, path)

0 50
50 100
100 150
150 200
200 250
250 300
300 350
350 400
400 450
450 500
500 550
550 600
600 650
650 700
700 750
750 800
800 850
850 900
900 950
950 1000
1000 1050
1050 1100
1100 1150
1150 1200
1200 1250
1250 1300
1300 1350
1350 1400
1400 1450
1450 1500
1500 1550
1550 1600
1600 1650
1650 1700
1700 1750
1750 1800
1800 1850
1850 1900
1900 1950
1950 2000
2000 2050
2050 2100
2100 2150
2150 2200
2200 2250
2250 2300
2300 2350
2350 2400
2400 2450
2450 2500
2500 2550
2550 2600
2600 2650
2650 2700
2700 2750
2750 2800
2800 2850
2850 2900
2900 2950
2950 3000
3000 3050
3050 3100
3100 3150
3150 3200
3200 3250
3250 3300
3300 3350
3350 3400
3400 3450
3450 3500
3500 3550
3550 3600
3600 3650
3650 3700
3700 3750
3750 3800
3800 3850
3850 3900
3900 3950
3950 4000
4000 4050
4050 4100
4100 4150
4150 4200
4200 4250
4250 4300
4300 4350
4350 4400
4400 4450
4450 4500
4500 4550
4550 4600
4600 4650
4650 4700
4700 4750
4750 4800
4800 4850
4850 4900
4900 4950
4950 5000
5000 5050
5050 5100
5100 5150
5150 5200
520

  df['text_trunc_100'] = df['tokens'].apply(text_truncation,100)
  df['text_trunc_1000'] = df['tokens'].apply(text_truncation,1000)


In [None]:
def run_in_subsections(df, path):
    subdf = df.iloc[0:50]
    run_prediction(subdf, 'text_trunc_100', pt.simple_prompt, 'new', path, 'val')

    iterations = list(range(50, len(df)+50, 50))
    for i in range(len(iterations)):
        if i < len(iterations)-2:
            subdf = df.iloc[iterations[i]:iterations[i+1]]
            print("\n", "iterations", iterations[i], iterations[i+1], "\n")
            run_prediction(subdf, 'text_trunc_100', pt.simple_prompt, 'previous', path, 'val')

        elif i < len(iterations)-1:
            subdf = df.iloc[iterations[i]:]
            print("\n", "iterations", iterations[i], '\n')
            run_prediction(subdf, 'text_trunc_100', pt.simple_prompt, 'previous', path, 'val')

path = f"{cf.output_path}/predictions/ICgeitje_predictions_tryout.pkl"
run_in_subsections(df, path)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio


 iterations 50 100 



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Iteration 200/50 completed.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio


 iterations 100 150 



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

Iteration 600/50 completed.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio


 iterations 150 200 



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

In [None]:
yeet = pd.read_pickle(f"{cf.output_path}/predictions/ICgeitje_predictions.pkl")
display(yeet)

yeet = pd.read_pickle(f"{cf.output_path}/overview_results.pkl")
display(yeet)

Unnamed: 0,id,path,text_column,prompt_function,response,prediction,label,runtime,date,run_id
0,16,/home/azureuser/cloudfiles/code/blobfuse/raads...,text_trunc_100,simple_prompt,Brief,brief,motie,8.043233,2024-04-04 20:04:20.534788+02:00,0
1,17,/home/azureuser/cloudfiles/code/blobfuse/raads...,text_trunc_100,simple_prompt,Voordracht,voordracht,motie,10.173065,2024-04-04 20:04:30.777959+02:00,0
0,16,/home/azureuser/cloudfiles/code/blobfuse/raads...,text_trunc_100,simple_prompt,Brief,brief,motie,8.021667,2024-04-04 20:04:50.112805+02:00,1
1,17,/home/azureuser/cloudfiles/code/blobfuse/raads...,text_trunc_100,simple_prompt,Voordracht,voordracht,motie,9.520302,2024-04-04 20:04:59.634226+02:00,1


Unnamed: 0,run_id,model,prompt_function,text_column,date,runtime,set,support,accuracy,recall_weighted_avg,precision_weighted_avg,f1_weighted_avg,recall_macro_avg,precision_macro_avg,f1_macro_avg,recall_classes,precision_classes,f1_classes,support_classes,doc_paths
0,0,Rijgersberg/GEITje-7B-chat-v2,simple_prompt,text_trunc_100,2024-04-04 20:04:30.839867+02:00,18.216298,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"{'brief': 0.0, 'motie': 0.0, 'voordracht': 0.0}","{'brief': 0.0, 'motie': 0.0, 'voordracht': 0.0}","{'brief': 0.0, 'motie': 0.0, 'voordracht': 0.0}","{'brief': 0.0, 'motie': 2.0, 'voordracht': 0.0}",[/home/azureuser/cloudfiles/code/blobfuse/raad...
0,1,Rijgersberg/GEITje-7B-chat-v2,simple_prompt,text_trunc_100,2024-04-04 20:04:59.694706+02:00,17.541969,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"{'brief': 0.0, 'motie': 0.0, 'voordracht': 0.0}","{'brief': 0.0, 'motie': 0.0, 'voordracht': 0.0}","{'brief': 0.0, 'motie': 0.0, 'voordracht': 0.0}","{'brief': 0.0, 'motie': 2.0, 'voordracht': 0.0}",[/home/azureuser/cloudfiles/code/blobfuse/raad...


In [None]:
import time
import sys
sys.path.append('../scripts/') 
import prompt_template as pt
import prediction_helperfunctions as ph


""" Given a dataframe with txt, return a df with predictions """
# docs_df = dataframe with the documents that need to be predicted
# text_column = name of the column that includes the input_text. Can be different based on the text representation method. 
# prompt_function = prompt template -> ONLY prompt templates that take doc as input (ZERO SHOT)

def zero_shot_predictions_incontextlearning(docs_df, text_column, prompt_function):
    results_df = pd.DataFrame(columns = ['id', 'path', 'text_column', 'prompt_function', 'response', 'prediction', 'label', 'runtime', 'date'])
    
    # prompt each document
    for index, row in docs_df.iterrows():
        if (index + 1) % 200 == 0:
            print(f"Iteration {index +1}/{len(docs_df)} completed.")

        start_time = time.time()

        # get the prompt, with the doc filled in
        txt = row[text_column]
        prompt = prompt_function(txt)

        # prompt and get the response
        converse = chatbot(Conversation(prompt))
        response = converse[1]['content']

        # extract prediction from response
        prediction = ph.get_prediction_from_response(response)

        # save results in dataframe
        results_df.loc[len(results_df)] = {
            'id': row['id'],
            'path' : row['path'],
            'text_column' : text_column,
            'prompt_function': ph.get_promptfunction_name(prompt_function),
            'response':response,
            'prediction':prediction,
            'label':row['label'].lower(),
            'runtime':time.time()-start_time,
            'date': ph.get_datetime()
        }
    return results_df

""" Run a prediction function -> can be ZeroShot or FewShot """
def run_prediction(docs_df, text_column, prompt_function, subset=None, learning='ZeroShot'):
    if learning == 'ZeroShot':
        # get the predictions
        res = zero_shot_predictions_incontextlearning(docs_df, text_column, prompt_function)

        # INSERT ELSE STATEMENT HERE FOR FEWSHOT

        # get run_id
        path = f"{cf.output_path}/predictions/ICgeitje_predictions.pkl"
        res['run_id'], predictions_df = ph.get_runid(path)

        # combine earlier predictions with new ones
        all_predictions = pd.concat([predictions_df, res])

        # save predictions
        all_predictions.to_pickle(path)

        # save the evaluation metrics for each run
        ph.update_overview_results(res, 'Rijgersberg/GEITje-7B-chat-v2')
        return res
# gestart om 10.15/
# res = run_prediction(df, 'text_trunc_100', pt.simple_prompt, 'val')
# display(res)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

Iteration 600/5374 completed.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

KeyboardInterrupt: 

In [None]:
yeet = pd.read_pickle(f"{cf.output_path}/overview_results.pkl")
display(yeet)

yeet = pd.read_pickle(f"{cf.output_path}/predictions/ICgeitje_predictions.pkl")
display(yeet)


Unnamed: 0,run_id,model,prompt_function,text_column,date,runtime,set,support,accuracy,recall_weighted_avg,precision_weighted_avg,f1_weighted_avg,recall_macro_avg,precision_macro_avg,f1_macro_avg,recall_classes,precision_classes,f1_classes,support_classes,doc_paths
0,0,Rijgersberg/GEITje-7B-chat-v2,simple_prompt,text_trunc,2024-04-03 15:34:12.322972+02:00,20.163474,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"{'brief': 0.0, 'motie': 0.0, 'voordracht': 0.0}","{'brief': 0.0, 'motie': 0.0, 'voordracht': 0.0}","{'brief': 0.0, 'motie': 0.0, 'voordracht': 0.0}","{'brief': 0.0, 'motie': 2.0, 'voordracht': 0.0}",[/home/azureuser/cloudfiles/code/blobfuse/raad...
0,1,Rijgersberg/GEITje-7B-chat-v2,simple_prompt,text_trunc,2024-04-03 15:35:38.851973+02:00,53.37181,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"{'brief': 0.0, 'motie': 0.0, 'voordracht': 0.0}","{'brief': 0.0, 'motie': 0.0, 'voordracht': 0.0}","{'brief': 0.0, 'motie': 0.0, 'voordracht': 0.0}","{'brief': 0.0, 'motie': 2.0, 'voordracht': 0.0}",[/home/azureuser/cloudfiles/code/blobfuse/raad...


Unnamed: 0,id,path,text_column,prompt_function,response,prediction,label,runtime,date,run_id
0,0,/home/azureuser/cloudfiles/code/blobfuse/raads...,text_trunc,simple_prompt,Brief,brief,motie,8.250339,2024-04-03 15:34:00.278762+02:00,0
1,1,/home/azureuser/cloudfiles/code/blobfuse/raads...,text_trunc,simple_prompt,Voordracht,voordracht,motie,11.913135,2024-04-03 15:34:12.251143+02:00,0
0,0,/home/azureuser/cloudfiles/code/blobfuse/raads...,text_trunc,simple_prompt,Brief,brief,motie,25.226653,2024-04-03 15:35:10.573034+02:00,1
1,1,/home/azureuser/cloudfiles/code/blobfuse/raads...,text_trunc,simple_prompt,Voordracht,voordracht,motie,28.145157,2024-04-03 15:35:38.719588+02:00,1


#### Tryout GEITje
Load chatbot

In [None]:
from transformers import pipeline, Conversation

chatbot = pipeline(task='conversational', model='Rijgersberg/GEITje-7B-chat-v2',
                   device_map='auto')

## simple query
print(chatbot(
    Conversation("Hallo, ik ben Bram. Ik wil vanavond graag een film kijken. Heb je enkele suggesties?")
))



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
from transformers import pipeline, Conversation

# load_in_8bit: lower precision but saves a lot of GPU memory
# device_map=auto: loads the model across multiple GPUs
# chatbot = pipeline("conversational", model="BramVanroy/GEITje-7B-ultra",  model_kwargs={"load_in_8bit": True}, device_map="auto")
chatbot = pipeline("conversational", model="BramVanroy/GEITje-7B-ultra",  device_map="auto")

# start_messages = [
#     # {"role": "system", "content": "Je bent een grappige chatbot die Bert heet. Je maakt vaak mopjes."},
#     {"role": "user", "content": "Hallo, ik ben Bram. Ik wil vanavond graag een film kijken. Heb je enkele suggesties?"}
# ]
# conversation = Conversation(start_messages)
# conversation = chatbot(conversation)
# response = conversation.messages[-1]["content"]
# print(response)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [None]:
txt = df.iloc[0]['text']
prompt = f"""

Classificeer de gegeven tekst in 1 van de categoriën.
Geef als reactie enkel de naam van de categorie
Categorieën: ['Voordracht', 'Besluit', 'Schriftelijke Vragen', 'Brief', 'Raadsadres', 'Onderzoeksrapport', 'Termijnagenda', 'Raadsnotulen', 'Agenda', 'Motie', 'Actualiteit', 'Factsheets']
Tekst: 

{txt}

""" 

start_messages = [
    {"role": "system", "content": "Jouw enige taak is om teksten te classificeren. Je geeft geen uitleg voor je keuzes."},
    {"role": "user", "content": prompt}
]

In [None]:
chatbot(Conversation(start_messages))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


KeyboardInterrupt: 

In [None]:
%pip install accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [None]:
df = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")


In [None]:
display(df)

Unnamed: 0,label,path,id,set,text,tokens,token_count,clean_tokens,clean_tokens_count,pdf_path,num_pages,clean_text
0,Motie,/home/azureuser/cloudfiles/code/blobfuse/raads...,0,train,Gemeente Amsterdam\n% Gemeenteraad R\n% Gemeen...,"[Gemeente, Amsterdam, %, Gemeenteraad, R, %, G...",395,"[Gemeente, Amsterdam, Gemeenteraad, Gemeentebl...",205,/home/azureuser/cloudfiles/code/blobfuse/raads...,2.0,Gemeente Amsterdam Gemeenteraad Gemeenteblad M...
1,Motie,/home/azureuser/cloudfiles/code/blobfuse/raads...,1,train,Gemeente Amsterdam\n\n% Gemeenteraad R\n\n% Ge...,"[Gemeente, Amsterdam, %, Gemeenteraad, R, %, G...",390,"[Gemeente, Amsterdam, Gemeenteraad, Gemeentebl...",197,/home/azureuser/cloudfiles/code/blobfuse/raads...,2.0,Gemeente Amsterdam Gemeenteraad Gemeenteblad M...
2,Motie,/home/azureuser/cloudfiles/code/blobfuse/raads...,2,train,Gemeente Amsterdam\n\n% Gemeenteraad R\n\n% Ge...,"[Gemeente, Amsterdam, %, Gemeenteraad, R, %, G...",389,"[Gemeente, Amsterdam, Gemeenteraad, Gemeentebl...",192,/home/azureuser/cloudfiles/code/blobfuse/raads...,2.0,Gemeente Amsterdam Gemeenteraad Gemeenteblad M...
3,Motie,/home/azureuser/cloudfiles/code/blobfuse/raads...,3,train,Gemeente Amsterdam\n\n% Gemeenteraad R\n\n% Ge...,"[Gemeente, Amsterdam, %, Gemeenteraad, R, %, G...",464,"[Gemeente, Amsterdam, Gemeenteraad, Gemeentebl...",225,/home/azureuser/cloudfiles/code/blobfuse/raads...,2.0,Gemeente Amsterdam Gemeenteraad Gemeenteblad M...
4,Motie,/home/azureuser/cloudfiles/code/blobfuse/raads...,4,test,x Gemeente Amsterdam R\nGemeenteraad\n\n% Geme...,"[x, Gemeente, Amsterdam, R, Gemeenteraad, %, G...",261,"[Gemeente, Amsterdam, Gemeenteraad, Gemeentebl...",134,/home/azureuser/cloudfiles/code/blobfuse/raads...,1.0,Gemeente Amsterdam Gemeenteraad Gemeenteblad M...
...,...,...,...,...,...,...,...,...,...,...,...,...
33112,Factsheets,/home/azureuser/cloudfiles/code/blobfuse/raads...,33123,train,"In Amsterdam is, net als in andere delen van h...","[In, Amsterdam, is, ,, net, als, in, andere, d...",3816,"[Amsterdam, net, delen, land, sprake, tekort, ...",2001,/home/azureuser/cloudfiles/code/blobfuse/raads...,10.0,Amsterdam net delen land sprake tekort leraren...
33113,Factsheets,/home/azureuser/cloudfiles/code/blobfuse/raads...,33124,val,| September 2021 C 1 / )\nr 1E | rs | a 4 in\n...,"[|, September, 2021, C, 1, /, ), r, 1E, |, rs,...",85228,"[September, 2021, 1E, rs, ke, We, ear, nrc, TW...",43467,/home/azureuser/cloudfiles/code/blobfuse/raads...,164.0,September 2021 1E rs ke We ear nrc TW Hr mn zn...
33114,Factsheets,/home/azureuser/cloudfiles/code/blobfuse/raads...,33125,train,WPI KWARTAAL FACTSHEET DECEMBER 2022\nKERNCIJF...,"[WPI, KWARTAAL, FACTSHEET, DECEMBER, 2022, KER...",1595,"[WPI, KWARTAAL, FACTSHEET, DECEMBER, 2022, KER...",970,/home/azureuser/cloudfiles/code/blobfuse/raads...,7.0,WPI KWARTAAL FACTSHEET DECEMBER 2022 KERNCIJFE...
33115,Factsheets,/home/azureuser/cloudfiles/code/blobfuse/raads...,33126,train,"WPI VOORTGANGSRAPPORTAGE\nKERNCIJFERS WERK, PA...","[WPI, VOORTGANGSRAPPORTAGE, KERNCIJFERS, WERK,...",1821,"[WPI, VOORTGANGSRAPPORTAGE, KERNCIJFERS, WERK,...",1024,/home/azureuser/cloudfiles/code/blobfuse/raads...,6.0,WPI VOORTGANGSRAPPORTAGE KERNCIJFERS WERK PART...


In [None]:
text = df.loc[df['clean_tokens_count'].idxmax()]['text']
print(df.loc[df['clean_tokens_count'].idxmax()]['clean_tokens_count'])

print(pt.simple_prompt(text))

143782

    Classificeer het document in één van de categoriën.
    Houd het kort, geef enkel de naam van de categorie als response.
    
    Categoriën: ['Voordracht', 'Besluit', 'Schriftelijke Vragen', 'Brief', 'Raadsadres', 'Onderzoeksrapport', 'Termijnagenda', 'Raadsnotulen', 'Agenda', 'Motie', 'Actualiteit', 'Factsheets']
    
    Document: 
    se a en wac SE | RE
ET ee enn oe ed La \ Á 4 +
nt On ee Ee ee 1E VD an] Í _ = ___
en en de ed eee | raul Fai
sr ee nn en nnen men ee etheen ze dd Te Fa
En ef ee en en Te nn | 1 | Ank
en Á en EE es EE: - ; ; A ; D
Ees ee Ef en er En as en nb er
ee he ie Ee ns Ee EE , se ee Tees 2 rent Er eg geen; E
EE es en en En en d Me Ee a EE en BENN
En en en ne Te A Ee Zr me et MEE ii P, ,
: en Ee ma on fr ee Tt en es ee en ï B Lt
En en en Ees ie Ee EE il En == en ne en en mn 8 je
Ee en ES ns men Ee Pen En en LN 4 ES En Te eenen
ge Ss Ee nn nn eht a en ee en ze ee
EEE es nn en en en el ee ee En en ee en en
Ss En ee n En En ee == nn - mi Fo L ï en, ee Se

In [None]:
print(chatbot(
    Conversation(pt.simple_prompt(text))
    ))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'Rijgersberg/GEITje-7B-chat-v2'
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16,
                                             low_cpu_mem_usage=True, attn_implementation='eager',
                                             device_map=device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def generate(conversation, temperature=0.2, top_k=50, max_new_tokens=1_000):
    tokenized = tokenizer.apply_chat_template(conversation, add_generation_prompt=True,
                                              return_tensors='pt').to(device)
    outputs = model.generate(tokenized, do_sample=True, temperature=temperature,
                             top_k=top_k, max_new_tokens=max_new_tokens)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

conversation = [
    {
        'role': 'user',
        'content': 'Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"?'
    }
]
print(generate(conversation))
# <|user|>
# Welk woord hoort er niet in dit rijtje thuis: "auto, vliegtuig, geitje, bus"? 
# <|assistant|>
# Het woord dat niet op zijn plaats staat is 'geit'. Een geit zou niet tussen een lijst van vervoersmiddelen moeten staan. Het past beter bij een boerderijthema of dierenlijst.

### BACK-UP CODE

In [None]:
from collections import Counter
import re
import time
import datetime
import pytz
import os
from sklearn.metrics import classification_report


""" Given the string response, extract the prediction """
def get_prediction_from_response(response):
    # get a list of the possible classes
    classes_list = pt.get_class_list()

    predictions = [True if category.lower() in response.lower() else False for category in classes_list]

    # check if multiple classes were named, this is a prediction error
    if Counter(predictions)[True] > 1:
        return "PredictionError"

    # check if exactly one class is named, this is the prediction
    elif Counter(predictions)[True] == 1:
        prediction = [category.lower() for category in classes_list if category.lower() in response.lower()]
        return prediction[0]

    # if no class is named, then this is a no prediction error
    else:
        return 'NoPrediction'

""" Extract the promptfunction name """
def get_promptfunction_name(prompt_function):
    string = f"{prompt_function}"
    match = re.search(r'<function\s+(\w+)', string)
    if match:
        function_name = match.group(1)
        return function_name
    else:
        return f"{prompt_function}"
    
""" Get the current time in the Netherlands """
def get_datetime():
    current_datetime_utc = datetime.datetime.now(pytz.utc)

    # Convert UTC time to Dutch time (CET)
    dutch_timezone = pytz.timezone('Europe/Amsterdam')
    current_datetime_dutch = current_datetime_utc.astimezone(dutch_timezone)
    return current_datetime_dutch
        
""" Get the new runid """
def get_runid(path):

    # if not first run, set runid to most recent run+1
    if os.path.exists(path):
        df = pd.read_pickle(path)
        return max(df['run_id'])+1, df

    # if first run, set runid to 0
    else:
        return 0, pd.DataFrame()
    
""" Save evaluation metrics of a run """
def update_overview_results(df, model_name, subset=None):
    # df= dataframe with predictions for each do, one row per doc/prediction
    # model_name = string with the name of the model
    # subset = can be train, val, or test, or left open
 
    # get evalaution scores
    evaluation_dict = classification_report(df['label'], df['prediction'], output_dict=True)
    evaluation = pd.DataFrame(evaluation_dict).transpose()
    
    new_row = {
        # stuff about the run
        'run_id':df.iloc[0]['run_id'],
        'model':model_name,
        'prompt_function':df.iloc[0]['prompt_function'],
        'text_column':df.iloc[0]['text_column'],
        'date': get_datetime(),
        'runtime':sum(df['runtime']),
        'set':subset,
        'support':evaluation.iloc[-1]['support'],

        # evaluation
        'accuracy': evaluation_dict['accuracy'],

        'recall_weighted_avg':evaluation.loc[evaluation.index=='weighted avg']['recall'].values[0],
        'precision_weighted_avg': evaluation.loc[evaluation.index=='weighted avg']['precision'].values[0],
        'f1_weighted_avg': evaluation.loc[evaluation.index=='weighted avg']['f1-score'].values[0],

        'recall_macro_avg':evaluation.loc[evaluation.index=='macro avg']['recall'].values[0],
        'precision_macro_avg': evaluation.loc[evaluation.index=='macro avg']['precision'].values[0],
        'f1_macro_avg': evaluation.loc[evaluation.index=='macro avg']['f1-score'].values[0],


        'recall_classes': dict(zip(evaluation.index[0:-3], evaluation['recall'][0:-3])),
        'precision_classes': dict(zip(evaluation.index[0:-3], evaluation['precision'][0:-3])),
        'f1_classes': dict(zip(evaluation.index[0:-3], evaluation['f1-score'][0:-3])),
        'support_classes': dict(zip(evaluation.index[0:-3], evaluation['support'][0:-3])),

        # docs that were predicted
        'doc_paths':list(df['path'].values)
        
    }

    # create a new dataframe with the evaluation, each run has one row
    results = pd.DataFrame(columns=new_row.keys())
    results.loc[len(results)] = new_row
   
    # if not the first run, get results from previous runs
    path = f"{cf.output_path}/overview_results.pkl"
    if os.path.exists(path):
        earlier_results = pd.read_pickle(path)

        # combine evaluation of previous runs with current run
        results = pd.concat([earlier_results, results])

    # save to overview_results.pkl
    results.to_pickle(path)
   


In [None]:
# update_overview_results(res, 'geitje')

In [None]:
# yeet = pd.read_pickle(f"{cf.output_path}/overview_results.pkl")
# display(yeet)

# yeet = pd.read_pickle(f"{cf.output_path}/predictions/ICgeitje_predictions.pkl")
# display(yeet)


In [None]:
import time
import datetime


""" Given a dataframe with txt, return a df with predictions """
# docs_df = dataframe with the documents that need to be predicted
# text_column = name of the column that includes the input_text. Can be different based on the text representation method. 
# prompt_function = prompt template -> ONLY prompt templates that take doc as input (ZERO SHOT)

def zero_shot_predictions_incontextlearning(docs_df, text_column, prompt_function):
    results_df = pd.DataFrame(columns = ['id', 'path', 'text_column', 'prompt_function', 'response', 'prediction', 'label', 'runtime', 'date'])
    
    # prompt each document
    for index, row in docs_df.iterrows():
        start_time = time.time()

        # get the prompt, with the doc filled in
        txt = row[text_column]
        prompt = prompt_function(txt)

        # prompt and get the response
        converse = chatbot(Conversation(prompt))
        response = converse[1]['content']

        # extract prediction from response
        prediction = get_prediction_from_response(response)

        # save results in dataframe
        results_df.loc[len(results_df)] = {
            'id': row['id'],
            'path' : row['path'],
            'text_column' : text_column,
            'prompt_function': get_promptfunction_name(prompt_function),
            'response':response,
            'prediction':prediction,
            'label':row['label'].lower(),
            'runtime':time.time()-start_time,
            'date': get_datetime()
        }
    return results_df

""" Run a prediction function -> can be ZeroShot or FewShot """
def run_prediction(docs_df, text_column, prompt_function, subset=None, learning='ZeroShot'):
    if learning == 'ZeroShot':
        # get the predictions
        res = zero_shot_predictions_incontextlearning(docs_df, text_column, prompt_function)

        # INSERT ELSE STATEMENT HERE FOR FEWSHOT

        # get run_id
        path = f"{cf.output_path}/predictions/ICgeitje_predictions.pkl"
        res['run_id'], predictions_df = get_runid(path)

        # combine earlier predictions with new ones
        all_predictions = pd.concat([predictions_df, res])

        # save predictions
        all_predictions.to_pickle(path)

        # save the evaluation metrics for each run
        update_overview_results(res, 'Rijgersberg/GEITje-7B-chat-v2')
        return res

res = run_prediction(df, 'text_trunc', pt.simple_prompt)
display(res)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,id,path,text_column,prompt_function,response,prediction,label,runtime,date,run_id
0,0,/home/azureuser/cloudfiles/code/blobfuse/raads...,text_trunc,simple_prompt,Brief,brief,motie,7.768863,2024-04-03 15:09:04.137299+02:00,1
1,1,/home/azureuser/cloudfiles/code/blobfuse/raads...,text_trunc,simple_prompt,Voordracht,voordracht,motie,11.083132,2024-04-03 15:09:15.221748+02:00,1
