In [None]:
!pip install bert-score transformers==4.28.0 datasets evaluate rouge_score sentencepiece bert_score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece
  Downloading sentencepiec

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (Colab only); the pickled ranking results
# loaded below are read from /content/drive/MyDrive/AQA_CleanData.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle
from collections import Counter

from bert_score import score as bert_score
from datasets import Dataset
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score

#### Load Data

In [None]:
# Ranking results for the TF-IDF retrieval run, stored as a pickle on Google Drive.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
with open(
    '/content/drive/MyDrive/AQA_CleanData/results_tfidf_questions_for_ranking_passages_df.pickle',
    mode='rb',
) as f:
    results_tfidf = pickle.load(f)

In [None]:
# Ranking results for the random-passage baseline, stored as a pickle on Google Drive.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
with open(
    '/content/drive/MyDrive/AQA_CleanData/results_random_questions_for_ranking_passages_df.pickle',
    mode='rb',
) as f:
    results_random = pickle.load(f)

In [None]:
def create_dataset(results_ranking, input_type, max_passages=10):
    """Build a `Dataset` of model inputs and reference answers.

    Every input is a concatenation of segments, each followed by ' </s> ':
    the question (for 'question' and 'question_context') and then the first
    `max_passages` ranked passages (for 'context' and 'question_context').

    Args:
        results_ranking: Iterable of records indexable by the keys 'question',
            'answer' and 'ranked_text' ('ranked_text' is sliced and its items
            are concatenated as strings).
        input_type: One of 'question', 'context' or 'question_context'.
        max_passages: Upper bound on the number of ranked passages appended to
            each input; `None` keeps all of them. Defaults to 10.

    Returns:
        A `Dataset` with the columns 'inputs' and 'target_outputs'.

    Raises:
        ValueError: If `input_type` is not one of the supported values
            (otherwise every input would silently be an empty string).
    """
    valid_input_types = ('question', 'context', 'question_context')
    if input_type not in valid_input_types:
        raise ValueError(
            f"input_type must be one of {valid_input_types}, got {input_type!r}"
        )

    use_question = input_type in ('question', 'question_context')
    use_context = input_type in ('context', 'question_context')

    inputs = []
    target_outputs = []
    for result in results_ranking:
        segments = []
        if use_question:
            segments.append(result['question'])
        if use_context:
            segments.extend(result['ranked_text'][:max_passages])
        # Every segment, including the last one, is terminated by the separator.
        inputs.append(''.join(segment + ' </s> ' for segment in segments))
        target_outputs.append(result['answer'])

    return Dataset.from_dict({
        'inputs': inputs,
        'target_outputs': target_outputs,
    })

In [None]:
# TF-IDF test sets in the three input configurations:
# question + passages, passages only, and question only.
test_tfidf_q_c, test_tfidf_c, test_tfidf_q = (
    create_dataset(results_tfidf, input_type=kind)
    for kind in ('question_context', 'context', 'question')
)

In [None]:
# Random-passage test sets: question + passages, and passages only.
test_random_q_c, test_random_c = (
    create_dataset(results_random, input_type=kind)
    for kind in ('question_context', 'context')
)

In [None]:
# Sanity check: first question + TF-IDF passages input, segments separated by ' </s> '.
test_tfidf_q_c[0]['inputs']

'\n7.\nKan het college de reeds bestaande zwemplekken in Amsterdam en de directe \nomgeving, die zich op fietsafstand bevinden, beter communiceren zodat mensen \nweten waar ze allemaal heen kunnen op de fiets om te zwemmen?\n </s> Zwemmen in Amsterdam - Gemeente Amsterdam Direct naar inhoud GemeenteAmsterdam Mijn Amsterdam English site Menuzoeken Onderwerpen Nieuws Contact Zoeken in Amsterdam.nl Zoek Zoek Verbergen Deze browser wordt niet meer ondersteund . Gebruik een recente versie van Edge , Chrome of Firefox . Pad tot huidige pagina Home Sport Zwemmen in Amsterdam Lijst De zwembaden in Amsterdam Grote knop De Mirandabad Zwemmen in Zuid Grote knop Zuiderbad Zwemmen in het Centrum Grote knop Noorderparkbad Zwemmen in Noord Grote knop Brediusbad Zwemmen in West ( buitenbad ) Grote knop Flevoparkbad Zwemmen in Oost ( buitenbad ) Grote knop Alle zwembaden Lijst met </s> tijd niet kan gebruiken ? Het kan natuurlijk voorkomen dat u uw fiets langere tijd niet gebruikt door een lange vakant

In [None]:
# Sanity check: first context-only input built from the random-passage results (no question prefix).
test_random_c[0]['inputs']

". Verantwoordelijkheid DB Zuidoost Het DB Zuidoost is bestuurlijk verantwoordelijk voor de kwaliteitsverbetering van het stadspark . Normaal ligt de bestuurlijke verantwoordelijkheid voor projecten groter dan € 5 miljoen bij het college van B en W . Middels het collegebesluit van 1 maart 2022 heeft het college Ingestemd met het verlenen van mandaat aan het DB van Zuidoost om als bestuurlijk opdrachtgever te blijven optreden voor het vaststellen , uitwerken en realiseren van de Visie Nelson Mandelapark , ook nu de begroting hoger is dan € 5 miljoen . ( ZD2022-002145 ) Participatie en inspraak : Het voorliggende plan is </s> en daarmee inwendige condensatie te verhinderen . De dampremmende laag dient zeer zorgvuldig en goed sluitend te worden aangebracht , vooral ook ter plaatse van balken . Als isolatiematerialen kunnen in aanmerking komen : Minerale wol PUR Polystyreen Cellulose Vlaswol Warmtereflecterende folies Wol Voor isolatiematerialen zoals wol , cellulose en vlaswol , geldt dat

In [None]:
# Sanity check: first question-only input (the question followed by a single ' </s> ').
test_tfidf_q[0]['inputs']

'\n7.\nKan het college de reeds bestaande zwemplekken in Amsterdam en de directe \nomgeving, die zich op fietsafstand bevinden, beter communiceren zodat mensen \nweten waar ze allemaal heen kunnen op de fiets om te zwemmen?\n </s> '

#### Model checkpoints

In [None]:
from transformers import AutoTokenizer

# Model identifiers handed to `pipeline(model=...)` by generate_answers.
# Keys name the input configuration (matching create_dataset's input_type),
# with a `_nonfactual` suffix for the "with-nonfactual" checkpoints.
checkpoints = dict(
    question_context="natope/mT5-tfidf-10pass-all-questions-QA-22-06-2023-without-ams",
    question='natope/mT5-tfidf-10pass-all-questions-QA-22-06-2023-without-ams-questionsonly',
    context='natope/mT5-tfidf-10pass-all-questions-QA-22-06-2023-without-ams-3epochs-contextonly',
    question_context_nonfactual='natope/mT5-tfidf-10pass-all-questions-QA-22-06-2023-without-ams-with-nonfactual',
    question_nonfactual='natope/mT5-tfidf-10pass-all-questions-QA-22-06-2023-without-ams-with-nonfactual-questionsonly-v2',
    context_nonfactual='natope/mT5-tfidf-10pass-all-questions-QA-22-06-2023-without-ams-with-nonfactual-contextonly',
)

#### Load Model

In [None]:
from transformers import pipeline

#### Make predictions

In [None]:
def generate_answers(test_set, checkpoint, device=0, no_repeat_ngram_size=2,
                     min_length=100, max_length=400):
    """Generate an answer for every input text with a text2text-generation pipeline.

    A new pipeline is created on every call, so the checkpoint is loaded each
    time this function runs.

    Args:
        test_set: Iterable of input strings; each one is passed to the pipeline
            individually.
        checkpoint: Model identifier forwarded to `pipeline(model=...)`.
        device: Device forwarded to `pipeline(device=...)`. Defaults to 0, the
            previously hard-coded value; pass -1 to run on CPU.
        no_repeat_ngram_size: Generation setting forwarded to the pipeline call.
        min_length: Generation setting forwarded to the pipeline call.
        max_length: Generation setting forwarded to the pipeline call.

    Returns:
        A list with one pipeline result per input, in input order. The rest of
        this notebook reads the text as `result[0]['generated_text']`.
    """
    generator = pipeline("text2text-generation", model=checkpoint, device=device)
    generation_kwargs = {
        'no_repeat_ngram_size': no_repeat_ngram_size,
        'min_length': min_length,
        'max_length': max_length,
    }
    return [generator(sample, **generation_kwargs) for sample in test_set]

#### Evaluation functions

In [None]:
from nltk.translate.bleu_score import SmoothingFunction
def _token_f1(prediction, reference):
    """Token-overlap F1 of two whitespace-tokenised strings (0.0 if no token is shared)."""
    prediction_tokens = prediction.split()
    reference_tokens = reference.split()
    # Multiset intersection: a repeated token only counts as often as it occurs in both.
    shared = sum((Counter(prediction_tokens) & Counter(reference_tokens)).values())
    if shared == 0:
        return 0.0
    precision = shared / len(prediction_tokens)
    recall = shared / len(reference_tokens)
    return 2 * precision * recall / (precision + recall)


def calculate_metrics_answer_similarity(generated_answers, reference_answers):
    """Score generated answers against reference answers.

    Args:
        generated_answers: Sequence of pipeline results as returned by
            `generate_answers`; the text is read from `item[0]['generated_text']`.
        reference_answers: Iterable of reference answer strings, aligned with
            `generated_answers` by position.

    Returns:
        A dict of floats with the keys 'BERT Precision', 'BERT Recall',
        'BERT F1' (BERTScore with lang='nl', averaged over samples),
        'ROUGE-1 (Average)', 'ROUGE-2 (Average)', 'ROUGE-L (Average)'
        (mean F-measure), 'BLEU Score' (corpus BLEU on whitespace tokens with
        method1 smoothing) and 'F1 Score' (mean token-overlap F1 on whitespace
        tokens).

    Raises:
        ValueError: If no answers are given or the two inputs differ in length.
    """
    generated_texts = [answer[0]['generated_text'] for answer in generated_answers]
    reference_texts = list(reference_answers)

    # Fail early with a clear message instead of a ZeroDivisionError or a
    # silently truncated zip() further down.
    if not generated_texts:
        raise ValueError("generated_answers is empty; nothing to evaluate")
    if len(generated_texts) != len(reference_texts):
        raise ValueError(
            f"got {len(generated_texts)} generated answers but "
            f"{len(reference_texts)} reference answers"
        )
    n_samples = len(generated_texts)

    # Compute BERT scores
    P, R, F1 = bert_score(generated_texts, reference_texts, lang='nl', verbose=False)

    # ROUGE scores; RougeScorer.score expects (target, prediction) in that order.
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
    rouge_scores = [
        scorer.score(reference_ans, generated_ans)
        for generated_ans, reference_ans in zip(generated_texts, reference_texts)
    ]

    # BLEU score (one reference per hypothesis)
    bleu_score = corpus_bleu(
        [[ref.split()] for ref in reference_texts],
        [gen.split() for gen in generated_texts],
        smoothing_function=SmoothingFunction().method1,
    )

    # F1 score: token overlap per answer pair. A classification F1 over whole
    # answer strings is non-zero only for exact matches, which free-text
    # answers practically never are.
    f1 = sum(
        _token_f1(generated_ans, reference_ans)
        for generated_ans, reference_ans in zip(generated_texts, reference_texts)
    ) / n_samples

    def mean_fmeasure(rouge_type):
        return sum(score[rouge_type].fmeasure for score in rouge_scores) / n_samples

    metrics = {
        "BERT Precision": P.mean().item(),
        "BERT Recall": R.mean().item(),
        "BERT F1": F1.mean().item(),
        "ROUGE-1 (Average)": mean_fmeasure('rouge1'),
        "ROUGE-2 (Average)": mean_fmeasure('rouge2'),
        "ROUGE-L (Average)": mean_fmeasure('rougeL'),
        "BLEU Score": bleu_score,
        "F1 Score": f1
    }

    return metrics

In [None]:
# Reference answers, taken from the TF-IDF question+context dataset and reused
# for every evaluation below, including the random-passage runs.
# NOTE(review): assumes results_random and results_tfidf list the same
# questions in the same order — confirm.
target_outputs = test_tfidf_q_c['target_outputs']

#### Predict

In [None]:
# TF-IDF passages, question + context input, `question_context_nonfactual` checkpoint.
# NOTE(review): the variable is suffixed `_fact` although the checkpoint key ends in `_nonfactual` — confirm naming.
tfidf_q_c_fact = generate_answers(
    test_tfidf_q_c['inputs'],
    checkpoint=checkpoints['question_context_nonfactual'],
)
calculate_metrics_answer_similarity(tfidf_q_c_fact, target_outputs)

Downloading (…)lve/main/config.json:   0%|          | 0.00/773 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]



Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'BERT Precision': 0.6640593409538269,
 'BERT Recall': 0.641151487827301,
 'BERT F1': 0.651991605758667,
 'ROUGE-1 (Average)': 0.2906073722720637,
 'ROUGE-2 (Average)': 0.06573134868120352,
 'ROUGE-L (Average)': 0.15450750750657669,
 'BLEU Score': 0.021815133338251785,
 'F1 Score': 0.0}

In [None]:
# Random passages, question + context input, `question_context_nonfactual` checkpoint.
random_q_c_fact = generate_answers(
    test_random_q_c['inputs'],
    checkpoint=checkpoints['question_context_nonfactual'],
)
calculate_metrics_answer_similarity(random_q_c_fact, target_outputs)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'BERT Precision': 0.6637589335441589,
 'BERT Recall': 0.6410458087921143,
 'BERT F1': 0.6517848968505859,
 'ROUGE-1 (Average)': 0.28668360217386407,
 'ROUGE-2 (Average)': 0.06068212258527983,
 'ROUGE-L (Average)': 0.15197663323529548,
 'BLEU Score': 0.020208294513480985,
 'F1 Score': 0.0}

In [None]:
# TF-IDF passages, context-only input, `context_nonfactual` checkpoint.
tfidf_c_fact = generate_answers(
    test_tfidf_c['inputs'],
    checkpoint=checkpoints['context_nonfactual'],
)
calculate_metrics_answer_similarity(tfidf_c_fact, target_outputs)

Downloading (…)lve/main/config.json:   0%|          | 0.00/773 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'BERT Precision': 0.6703234314918518,
 'BERT Recall': 0.6337534189224243,
 'BERT F1': 0.6511412858963013,
 'ROUGE-1 (Average)': 0.2845320604411437,
 'ROUGE-2 (Average)': 0.05585632274067817,
 'ROUGE-L (Average)': 0.1551755798192784,
 'BLEU Score': 0.014655099355456594,
 'F1 Score': 0.0}

In [None]:
# Random passages, context-only input, `context_nonfactual` checkpoint.
random_c_fact = generate_answers(
    test_random_c['inputs'],
    checkpoint=checkpoints['context_nonfactual'],
)
calculate_metrics_answer_similarity(random_c_fact, target_outputs)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'BERT Precision': 0.6602400541305542,
 'BERT Recall': 0.625282883644104,
 'BERT F1': 0.6417914628982544,
 'ROUGE-1 (Average)': 0.26484303598313175,
 'ROUGE-2 (Average)': 0.05016298599816393,
 'ROUGE-L (Average)': 0.14696994363683658,
 'BLEU Score': 0.013614277760158756,
 'F1 Score': 0.0}

In [None]:
# Question-only input (no passages), `question_nonfactual` checkpoint.
tfidf_q_fact = generate_answers(
    test_tfidf_q['inputs'],
    checkpoint=checkpoints['question_nonfactual'],
)
calculate_metrics_answer_similarity(tfidf_q_fact, target_outputs)

Downloading (…)lve/main/config.json:   0%|          | 0.00/773 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'BERT Precision': 0.6629801988601685,
 'BERT Recall': 0.6388957500457764,
 'BERT F1': 0.6501011252403259,
 'ROUGE-1 (Average)': 0.2890419038730458,
 'ROUGE-2 (Average)': 0.06134549332710925,
 'ROUGE-L (Average)': 0.15334702538475609,
 'BLEU Score': 0.01877153691222521,
 'F1 Score': 0.0}

In [None]:
# Qualitative check: generated answer for test sample 17 (TF-IDF, question + context, `question_context_nonfactual` checkpoint).
tfidf_q_c_fact[17]

[{'generated_text': 'Het college heeft een brief aan de politie over de situatie van de stadsdelen. De politie haalde de voorlichting op de bewoners van Amsterdam West. Het gaat om de aanpak van het aantal vragen die de gemeente Amsterdam hebben geïnformeerd. Deze vraag is gebaseerd op het onderzoek van nieuwe regels. In de praktijk is het college verantwoordelijk voor de veiligheid van bezoekers en de inzet van een politie. Dit is niet mogelijk om maatregelen te treffen op diverse gebieden. Daarnaast is er geen sprake van aanleiding om een voorlichter te maken. Er is geen aandacht voor het aanpakken van deze problemen. Op dit moment is de uitwerking van dit onderzoek. Een gesprek is gestart met de Amsterdamse politie en het bestuur van stadsdeel Zuidoost. Hierbij is een aanvang van 1 januari 2020 een onderzoek gedaan naar de toekomst van die onderwerpen. Bij de onderzoeken van politie is in de stad georganiseerd. Voor de oplichting vanuit de burgemeester van Weesp is dit gericht op ee

In [None]:
# Reference answer for the same test sample (index 17), for comparison with the generated text.
target_outputs[17]

' \nIn de preventieve aanpak van babbeltrucs werkt de politie intensief samen met de \ngemeente. Op meerdere momenten per jaar organiseren politie en stadsdelen \nbijeenkomsten die specifiek gericht zijn op de veiligheid van senioren. Het geven \nvan voorlichting en versterken van de weerbaarheid zijn twee essentiële \nonderdelen van deze bijeenkomsten. Door gebruik te maken van acteurs in \nrollenspellen leren senioren diverse babbeltrucsituaties herkennen. Aansluitend \ngaan de acteurs in op wat de doelgroep zelf kan doen om te voorkomen dat zij \nslachtoffer worden. Hiernaast zet de politie samen met het Centrum voor \nCriminaliteitspreventie en Veiligheid sinds kort een nieuwe techniek in. Met hulp \nvan een 360° video ervaren senioren via een Virtual Reality–bril tijdens de training \nhoe een oplichter te werk gaat. Met deze techniek plaatst de kijker zich bij het \nslachtoffer én de dader in dezelfde virtuele ruimte. De training biedt \nhandelingsperspectieven, waardoor direct du

In [None]:
# TF-IDF passages, question + context input, `question_context` checkpoint.
tfidf_q_c_fact2 = generate_answers(
    test_tfidf_q_c['inputs'],
    checkpoint=checkpoints['question_context'],
)
calculate_metrics_answer_similarity(tfidf_q_c_fact2, target_outputs)

Downloading (…)lve/main/config.json:   0%|          | 0.00/773 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'BERT Precision': 0.6715618371963501,
 'BERT Recall': 0.6428503394126892,
 'BERT F1': 0.6564623117446899,
 'ROUGE-1 (Average)': 0.29625642677361935,
 'ROUGE-2 (Average)': 0.06313431015033676,
 'ROUGE-L (Average)': 0.15774729214677194,
 'BLEU Score': 0.01997264991713835,
 'F1 Score': 0.0}

In [None]:
# Random passages, question + context input, `question_context` checkpoint.
random_q_c_fact2 = generate_answers(
    test_random_q_c['inputs'],
    checkpoint=checkpoints['question_context'],
)
calculate_metrics_answer_similarity(random_q_c_fact2, target_outputs)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'BERT Precision': 0.6642659902572632,
 'BERT Recall': 0.6398227214813232,
 'BERT F1': 0.6513878107070923,
 'ROUGE-1 (Average)': 0.28972652927395576,
 'ROUGE-2 (Average)': 0.05794993488191634,
 'ROUGE-L (Average)': 0.15147591995557957,
 'BLEU Score': 0.017369422329260288,
 'F1 Score': 0.0}

In [None]:
# TF-IDF passages, context-only input, `context` checkpoint.
tfidf_c_fact2 = generate_answers(
    test_tfidf_c['inputs'],
    checkpoint=checkpoints['context'],
)
calculate_metrics_answer_similarity(tfidf_c_fact2, target_outputs)

Downloading (…)lve/main/config.json:   0%|          | 0.00/773 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'BERT Precision': 0.6562179923057556,
 'BERT Recall': 0.6324306726455688,
 'BERT F1': 0.6437315344810486,
 'ROUGE-1 (Average)': 0.2749108527904267,
 'ROUGE-2 (Average)': 0.05380277757496261,
 'ROUGE-L (Average)': 0.14649624902366226,
 'BLEU Score': 0.01505004676660441,
 'F1 Score': 0.0}

In [None]:
# Random passages, context-only input, `context` checkpoint.
random_c_fact2 = generate_answers(
    test_random_c['inputs'],
    checkpoint=checkpoints['context'],
)
calculate_metrics_answer_similarity(random_c_fact2, target_outputs)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'BERT Precision': 0.6452605724334717,
 'BERT Recall': 0.6250851154327393,
 'BERT F1': 0.6346668601036072,
 'ROUGE-1 (Average)': 0.2608233448303723,
 'ROUGE-2 (Average)': 0.0515481392381462,
 'ROUGE-L (Average)': 0.14533837168058156,
 'BLEU Score': 0.010213070321652801,
 'F1 Score': 0.0}

In [None]:
# Question-only input (no passages), `question` checkpoint.
tfidf_q_fact2 = generate_answers(
    test_tfidf_q['inputs'],
    checkpoint=checkpoints['question'],
)
calculate_metrics_answer_similarity(tfidf_q_fact2, target_outputs)

Downloading (…)lve/main/config.json:   0%|          | 0.00/773 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'BERT Precision': 0.668830931186676,
 'BERT Recall': 0.6317691206932068,
 'BERT F1': 0.6492584943771362,
 'ROUGE-1 (Average)': 0.283587279258385,
 'ROUGE-2 (Average)': 0.06037512404630682,
 'ROUGE-L (Average)': 0.15789178263625542,
 'BLEU Score': 0.015996125789406457,
 'F1 Score': 0.0}

In [None]:
# Sample 17 again (TF-IDF, question + context, `question_context_nonfactual` checkpoint), shown next to the other runs below.
tfidf_q_c_fact[17]

[{'generated_text': 'Het college heeft een brief aan de politie over de situatie van de stadsdelen. De politie haalde de voorlichting op de bewoners van Amsterdam West. Het gaat om de aanpak van het aantal vragen die de gemeente Amsterdam hebben geïnformeerd. Deze vraag is gebaseerd op het onderzoek van nieuwe regels. In de praktijk is het college verantwoordelijk voor de veiligheid van bezoekers en de inzet van een politie. Dit is niet mogelijk om maatregelen te treffen op diverse gebieden. Daarnaast is er geen sprake van aanleiding om een voorlichter te maken. Er is geen aandacht voor het aanpakken van deze problemen. Op dit moment is de uitwerking van dit onderzoek. Een gesprek is gestart met de Amsterdamse politie en het bestuur van stadsdeel Zuidoost. Hierbij is een aanvang van 1 januari 2020 een onderzoek gedaan naar de toekomst van die onderwerpen. Bij de onderzoeken van politie is in de stad georganiseerd. Voor de oplichting vanuit de burgemeester van Weesp is dit gericht op ee

In [None]:
# Sample 17: TF-IDF, question + context, `question_context` checkpoint.
tfidf_q_c_fact2[17]

[{'generated_text': 'Het college heeft een vraag over de situatie van de politie. De politie haalde de voorlichting op de bewoners van Amsterdam West. Deze vragen zijn gericht op het onderwerp van het onderzoek. Het onderzoek is echter niet mogelijk om de aanpak van een preventieve voorlichten te krijgen. In de gemeente Amsterdam heerst een onderzoek naar de veiligheid van medewerkers van stadsdeel Weesp. Dit is niet een aanleiding om maatregelen te nemen. Daarnaast is er een voorgelegd aan de wethouder van politie en de stadsdelen. Er is geen sprake van aandacht voor de preventie van babbeltrucs. Op dit moment is het college een belangrijke taak om te bepalen hoe de inzet van bezoekers en ondernemers die zich hebben geïnformeerd. Hierdoor is de uitwerking van deze gesprekken met de burgemeester van Weezep en het stadsgebied. Voor de Amsterdamse politie is een beantwoording van dit onderzoek te doen. Bij de onderzoeken vanuit de raad van bestuurscommissie van De gemeente gaat het voork

In [None]:
# Sample 17: random passages, question + context, `question_context_nonfactual` checkpoint.
random_q_c_fact[17]

[{'generated_text': 'Het college heeft zich op de hoogte van de aanpak van het aantal maatregelen die de politie hebben geïnformeerd. De politie haalde de inzet van een nieuwe locatie. Deze situatie is echter niet bekend. Het gaat om de uitwerking van deze campagne. In de gemeente hullen de stadsdelen altijd mogelijke activiteiten in de stad. Daarnaast is er geen sprake van problemen met de bewoners die tijdens de zomervakantie moeten worden gestart. Dit gebeurt in het stadsdeel van Amsterdam. Er is geen aanleiding om vragen te stellen over de veiligheid van stadsgebieden. Hierbij is het college verantwoordelijk voor de ontwikkeling van bezoekers en jongeren die in Amsterdam werken. Op dit moment is de Amsterdamse politie een aandacht voor het aanpakken van babbeltrucs. Voor de huidige samenwerking met ondernemers die een belangrijke bijdrage aan de preventieve voorlichting op het betreffende stadsgebied. Bij de toekomstige stadsontwikkeling is een uitgangspunt voor verbetering van vei

In [None]:
# Sample 17: TF-IDF passages, context-only input, `context` checkpoint.
tfidf_c_fact2[17]

[{'generated_text': 'Het college heeft een aantal vragen over de vraag van de gemeente. De gemeente huldigt de beantwoording van het antwoord op de aanpak van een bevraagd onderwerp. Deze brief is gemaakt op het college. Het gaat om de uitwerking van deze bijeenkomst. In de voorlichtingsbijeenkomst is een aanleiding om een uitgangspunt te hebben om het stadsdeel te bepalen van nieuwe situaties te kunnen stellen. Dit is niet mogelijk om zich te verplaatsen. Daarnaast is het aandacht voor de inzet van bewoners die tijdens de stadsdelen te laten gaan voor het bekijken van dit evenement. Er is geen sprake van aanpassing van inzicht op een invloed van oplichting. Op dit moment is er geen aanwijzing dat de politie moet werken aan de oplichters. Hierbij is de afweging met de burgemeester van Amsterdam. Bij de toekomst van stadsgebied is in de stad altijd een belangrijke gebeurtenis vanuit de Amsterdamse locaties van die stadsregio komt. Voor de huidige vergadering van gemeente Amsterdam is da

In [None]:
# Sample 17: random passages, context-only input, `context` checkpoint.
random_c_fact2[17]

[{'generated_text': 'Het college heeft zich op de hoogte van de aanpak van het bestemmingsplan. De plannen zijn gericht op het bepalen van een vergunningsbeleid. Deze maatregelen zijn in de gemeente echter niet nodig. Het besluit is dat de beantwoording van deze vragen moeten worden geïnformeerd. In de periode van oktober 2019 is de uitwerking van dit voorstel voor de inzet van bewoners en de stadsdelen. Daarnaast is het college verantwoordelijk voor het opstellen van aanpassing van inzicht op vraag 1 en 2. '}]

In [None]:
# Sample 20: random passages, context-only input, `context` checkpoint.
random_c_fact2[20]

[{'generated_text': 'Het college heeft een aanvraag voor de vergunning voor het bouwen van een vve-gebouw. De subsidie voor verhuur van de vVE-locatie is echter niet nodig. Deze regeling is niet in werking gehouden. Het besluit is dat de gemeente zich moet houden op de aanpak van het college. In de beantwoording van deze vraag is in de uitwerking van nieuwe projecten. Dit gebeurt in het onderzoek vanuit de wettelijke maatregelen die de inzet van dit voorstel hebben geïnformeerd. Daarnaast is de opvang van bewoners van Amsterdam een aanvangsvergunning vanaf de voorjaarsvergunning. Er is geen antwoord op het volgende vragen. Op basis van dat het inzicht van alle plannen van gemeente Amsterdam is een invloed op verbetering van erfgoedwaarden vanwege de verduurzaming en de afloop van die werkzaamheden zijn gevolgd. Voor de toekomst van inwoner van Nederland is het aandacht voor een bestaande situatie. Hierbij is er een uitspraak van aanleiding van onderzoek naar de ontwikkeling van onderde