In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import json


def write_json_to_file(json_object, json_file, mode='w', encoding='utf-8'):
    with open(json_file, mode, encoding=encoding) as outfile:
        json.dump(json_object, outfile, indent=4, sort_keys=True, ensure_ascii=False)


def get_file_contents(filename, encoding='utf-8'):
    with open(filename, encoding=encoding) as f:
        content = f.read()
    return content


def read_json(filename, encoding='utf-8'):
    contents = get_file_contents(filename, encoding=encoding)
    return json.loads(contents)


def get_file_contents_as_list(file_path, encoding='utf-8', ignore_blanks=True):
    contents = get_file_contents(file_path, encoding=encoding)
    lines = contents.split('\n')
    lines = [line for line in lines if line != ''] if ignore_blanks else lines
    return lines

In [3]:
# Key for wikipedia eval is question-id. Key for web eval is the (question_id, filename) tuple
def get_key_to_ground_truth(data):
    if data['Domain'] == 'Wikipedia':
        return {datum['QuestionId']: datum['Answer'] for datum in data['Data']}
    else:
        return get_qd_to_answer(data)


def get_question_doc_string(qid, doc_name):
    return '{}--{}'.format(qid, doc_name)

def get_qd_to_answer(data):
    key_to_answer = {}
    for datum in data['Data']:
        for page in datum.get('EntityPages', []) + datum.get('SearchResults', []):
            qd_tuple = get_question_doc_string(datum['QuestionId'], page['Filename'])
            key_to_answer[qd_tuple] = datum['Answer']
    return key_to_answer
#this is for getting the answers out of the dataset, but i can do that differently

def read_clean_part(datum):
    for key in ['EntityPages', 'SearchResults']:
        new_page_list = []
        for page in datum.get(key, []):
            if page['DocPartOfVerifiedEval']:
                new_page_list.append(page)
        datum[key] = new_page_list
    assert len(datum['EntityPages']) + len(datum['SearchResults']) > 0
    return datum


def read_triviaqa_data(qajson):
    data = read_json(qajson)
    # read only documents and questions that are a part of clean data set
    if data['VerifiedEval']:
        clean_data = []
        for datum in data['Data']:
            if datum['QuestionPartOfVerifiedEval']:
                if data['Domain'] == 'Web':
                    datum = read_clean_part(datum)
                clean_data.append(datum)
        data['Data'] = clean_data
    return data


def answer_index_in_document(answer, document):
    answer_list = answer['NormalizedAliases']
    for answer_string_in_doc in answer_list:
        index = document.lower().find(answer_string_in_doc)
        if index != -1:
            return answer_string_in_doc, index
    return answer['NormalizedValue'], -1

In [4]:
# -*- coding: utf-8 -*-
""" Official evaluation script for v1.0 of the TriviaQA dataset.
Extended from the evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import sys
import argparse


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def handle_punc(text):
        exclude = set(string.punctuation + "".join([u"‘", u"’", u"´", u"`"]))
        return ''.join(ch if ch not in exclude else ' ' for ch in text)

    def lower(text):
        return text.lower()

    def replace_underscore(text):
        return text.replace('_', ' ')

    return white_space_fix(remove_articles(handle_punc(lower(replace_underscore(s))))).strip()


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def is_exact_match(answer_object, prediction):
    ground_truths = get_ground_truths(answer_object)
    for ground_truth in ground_truths:
        if exact_match_score(prediction, ground_truth):
            return True
    return False


def has_exact_match(ground_truths, candidates):
    for ground_truth in ground_truths:
        if ground_truth in candidates:
            return True
    return False


def get_ground_truths(answer):
    return answer['normalized_aliases'] + [normalize_answer(ans) for ans in answer.get('HumanAnswers', [])]


def get_oracle_score(ground_truth, predicted_answers, qid_list=None, mute=False):
    exact_match = common = 0
    if qid_list is None:
        qid_list = ground_truth.keys()
    for qid in qid_list:
        if qid not in predicted_answers:
            if not mute:
                message = 'Irrelavant question {} will receive score 0.'.format(qid)
                print(message, file=sys.stderr)
            continue
        common += 1
        prediction = normalize_answer(predicted_answers[qid])
        ground_truths = get_ground_truths(ground_truth[qid])
        em_for_this_question = has_exact_match(ground_truths, prediction)
        exact_match += int(em_for_this_question)

    exact_match = 100.0 * exact_match / len(qid_list)

    return {'oracle_exact_match': exact_match, 'common': common, 'denominator': len(qid_list),
            'pred_len': len(predicted_answers), 'gold_len': len(ground_truth)}


def evaluate_triviaqa(ground_truth, predicted_answers, qid_list=None, mute=False):
    f1 = exact_match = common = 0
    if qid_list is None:
        qid_list = ground_truth.keys()
    for qid in qid_list:
        if qid not in predicted_answers:
            if not mute:
                message = 'Missed question {} will receive score 0.'.format(qid)
                print(message, file=sys.stderr)
            continue
        if qid not in ground_truth:
            if not mute:
                message = 'Irrelavant question {} will receive score 0.'.format(qid)
                print(message, file=sys.stderr)
            continue
        common += 1
        prediction = predicted_answers[qid]
        ground_truths = get_ground_truths(ground_truth[qid])
        em_for_this_question = metric_max_over_ground_truths(
            exact_match_score, prediction, ground_truths)
        if em_for_this_question == 0 and not mute:
            print("em=0:", prediction, ground_truths)
        exact_match += em_for_this_question
        f1_for_this_question = metric_max_over_ground_truths(
            f1_score, prediction, ground_truths)
        f1 += f1_for_this_question

    exact_match = 100.0 * exact_match / len(qid_list)
    f1 = 100.0 * f1 / len(qid_list)

    return {'exact_match': exact_match, 'f1': f1, 'common': common, 'denominator': len(qid_list),
            'pred_len': len(predicted_answers), 'gold_len': len(ground_truth)}


def get_args():
    parser = argparse.ArgumentParser(
        description='Evaluation for TriviaQA {}'.format(expected_version))
    parser.add_argument('--dataset_file', help='Dataset file')
    parser.add_argument('--prediction_file', help='Prediction File')
    args = parser.parse_args()
    return args

Load dataset

In [12]:
from datasets import load_dataset
dataset = load_dataset('trivia_qa', 'rc.wikipedia', split='validation')

contexts = [entity_page['wiki_context'] for entity_page in dataset['entity_pages']]
ground_truth = {datum['question_id']: datum['answer'] for datum in dataset}

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Prepare answerset

In [None]:
predictions1 = read_json('resultsDPR1.json')
predictions2 = read_json('resultsDPR2.json')
pred4 = read_json('resultsDPR4.json')
pred5 = read_json('resultsDPR5.json')
predictions4 = [entry['answer'] for entry in pred4]
predictions5 = [entry['answer'] for entry in pred5]

FileNotFoundError: [Errno 2] No such file or directory: 'resultsDPR1.json'

In [None]:
keys = []
for key in ground_truth.keys():
  keys.append(key)

predictionary1 = dict(zip(keys, predictions1))
predictionary2 = dict(zip(keys, predictions2))
predictionary4 = dict(zip(keys, predictions4))
predictionary5 = dict(zip(keys, predictions5))
predictionarydimi = dict(zip(keys, resultsnewdimi))

In [None]:
keys = []
for key in ground_truth.keys():
  keys.append(key)

predictionarydimi = dict(zip(keys, resultsnewdimi))
resultsdimi = evaluate_triviaqa(ground_truth, predictionarydimi)

em=0: James Gordon Brown ['henry campbell bannerman', 'sir henry campbell bannerman', 'campbell bannerman']
em=0: Cancer Research Fund ['aids related cancer', 'sporadic cancer', 'cancer disease', 'malignant tumors', 'cancers', 'carcinophobia', 'cancer', 'cancer diagnosis', 'malignant neoplastic disease', 'malignant neoplasm', 'tumour virus', 'cancer medicine', 'deaths by cancer', 'malignant tumour', 'epithelial cancers', 'solid cancer', 'cancerous', 'borderline cancer', 'invasive cancer', 'anti cancer', 'cancer pathology', 'cancer signs', 'cancer aromatase', 'cancer therapy', 'financial toxicity', 'cancerophobia', 'cancer en cuirasse', 'cancer patient', 'cancerous tumor', 'malignant cancer', 'malignant neoplasms', 'tumor medication', 'signs of cancer', 'malignacy', 'malignant tumor', 'cancer medication', 'microtumor', 'malignancies', 'malignant lesion', 'malignant growth']
em=0: Hepburn ['lauren becall', 'loren bacall', 'lauren becal', 'lauren bacall', 'betty j perske', 'betty perske',

In [None]:
resultsDistillk5 = evaluate_triviaqa(ground_truth, predictionary1)
resultsDistillk1 = evaluate_triviaqa(ground_truth, predictionary2)
resultsRoberta5 = evaluate_triviaqa(ground_truth, predictionary4)
resultsRoberta1 = evaluate_triviaqa(ground_truth, predictionary5)
resultsdimi = evaluate_triviaqa(ground_truth, predictionarydimi)

NameError: name 'predictionary1' is not defined

Pinecone load packages

In [6]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0.

Pinecoding create embeddings with model above (one time)



In [None]:
def encode_function(data):
    encodings = []
    for page in data['entity_pages']:  # Iterate through individual examples
        wiki_contexts = page['wiki_context']
        if not isinstance(wiki_contexts, list):
            wiki_contexts = [wiki_contexts]
        context = " ".join(wiki_contexts)
        encodings.append(model.encode(context))
    return {'encoding': encodings}

dataset = dataset.map(encode_function, batched=True, batch_size=4)


Map:   0%|          | 0/7993 [00:00<?, ? examples/s]

In [9]:
# initialize connection to the new index
index = pc.Index('wiki-validation-minilm')

from tqdm.auto import tqdm  # progress bar

upserts = []
for v in dataset:
    # Truncate the wiki_context to a maximum of 5000 characters
    truncated_context = str(v['entity_pages']['wiki_context'])[:5000]
    upserts.append((v['question_id'], [float(x) for x in v['encoding']], {'bytes': truncated_context}))

# now upsert in chunks
for i in tqdm(range(0, len(upserts), 5)):
    i_end = i + 5
    if i_end > len(upserts): i_end = len(upserts)
    index.upsert(vectors=upserts[i:i_end])

KeyError: 'encoding'

In [None]:
spec = ServerlessSpec(
    cloud="aws",
    region="us-east-1",
  )

# check if index already exists, if not we create it
if 'wiki-validation-minilm' not in pc.list_indexes():
    pc.create_index(
        name='wiki-validation-minilm', dimension=model.get_sentence_embedding_dimension(), metric='cosine',spec=spec
    )

# we use this to get required index dims
model.get_sentence_embedding_dimension()

384

Pinecone load embeddings

In [8]:
from pinecone import Pinecone, ServerlessSpec
from google.colab import userdata

API_KEY = "pcsk_3zVMgZ_Fj5RX3UmosrRwTM5z5Fq4k2VbgBGYYqpYdivwmWR6bXfqjToAGcmX4wVp5xriVz"

pc = Pinecone(
    api_key=API_KEY
)


Pinecone reader and storing

In [11]:
from transformers import pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [45]:
question = dataset['question']

In [47]:
#question = ["Who is the king of netherlands", "Who is the king of England"]
index = pc.Index('wiki-validation-minilm')

results = []
n = len(question[0:10])
for q in question[0:10]:
  print(n)
  n = n-1
  #print(q)
  xq = model.encode([q]).tolist()
  xc = index.query(vector=xq, top_k=2, include_metadata=True)
  docs = []
  for x in range(len(xc["matches"])):
    con = xc["matches"][x]["metadata"]["bytes"]#["metadata"]
    docs.append(con)
  context = ''.join(docs)
  result = qa_pipeline(question=q, context=context)
  #print(f"Answer: {result['answer']}")
  results.append(result['answer'])

10
9
8
7
6
5
4
3
2
1


In [35]:
results

['James Gordon Brown',
 'Exile',
 'Cancer Research Fund',
 'Hepburn',
 'Bad',
 'Kilimanjaro',
 'white',
 'Kerma',
 'Oliver!',
 'Cologne']

In [None]:
resultsfirst500 = results

import json

filename = "first500."

write_json_to_file(resultsfirst500, filename)


Evaluation (for now with just 100 questions)

In [46]:
subset = dataset.select(range(0,100))
ground_truth = {datum['question_id']: datum['answer'] for datum in subset}

In [38]:
keys = []
for key in ground_truth.keys():
  keys.append(key)

predictionary = dict(zip(keys, results))
accuracy = evaluate_triviaqa(ground_truth, predictionary)

em=0: James Gordon Brown ['henry campbell bannerman', 'sir henry campbell bannerman', 'campbell bannerman']
em=0: Cancer Research Fund ['aids related cancer', 'sporadic cancer', 'cancer disease', 'malignant tumors', 'cancers', 'carcinophobia', 'cancer', 'cancer diagnosis', 'malignant neoplastic disease', 'malignant neoplasm', 'tumour virus', 'cancer medicine', 'deaths by cancer', 'malignant tumour', 'epithelial cancers', 'solid cancer', 'cancerous', 'borderline cancer', 'invasive cancer', 'anti cancer', 'cancer pathology', 'cancer signs', 'cancer aromatase', 'cancer therapy', 'financial toxicity', 'cancerophobia', 'cancer en cuirasse', 'cancer patient', 'cancerous tumor', 'malignant cancer', 'malignant neoplasms', 'tumor medication', 'signs of cancer', 'malignacy', 'malignant tumor', 'cancer medication', 'microtumor', 'malignancies', 'malignant lesion', 'malignant growth']
em=0: Hepburn ['lauren becall', 'loren bacall', 'lauren becal', 'lauren bacall', 'betty j perske', 'betty perske',

In [39]:
accuracy

{'exact_match': 22.22222222222222,
 'f1': 27.77777777777778,
 'common': 9,
 'denominator': 9,
 'pred_len': 9,
 'gold_len': 9}