In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
import json


def write_json_to_file(json_object, json_file, mode='w', encoding='utf-8'):
    with open(json_file, mode, encoding=encoding) as outfile:
        json.dump(json_object, outfile, indent=4, sort_keys=True, ensure_ascii=False)


def get_file_contents(filename, encoding='utf-8'):
    with open(filename, encoding=encoding) as f:
        content = f.read()
    return content


def read_json(filename, encoding='utf-8'):
    contents = get_file_contents(filename, encoding=encoding)
    return json.loads(contents)


def get_file_contents_as_list(file_path, encoding='utf-8', ignore_blanks=True):
    contents = get_file_contents(file_path, encoding=encoding)
    lines = contents.split('\n')
    lines = [line for line in lines if line != ''] if ignore_blanks else lines
    return lines

In [None]:
# Key for wikipedia eval is question-id. Key for web eval is the (question_id, filename) tuple
def get_key_to_ground_truth(data):
    if data['Domain'] == 'Wikipedia':
        return {datum['QuestionId']: datum['Answer'] for datum in data['Data']}
    else:
        return get_qd_to_answer(data)


def get_question_doc_string(qid, doc_name):
    return '{}--{}'.format(qid, doc_name)

def get_qd_to_answer(data):
    key_to_answer = {}
    for datum in data['Data']:
        for page in datum.get('EntityPages', []) + datum.get('SearchResults', []):
            qd_tuple = get_question_doc_string(datum['QuestionId'], page['Filename'])
            key_to_answer[qd_tuple] = datum['Answer']
    return key_to_answer
#this is for getting the answers out of the dataset, but i can do that differently

def read_clean_part(datum):
    for key in ['EntityPages', 'SearchResults']:
        new_page_list = []
        for page in datum.get(key, []):
            if page['DocPartOfVerifiedEval']:
                new_page_list.append(page)
        datum[key] = new_page_list
    assert len(datum['EntityPages']) + len(datum['SearchResults']) > 0
    return datum


def read_triviaqa_data(qajson):
    data = read_json(qajson)
    # read only documents and questions that are a part of clean data set
    if data['VerifiedEval']:
        clean_data = []
        for datum in data['Data']:
            if datum['QuestionPartOfVerifiedEval']:
                if data['Domain'] == 'Web':
                    datum = read_clean_part(datum)
                clean_data.append(datum)
        data['Data'] = clean_data
    return data


def answer_index_in_document(answer, document):
    answer_list = answer['NormalizedAliases']
    for answer_string_in_doc in answer_list:
        index = document.lower().find(answer_string_in_doc)
        if index != -1:
            return answer_string_in_doc, index
    return answer['NormalizedValue'], -1

In [None]:
# -*- coding: utf-8 -*-
""" Official evaluation script for v1.0 of the TriviaQA dataset.
Extended from the evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import sys
import argparse


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def handle_punc(text):
        exclude = set(string.punctuation + "".join([u"‘", u"’", u"´", u"`"]))
        return ''.join(ch if ch not in exclude else ' ' for ch in text)

    def lower(text):
        return text.lower()

    def replace_underscore(text):
        return text.replace('_', ' ')

    return white_space_fix(remove_articles(handle_punc(lower(replace_underscore(s))))).strip()


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def is_exact_match(answer_object, prediction):
    ground_truths = get_ground_truths(answer_object)
    for ground_truth in ground_truths:
        if exact_match_score(prediction, ground_truth):
            return True
    return False


def has_exact_match(ground_truths, candidates):
    for ground_truth in ground_truths:
        if ground_truth in candidates:
            return True
    return False


def get_ground_truths(answer):
    return answer['normalized_aliases'] + [normalize_answer(ans) for ans in answer.get('HumanAnswers', [])]


def get_oracle_score(ground_truth, predicted_answers, qid_list=None, mute=False):
    exact_match = common = 0
    if qid_list is None:
        qid_list = ground_truth.keys()
    for qid in qid_list:
        if qid not in predicted_answers:
            if not mute:
                message = 'Irrelavant question {} will receive score 0.'.format(qid)
                print(message, file=sys.stderr)
            continue
        common += 1
        prediction = normalize_answer(predicted_answers[qid])
        ground_truths = get_ground_truths(ground_truth[qid])
        em_for_this_question = has_exact_match(ground_truths, prediction)
        exact_match += int(em_for_this_question)

    exact_match = 100.0 * exact_match / len(qid_list)

    return {'oracle_exact_match': exact_match, 'common': common, 'denominator': len(qid_list),
            'pred_len': len(predicted_answers), 'gold_len': len(ground_truth)}


def evaluate_triviaqa(ground_truth, predicted_answers, qid_list=None, mute=False):
    f1 = exact_match = common = 0
    if qid_list is None:
        qid_list = ground_truth.keys()
    for qid in qid_list:
        if qid not in predicted_answers:
            if not mute:
                message = 'Missed question {} will receive score 0.'.format(qid)
                print(message, file=sys.stderr)
            continue
        if qid not in ground_truth:
            if not mute:
                message = 'Irrelavant question {} will receive score 0.'.format(qid)
                print(message, file=sys.stderr)
            continue
        common += 1
        prediction = predicted_answers[qid]
        ground_truths = get_ground_truths(ground_truth[qid])
        em_for_this_question = metric_max_over_ground_truths(
            exact_match_score, prediction, ground_truths)
        if em_for_this_question == 0 and not mute:
            print("em=0:", prediction, ground_truths)
        exact_match += em_for_this_question
        f1_for_this_question = metric_max_over_ground_truths(
            f1_score, prediction, ground_truths)
        f1 += f1_for_this_question

    exact_match = 100.0 * exact_match / len(qid_list)
    f1 = 100.0 * f1 / len(qid_list)

    return {'exact_match': exact_match, 'f1': f1, 'common': common, 'denominator': len(qid_list),
            'pred_len': len(predicted_answers), 'gold_len': len(ground_truth)}


def get_args():
    parser = argparse.ArgumentParser(
        description='Evaluation for TriviaQA {}'.format(expected_version))
    parser.add_argument('--dataset_file', help='Dataset file')
    parser.add_argument('--prediction_file', help='Prediction File')
    args = parser.parse_args()
    return args

In [None]:
from datasets import load_dataset
dataset = load_dataset('trivia_qa', 'rc.wikipedia', split='validation')

subsample = dataset.select(range(1,100))
question = subsample['question']

contexts = [entity_page['wiki_context'] for entity_page in subsample['entity_pages']]
ground_truth = {datum['question_id']: datum['answer'] for datum in subsample}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/26.7k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

train-00000-of-00007.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

train-00001-of-00007.parquet:   0%|          | 0.00/261M [00:00<?, ?B/s]

train-00002-of-00007.parquet:   0%|          | 0.00/319M [00:00<?, ?B/s]

train-00003-of-00007.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00004-of-00007.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

train-00005-of-00007.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

train-00006-of-00007.parquet:   0%|          | 0.00/253M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/61888 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7993 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7701 [00:00<?, ? examples/s]

In [None]:
predictions1 = read_json('resultsDPR1.json')
predictions2 = read_json('resultsDPR2.json')
pred4 = read_json('resultsDPR4.json')
pred5 = read_json('resultsDPR5.json')
predictions4 = [entry['answer'] for entry in pred4]
predictions5 = [entry['answer'] for entry in pred5]

FileNotFoundError: [Errno 2] No such file or directory: 'resultsDPR1.json'

In [None]:
keys = []
for key in ground_truth.keys():
  keys.append(key)

predictionary1 = dict(zip(keys, predictions1))
predictionary2 = dict(zip(keys, predictions2))
predictionary4 = dict(zip(keys, predictions4))
predictionary5 = dict(zip(keys, predictions5))
predictionarydimi = dict(zip(keys, resultsnewdimi))

In [None]:
keys = []
for key in ground_truth.keys():
  keys.append(key)

predictionarydimi = dict(zip(keys, resultsnewdimi))
resultsdimi = evaluate_triviaqa(ground_truth, predictionarydimi)

em=0: James Gordon Brown ['henry campbell bannerman', 'sir henry campbell bannerman', 'campbell bannerman']
em=0: Cancer Research Fund ['aids related cancer', 'sporadic cancer', 'cancer disease', 'malignant tumors', 'cancers', 'carcinophobia', 'cancer', 'cancer diagnosis', 'malignant neoplastic disease', 'malignant neoplasm', 'tumour virus', 'cancer medicine', 'deaths by cancer', 'malignant tumour', 'epithelial cancers', 'solid cancer', 'cancerous', 'borderline cancer', 'invasive cancer', 'anti cancer', 'cancer pathology', 'cancer signs', 'cancer aromatase', 'cancer therapy', 'financial toxicity', 'cancerophobia', 'cancer en cuirasse', 'cancer patient', 'cancerous tumor', 'malignant cancer', 'malignant neoplasms', 'tumor medication', 'signs of cancer', 'malignacy', 'malignant tumor', 'cancer medication', 'microtumor', 'malignancies', 'malignant lesion', 'malignant growth']
em=0: Hepburn ['lauren becall', 'loren bacall', 'lauren becal', 'lauren bacall', 'betty j perske', 'betty perske',

In [None]:
resultsdimi

{'exact_match': 27.272727272727273,
 'f1': 32.02797202797203,
 'common': 99,
 'denominator': 99,
 'pred_len': 99,
 'gold_len': 99}

In [None]:
resultsDistillk5 = evaluate_triviaqa(ground_truth, predictionary1)
resultsDistillk1 = evaluate_triviaqa(ground_truth, predictionary2)
resultsRoberta5 = evaluate_triviaqa(ground_truth, predictionary4)
resultsRoberta1 = evaluate_triviaqa(ground_truth, predictionary5)
resultsdimi = evaluate_triviaqa(ground_truth, predictionarydimi)

NameError: name 'predictionary1' is not defined

In [None]:
resultsL = evaluate_triviaqa(ground_truth, resultsnewdimi

In [None]:
print(resultsDistillk5)
print(resultsDistillk1)
print(resultsRoberta5)
print(resultsRoberta1)
print(resultsL)

{'exact_match': 16.161616161616163, 'f1': 20.151515151515152, 'common': 99, 'denominator': 99, 'pred_len': 99, 'gold_len': 99}
{'exact_match': 12.121212121212121, 'f1': 15.490766248342004, 'common': 99, 'denominator': 99, 'pred_len': 99, 'gold_len': 99}
{'exact_match': 22.22222222222222, 'f1': 27.28049728049728, 'common': 99, 'denominator': 99, 'pred_len': 99, 'gold_len': 99}
{'exact_match': 12.121212121212121, 'f1': 17.104522862098623, 'common': 99, 'denominator': 99, 'pred_len': 99, 'gold_len': 99}
{'exact_match': 7.070707070707071, 'f1': 7.97979797979798, 'common': 99, 'denominator': 99, 'pred_len': 99, 'gold_len': 99}


Pinecone method

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def encode_function(data):
    encodings = []
    for page in data['entity_pages']:  # Iterate through individual examples
        wiki_contexts = page['wiki_context']
        if not isinstance(wiki_contexts, list):
            wiki_contexts = [wiki_contexts]
        context = " ".join(wiki_contexts)
        encodings.append(model.encode(context))
    return {'encoding': encodings}

dataset = dataset.map(encode_function, batched=True, batch_size=4)


Map:   0%|          | 0/7993 [00:00<?, ? examples/s]

In [None]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0.

In [None]:
from pinecone import Pinecone, ServerlessSpec
from google.colab import userdata

API_KEY = "pcsk_3zVMgZ_Fj5RX3UmosrRwTM5z5Fq4k2VbgBGYYqpYdivwmWR6bXfqjToAGcmX4wVp5xriVz"

pc = Pinecone(
    api_key=API_KEY
)


In [None]:
spec = ServerlessSpec(
    cloud="aws",
    region="us-east-1",
  )

# check if index already exists, if not we create it
if 'wiki-validation-minilm' not in pc.list_indexes():
    pc.create_index(
        name='wiki-validation-minilm', dimension=model.get_sentence_embedding_dimension(), metric='cosine',spec=spec
    )

# we use this to get required index dims
model.get_sentence_embedding_dimension()

384

In [None]:
# initialize connection to the new index
index = pc.Index('wiki-validation-minilm')

from tqdm.auto import tqdm  # progress bar

upserts = []
for v in dataset:
    # Truncate the wiki_context to a maximum of 5000 characters
    truncated_context = str(v['entity_pages']['wiki_context'])[:5000]
    upserts.append((v['question_id'], [float(x) for x in v['encoding']], {'bytes': truncated_context}))

# now upsert in chunks
for i in tqdm(range(0, len(upserts), 5)):
    i_end = i + 5
    if i_end > len(upserts): i_end = len(upserts)
    index.upsert(vectors=upserts[i:i_end])

  0%|          | 0/1599 [00:00<?, ?it/s]

In [None]:
from transformers import pipeline


In [None]:
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
question = dataset['question']

In [None]:
#question = ["Who is the king of netherlands", "Who is the king of England"]
index = pc.Index('wiki-validation-minilm')

results = []
n = len(question[0:500])
for q in question[0:500]:
  print(n)
  n = n-1
  #print(q)
  xq = model.encode([q]).tolist()
  xc = index.query(vector=xq, top_k=2, include_metadata=True)
  docs = []
  for x in range(len(xc["matches"])):
    con = xc["matches"][x]["metadata"]["bytes"]#["metadata"]
    docs.append(con)
  context = ''.join(docs)
  result = qa_pipeline(question=q, context=context)
  #print(f"Answer: {result['answer']}")
  results.append(result['answer'])

500
499
498
497
496
495
494
493
492
491
490
489
488
487
486
485
484
483
482
481
480
479
478
477
476
475
474
473
472
471
470
469
468
467
466
465
464
463
462
461
460
459
458
457
456
455
454
453
452
451
450
449
448
447
446
445
444
443
442
441
440
439
438
437
436
435
434
433
432
431
430
429
428
427
426
425
424
423
422
421
420
419
418
417
416
415
414
413
412
411
410
409
408
407
406
405
404
403
402
401
400
399
398
397
396
395
394
393
392
391
390
389
388
387
386
385
384
383
382
381
380
379
378
377
376
375
374
373
372
371
370
369
368
367
366
365
364
363
362
361
360
359
358
357
356
355
354
353
352
351
350
349
348
347
346
345
344
343
342
341
340
339
338
337
336
335
334
333
332
331
330
329
328
327
326
325
324
323
322
321
320
319
318
317
316
315
314
313
312
311
310
309
308
307
306
305
304
303
302
301
300
299
298
297
296
295
294
293
292
291
290
289
288
287
286
285
284
283
282
281
280
279
278
277
276
275
274
273
272
271
270
269
268
267
266
265
264
263
262
261
260
259
258
257
256
255
254
253
252
251


In [None]:
results.pop(0)

'Joseph'

In [None]:
results

['James Gordon Brown',
 'Exile',
 'Cancer Research Fund',
 'Hepburn',
 'Bad',
 'Kilimanjaro',
 'white',
 'Kerma',
 'Oliver!',
 'Cologne',
 'Richard Marx',
 'John Ford',
 'Pink Floyd',
 'Never Ending Tour',
 'Romania',
 'According to Jim (2001-2009)',
 'Fiddler on the Roof',
 'Treasure Island',
 'Truman Streckfus Persons',
 'literary nonsense',
 'Cool Rider',
 'Gerald Rudolff Ford Jr',
 'basketball',
 'Rudolf Hess',
 'Luke Steele',
 'Japan',
 'basketball',
 'Morning Train',
 'Horseshoe Falls',
 'The Squaw Man',
 'Dartford, Kent',
 'Prometheus, Pandora, and the Five Ages',
 'Jelly Roll Morton',
 'Southeast Yemen',
 '27',
 'Richard Noble',
 'American',
 'The Marshall Plan',
 'The Last Boy Scout',
 'Trace Adkins',
 '\\n\\nRobbie Williams',
 'Angola',
 'Diva',
 'Dark Blood',
 'David Keith Williamson',
 'Chicago',
 'Mata Hari',
 'cartoons published in the Daily Express',
 'Spencer Gore',
 'Palmer',
 'Famous Players-Lasky Corporation',
 'Clements Markham',
 'David Leigh',
 'George Dewey Cukor

In [None]:
resultsfirst500 = results

import json

filename = "first500."

write_json_to_file(resultsfirst500, filename)


In [None]:
resultsnewdimi = results

Leon's data

In [None]:
combined_answers = {
    1: "A Christmas Carol",
    2: "Mashiach ben David",
    3: "The Beatles",
    4: "Pre-Galfridian",
    5: "Miss Prism",
    6: "Destiny",
    7: "Mount Kilimanjaro",
    8: "orange",
    9: "Senegal",
    10: "Poland was home",
    11: "Adolf Hitler",
    12: "The Staple Singers",
    13: "Christian",
    14: "Dire Straits",
    15: "Nashville",
    16: "Romania",
    17: "Mostly Harmless",
    18: "Saudi Arabia",
    19: "The Silence of the Lambs",
    20: "Frank Lloyd Wright",
    21: "Boojum",
    22: "Last Vegas",
    23: "Arif",
    24: "rights of the sender to the goods are extinguished",
    25: "Rudolf Hess",
    26: "Inigo Montoya",
    27: "Azerbaijan",
    28: "love, war, wilderness and loss",
    29: "Exordium & Terminus",
    30: "Newfoundland Red and Blue ensigns",
    31: "Boa Vista",
    32: "Oxford",
    33: "Angeloi",
    34: "capsaicin",
    35: "confidential",
    36: "Ronnie Wood",
    37: "New Guinea",
    38: "The word \"shish\" (, ) means skewer",
    39: "Zoroastrianism",
    40: "Victim",
    41: "Thynne family",
    42: "rights of the sender to the goods are extinguished",
    43: "Club Europe",
    44: "Precious",
    45: "1999",
    46: "Darul Uloom",
    47: "",
    48: "Treaty ports",
    49: "Wine tasting",
    50: "Serena Williams",
    51: "Walter Hagen",
    52: "Star Trek",
    53: "Roald Amundsen",
    54: "Krystyna Nosarzewska",
    55: "James Brudenell",
    56: "cigarette",
    57: "army",
    58: "Basic education in Libya is free for all citizens",
    59: "Michael Rimmer",
    60: "Artemis",
    61: "July 21",
    62: "Armenia",
    63: "impartial arbitrator",
    64: "Charles",
    65: "five",
    66: "renewable energy",
    67: "Jerry Palter",
    68: "The Libyan National Army",
    69: "metaphysics",
    70: "Mostly Harmless",
    71: "extremely diverse",
    72: "Michigan State University",
    73: "Indonesian",
    74: "Michael Jackson",
    75: "Katie Abrams",
    76: "Cynthia Harris",
    77: "Laura",
    78: "Bonaparte",
    79: "The Hundred Days",
    80: "Don Sharp",
    81: "The Cage",
    82: "1973",
    83: "Lisbeth Salander",
    84: "The Movie Album",
    85: "Thalia",
    86: "Gene Hackman",
    87: "Bill Robinson",
    88: "Legacy",
    89: "Christopher Lee",
    90: "The word 'shish' means skewer",
    91: "Ywain the Great",
    92: "French",
    93: "Mapocho",
    94: "Book of Laws",
    95: "Franklin",
    96: "The VIPs",
    97: "Duleep Singh",
    98: "space propulsion",
    99: "New Zealand",
    100: "Mikhail Gorbachev"
}

In [None]:
#predictionL = list(combined_answers.values())
#predictionL.pop(0)
predictionaryL = dict(zip(keys, predictionL))
predictionaryL

{'tc_40': 'Mashiach ben David',
 'tc_49': 'The Beatles',
 'tc_56': 'Pre-Galfridian',
 'tc_106': 'Miss Prism',
 'tc_137': 'Destiny',
 'tc_217': 'Mount Kilimanjaro',
 'tc_219': 'orange',
 'tc_241': 'Senegal',
 'tc_261': 'Poland was home',
 'tc_267': 'Adolf Hitler',
 'tc_276': 'The Staple Singers',
 'tc_280': 'Christian',
 'tc_282': 'Dire Straits',
 'tc_288': 'Nashville',
 'tc_298': 'Romania',
 'tc_304': 'Mostly Harmless',
 'tc_316': 'Saudi Arabia',
 'tc_349': 'The Silence of the Lambs',
 'tc_379': 'Frank Lloyd Wright',
 'tc_397': 'Boojum',
 'tc_455': 'Last Vegas',
 'tc_510': 'Arif',
 'tc_515': 'rights of the sender to the goods are extinguished',
 'tc_517': 'Rudolf Hess',
 'tc_538': 'Inigo Montoya',
 'tc_540': 'Azerbaijan',
 'tc_543': 'love, war, wilderness and loss',
 'tc_559': 'Exordium & Terminus',
 'tc_561': 'Newfoundland Red and Blue ensigns',
 'tc_564': 'Boa Vista',
 'tc_586': 'Oxford',
 'tc_604': 'Angeloi',
 'tc_626': 'capsaicin',
 'tc_635': 'confidential',
 'tc_653': 'Ronnie Wood

Self made loop method

In [None]:
all_text_chunks = []
# Loop through each file in the directory
m=0
for data in flat_list:
        text_chunks = data.split('\n\n')  # Splitting by double newlines for paragraphs
        # Tokenize the text chunks
        for x in text_chunks:
            all_text_chunks.append(text_chunks)
        #m = m+1
        #if m > 1000: break

In [None]:
len(all_text_chunks)

9104

In [None]:
!pip install rank_bm25
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from rank_bm25 import BM25Okapi

# Function to extract nouns from a question
def find_nouns_verbs(question):
    # Perform tokenization and POS tagging
    tokenized = nltk.word_tokenize(question)
    pos_tags = nltk.pos_tag(tokenized)
    # Filter nouns based on POS tags
    nouns_and_verbs = [word for (word, pos) in pos_tags if pos[:2] == 'NN' or pos[:2] == 'VB']
    names = [word for (word, pos) in pos_tags if pos in ['NNP', 'NNPS']]
    return set(nouns_and_verbs), set(names)

# Function to retrieve relevant chunks using BM25
def retrieve_relevant_chunks(chunks, query, min_score):
    # Tokenize each chunk
    tokenized_chunks = [word_tokenize(chunk.lower()) for chunk in chunks]

    if not any(tokenized_chunks):  # Check if all sublists are empty
        return []  # Or handle the empty case appropriately

    # Initialize BM25 with the tokenized chunks
    bm25 = BM25Okapi(tokenized_chunks)
    # Tokenize the query
    tokenized_query = word_tokenize(query.lower())
    #print(tokenized_chunks)
    # Retrieve relevance scores
    scores = bm25.get_scores(tokenized_query)
    # Return chunks with scores above the minimum score
    return [chunks[i] for i in range(len(scores)) if scores[i] > min_score]

# Main function to extract chunks relevant to the question
def get_relevant_chunks(question, min_score=2):
    # List to store all matching text chunks
    relevant_chunks = []
    nouns_in_question, names = find_nouns_verbs(question)  # Convert nouns to a set for efficient lookup
    file_count = 0  # Counter to limit processed files

    # Loop through text files in the directory
    for text_chunk in all_text_chunks:
        # Split the document into chunks
            if text_chunk:
            # Retrieve chunks relevant to the query using BM25
              filtered_chunks = retrieve_relevant_chunks(text_chunk, question, min_score)
            # Keep only chunks that contain at least one noun from the question
              for chunk in filtered_chunks:
                tokenize_chunk = set(word_tokenize(chunk.lower()))
              #print(word_tokenize(chunk.lower()))  # Inspect the tokenized chunk
                if names != []:
                  if set(map(str.lower, names)).intersection(tokenize_chunk):
                    relevant_chunks.append(chunk)
                else:
                  if set(nouns_in_question).intersection(tokenize_chunk):
                    relevant_chunks.append(chunk)
    return relevant_chunks

In [None]:
from transformers import pipeline

# Initialize the question-answering pipeline
qa_pipeline = pipeline('question-answering', model='deepset/roberta-base-squad2')

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
for q in question:
  docs = get_relevant_chunks(q, min_score = 3)
  context = ''.join(docs)
  if context:
    result = qa_pipeline(question=q, context=context)
    print(f"Answer: {result['answer']}")
    results.append(result['answer'])

KeyboardInterrupt: 

In [None]:
predictions = read_json('resultsDPR1.json')

In [None]:
predictions

['Lord Salisbury',
 'Phil Collins',
 'breast cancer',
 'Blanche Hudson',
 "The Making of Michael Jackson's Thriller",
 'Mount Kilimanjaro',
 'red',
 'Niger',
 'My Fair Lady',
 'South Africa',
 'Richard Marx',
 'John Ford',
 'Beach Boys',
 '61',
 'Albania and Iran',
 'The Iron Dream',
 'Chicago',
 'Men Against the Sea',
 'Gerald Rudolff Ford Jr',
 'An Agony in 8 Fits',
 'Eddie',
 'Nixon',
 'writer',
 'He has also worked to promote humanitarian work in South Africa',
 'J. G. Ballard',
 'Japan',
 'championshipThere',
 'Stars on 45 Medley',
 'Vancouver and the North Shore',
 'Queen Christina',
 'Coventry, Warwickshire',
 'the sons of Uranus and Gaia',
 'American Historical Recordings',
 'South Africa',
 "When you're rich",
 'Richard Noble',
 'United States',
 'humanitarian work in South Africa',
 'Trek',
 'Byrds',
 'He has also worked to promote humanitarian work in South Africa',
 'Brazil',
 "Please Hammer, Don't Hurt 'Em",
 'Broken Dreams',
 'Osbert Lancaster',
 'Chicago',
 'Anna Christi