In [7]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import json


def write_json_to_file(json_object, json_file, mode='w', encoding='utf-8'):
    with open(json_file, mode, encoding=encoding) as outfile:
        json.dump(json_object, outfile, indent=4, sort_keys=True, ensure_ascii=False)


def get_file_contents(filename, encoding='utf-8'):
    with open(filename, encoding=encoding) as f:
        content = f.read()
    return content


def read_json(filename, encoding='utf-8'):
    contents = get_file_contents(filename, encoding=encoding)
    return json.loads(contents)


def get_file_contents_as_list(file_path, encoding='utf-8', ignore_blanks=True):
    contents = get_file_contents(file_path, encoding=encoding)
    lines = contents.split('\n')
    lines = [line for line in lines if line != ''] if ignore_blanks else lines
    return lines

In [None]:
# Key for wikipedia eval is question-id. Key for web eval is the (question_id, filename) tuple
def get_key_to_ground_truth(data):
    if data['Domain'] == 'Wikipedia':
        return {datum['QuestionId']: datum['Answer'] for datum in data['Data']}
    else:
        return get_qd_to_answer(data)


def get_question_doc_string(qid, doc_name):
    return '{}--{}'.format(qid, doc_name)

def get_qd_to_answer(data):
    key_to_answer = {}
    for datum in data['Data']:
        for page in datum.get('EntityPages', []) + datum.get('SearchResults', []):
            qd_tuple = get_question_doc_string(datum['QuestionId'], page['Filename'])
            key_to_answer[qd_tuple] = datum['Answer']
    return key_to_answer
#this is for getting the answers out of the dataset, but i can do that differently

def read_clean_part(datum):
    for key in ['EntityPages', 'SearchResults']:
        new_page_list = []
        for page in datum.get(key, []):
            if page['DocPartOfVerifiedEval']:
                new_page_list.append(page)
        datum[key] = new_page_list
    assert len(datum['EntityPages']) + len(datum['SearchResults']) > 0
    return datum


def read_triviaqa_data(qajson):
    data = read_json(qajson)
    # read only documents and questions that are a part of clean data set
    if data['VerifiedEval']:
        clean_data = []
        for datum in data['Data']:
            if datum['QuestionPartOfVerifiedEval']:
                if data['Domain'] == 'Web':
                    datum = read_clean_part(datum)
                clean_data.append(datum)
        data['Data'] = clean_data
    return data


def answer_index_in_document(answer, document):
    answer_list = answer['NormalizedAliases']
    for answer_string_in_doc in answer_list:
        index = document.lower().find(answer_string_in_doc)
        if index != -1:
            return answer_string_in_doc, index
    return answer['NormalizedValue'], -1

In [None]:
# -*- coding: utf-8 -*-
""" Official evaluation script for v1.0 of the TriviaQA dataset.
Extended from the evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import sys
import argparse


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def handle_punc(text):
        exclude = set(string.punctuation + "".join([u"‘", u"’", u"´", u"`"]))
        return ''.join(ch if ch not in exclude else ' ' for ch in text)

    def lower(text):
        return text.lower()

    def replace_underscore(text):
        return text.replace('_', ' ')

    return white_space_fix(remove_articles(handle_punc(lower(replace_underscore(s))))).strip()


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def is_exact_match(answer_object, prediction):
    ground_truths = get_ground_truths(answer_object)
    for ground_truth in ground_truths:
        if exact_match_score(prediction, ground_truth):
            return True
    return False


def has_exact_match(ground_truths, candidates):
    for ground_truth in ground_truths:
        if ground_truth in candidates:
            return True
    return False


def get_ground_truths(answer):
    return answer['normalized_aliases'] + [normalize_answer(ans) for ans in answer.get('HumanAnswers', [])]


def get_oracle_score(ground_truth, predicted_answers, qid_list=None, mute=False):
    exact_match = common = 0
    if qid_list is None:
        qid_list = ground_truth.keys()
    for qid in qid_list:
        if qid not in predicted_answers:
            if not mute:
                message = 'Irrelavant question {} will receive score 0.'.format(qid)
                print(message, file=sys.stderr)
            continue
        common += 1
        prediction = normalize_answer(predicted_answers[qid])
        ground_truths = get_ground_truths(ground_truth[qid])
        em_for_this_question = has_exact_match(ground_truths, prediction)
        exact_match += int(em_for_this_question)

    exact_match = 100.0 * exact_match / len(qid_list)

    return {'oracle_exact_match': exact_match, 'common': common, 'denominator': len(qid_list),
            'pred_len': len(predicted_answers), 'gold_len': len(ground_truth)}


def evaluate_triviaqa(ground_truth, predicted_answers, qid_list=None, mute=False):
    f1 = exact_match = common = 0
    if qid_list is None:
        qid_list = ground_truth.keys()
    for qid in qid_list:
        if qid not in predicted_answers:
            if not mute:
                message = 'Missed question {} will receive score 0.'.format(qid)
                print(message, file=sys.stderr)
            continue
        if qid not in ground_truth:
            if not mute:
                message = 'Irrelavant question {} will receive score 0.'.format(qid)
                print(message, file=sys.stderr)
            continue
        common += 1
        prediction = predicted_answers[qid]
        ground_truths = get_ground_truths(ground_truth[qid])
        em_for_this_question = metric_max_over_ground_truths(
            exact_match_score, prediction, ground_truths)
        if em_for_this_question == 0 and not mute:
            print("em=0:", prediction, ground_truths)
        exact_match += em_for_this_question
        f1_for_this_question = metric_max_over_ground_truths(
            f1_score, prediction, ground_truths)
        f1 += f1_for_this_question

    exact_match = 100.0 * exact_match / len(qid_list)
    f1 = 100.0 * f1 / len(qid_list)

    return {'exact_match': exact_match, 'f1': f1, 'common': common, 'denominator': len(qid_list),
            'pred_len': len(predicted_answers), 'gold_len': len(ground_truth)}


def get_args():
    parser = argparse.ArgumentParser(
        description='Evaluation for TriviaQA {}'.format(expected_version))
    parser.add_argument('--dataset_file', help='Dataset file')
    parser.add_argument('--prediction_file', help='Prediction File')
    args = parser.parse_args()
    return args

Load dataset

In [8]:
from datasets import load_dataset
dataset = load_dataset('trivia_qa', 'rc.wikipedia', split='validation')

contexts = [entity_page['wiki_context'] for entity_page in dataset['entity_pages']]
ground_truth = {datum['question_id']: datum['answer'] for datum in dataset}

README.md:   0%|          | 0.00/26.7k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

train-00000-of-00007.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

train-00001-of-00007.parquet:   0%|          | 0.00/261M [00:00<?, ?B/s]

train-00002-of-00007.parquet:   0%|          | 0.00/319M [00:00<?, ?B/s]

train-00003-of-00007.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00004-of-00007.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

train-00005-of-00007.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

train-00006-of-00007.parquet:   0%|          | 0.00/253M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/61888 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7993 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7701 [00:00<?, ? examples/s]

Pinecone first model


In [None]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, DPRContextEncoder

model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

In [2]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0.

Pinecoding create embeddings with model above (one time)



In [None]:
def encode_function(data):
    encodings = []
    for page in data['entity_pages']:  # Iterate through individual examples
        wiki_contexts = page['wiki_context']
        if not isinstance(wiki_contexts, list):
            wiki_contexts = [wiki_contexts]
        context = " ".join(wiki_contexts)
        encodings.append(model.encode(context))
    return {'encoding': encodings}

dataset = dataset.map(encode_function, batched=True, batch_size=4)


Map:   0%|          | 0/7993 [00:00<?, ? examples/s]

In [None]:
API_KEY = "pcsk_3zVMgZ_Fj5RX3UmosrRwTM5z5Fq4k2VbgBGYYqpYdivwmWR6bXfqjToAGcmX4wVp5xriVz"

pc = Pinecone(
    api_key=API_KEY
)

In [None]:
spec = ServerlessSpec(
    cloud="aws",
    region="us-east-1",
  )

# check if index already exists, if not we create it
if 'modelst' not in pc.list_indexes():
    pc.create_index(
        name='modelst', dimension=model.get_sentence_embedding_dimension(), metric='cosine',spec=spec
    )

# we use this to get required index dims
model.get_sentence_embedding_dimension()

384

In [None]:
# initialize connection to the new index
index = pc.Index('modelst')

from tqdm.auto import tqdm  # progress bar

upserts = []
for v in dataset:
    # Truncate the wiki_context to a maximum of 5000 characters
    truncated_context = str(v['entity_pages']['wiki_context'])[:5000]
    upserts.append((v['question_id'], [float(x) for x in v['encoding']], {'bytes': truncated_context}))

# now upsert in chunks
for i in tqdm(range(0, len(upserts), 5)):
    i_end = i + 5
    if i_end > len(upserts): i_end = len(upserts)
    index.upsert(vectors=upserts[i:i_end])

  0%|          | 0/1599 [00:00<?, ?it/s]

Pinecone load embeddings

In [None]:
from pinecone import Pinecone, ServerlessSpec
from google.colab import userdata

API_KEY = "pcsk_3zVMgZ_Fj5RX3UmosrRwTM5z5Fq4k2VbgBGYYqpYdivwmWR6bXfqjToAGcmX4wVp5xriVz"

pc = Pinecone(
    api_key=API_KEY
)

Pinecone reader and storing

In [None]:
from transformers import pipeline
#qa_pipeline = pipeline("question-answering", model="FabianWillner/bert-base-uncased-finetuned-triviaqa")
qa_pipeline = pipeline("text2text-generation", model="allenai/unifiedqa-t5-small")

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
question = dataset['question']

In [None]:
len(question)

7993

In [None]:
#question = ["Who is the king of netherlands", "Who is the king of England"]
index = pc.Index('wiki-validation-minilm')

results = []
n = len(question[0:10])
for q in question[0:10]:
  print(n)
  n = n-1
  print(q)
  xq = model.encode([q]).tolist()
  xc = index.query(vector=xq, top_k=2, include_metadata=True)
  docs = []
  for x in range(len(xc["matches"])):
    con = xc["matches"][x]["metadata"]["bytes"]#["metadata"]
    docs.append(con)
  context = ''.join(docs)
  #result = qa_pipeline(question=q, context=context)
  result = qa_pipeline(f"question: {q} context: {context}")
  #print(f"Answer: {result['answer']}")
  print(f"Answer: {result[0]['generated_text']}") # Access the 'generated_text' from the first element of the list
  #results.append(result['answer'])
  results.append(result[0]['generated_text']) # Append the 'generated_text' to results


10
Which Lloyd Webber musical premiered in the US on 10th December 1993?




Answer: yes
9
Who was the next British Prime Minister after Arthur Balfour?
Answer: edwards
8
Who had a 70s No 1 hit with Kiss You All Over?
Answer: no
7
What claimed the life of singer Kathleen Ferrier?
Answer: how she lied to her father about her illness.
6
Which actress was voted Miss Greenwich Village in 1942?
Answer: edward gammell
5
What was the name of Michael Jackson's autobiography written in 1988?
Answer: the bible
4
Which volcano in Tanzania is the highest mountain in Africa?
Answer: Kilimanjaro
3
The flag of Libya is a plain rectangle of which color?
Answer: white
2
Of which African country is Niamey the capital?
Answer: northampton
1
Which musical featured the song The Street Where You Live?
Answer: no


In [None]:
len(results)

74

In [None]:

import json

filename = "resultsbase"

write_json_to_file(resultsbase, filename)


Evaluation (for now with just 100 questions)

In [None]:
subset = dataset.select(range(0,73))
ground_truth = {datum['question_id']: datum['answer'] for datum in subset}

In [None]:
keys = []
for key in ground_truth.keys():
  keys.append(key)

predictionary = dict(zip(keys, results))
accuracy = evaluate_triviaqa(ground_truth, predictionary)

em=0: Likes of Us ['sunset boulevard', 'sunset bulevard', 'west sunset boulevard', 'sunset blvd']
em=0: Maurice Harold Macmillan ['henry campbell bannerman', 'sir henry campbell bannerman', 'campbell bannerman']
em=0: Rubettes ['exiles', 'voluntary exile', 'forced exile', 'banish', 'self exile', 'exile politics and government', 'exile in greek tragedy', 'sent into exile', 'banishment', 'transported for life', 'exile', 'internal exile', 'exile and banishment']
em=0: Rape of Lucretia ['aids related cancer', 'sporadic cancer', 'cancer disease', 'malignant tumors', 'cancers', 'carcinophobia', 'cancer', 'cancer diagnosis', 'malignant neoplastic disease', 'malignant neoplasm', 'tumour virus', 'cancer medicine', 'deaths by cancer', 'malignant tumour', 'epithelial cancers', 'solid cancer', 'cancerous', 'borderline cancer', 'invasive cancer', 'anti cancer', 'cancer pathology', 'cancer signs', 'cancer aromatase', 'cancer therapy', 'financial toxicity', 'cancerophobia', 'cancer en cuirasse', 'can

In [None]:
accuracy

{'exact_match': 38.35616438356164,
 'f1': 42.68101761252446,
 'common': 73,
 'denominator': 73,
 'pred_len': 73,
 'gold_len': 73}

Baseline

In [None]:
import torch

In [None]:
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("deep-learning-analytics/triviaqa-t5-base")
model = AutoModelWithLMHead.from_pretrained("deep-learning-analytics/triviaqa-t5-base")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def extract_text(text_list):
  extracted_text = text_list[0].split('<pad> ')[1]
  extracted_text = extracted_text.replace("</s>", "")
  return extracted_text

Predicted Answer:  ['<pad> Gerald Ford</s>']


In [None]:
resultsbase = []
for text in question:
  print(text)
  preprocess_text = text.strip().replace("\n","")
  tokenized_text = tokenizer.encode(preprocess_text, return_tensors="pt").to(device)
  outs = model.generate(
            tokenized_text,
            max_length=10,
            num_beams=2,
            early_stopping=True
           )
  dec = [tokenizer.decode(ids) for ids in outs]
  answer = extract_text(dec)
  print(answer)
  resultsbase.append(answer)

[1;30;43mStreaminguitvoer ingekort tot de laatste 5000 regels.[0m
Forest Whitaker won the Oscar for Best Actor for his role in which 2006 film?
The Hunt for Red October
In1918 women over what age were given the right to vote?
30
What was the capital of the Confederacy in the American Civil War?
Richmond, Virginia
In the periodic table of elements which metal comes after beryllium and magnesium in the alkaline earth metals (valency 2)?
POTASSIUM
Which Roman Emperor committed suicide in 68 AD after being declared an enemy of the state by the senate?
Claudius
Which play by Sir Peter Shaffer was made into a film with Tom Hulce in the title role?
A MAN FOR ALL SEASONS
Green plover and peewit are alternative names for which British bird?
Lapwing
In which TV 'quiz' show was the consolation prize a chequebook and pen?
Bullseye
Disaronno is an 'amaretto' liqueur with a characteristic bitter-almond taste, however it doesn't contain any nuts at all; from the kernel of which fruit does the flavo

In [None]:
len(resultsbase)

7993

Try new model

In [3]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [1]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0

In [4]:
# Load model directly
from transformers import AutoTokenizer, DPRContextEncoder
import faiss

tokenizer = AutoTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
model = DPRContextEncoder.from_pretrained("PaulLerner/dpr_context_encoder_triviaqa_without_viquae")
index = faiss.IndexFlatL2(768)


Some weights of the model checkpoint at PaulLerner/dpr_context_encoder_triviaqa_without_viquae were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
def encode_function(data):
    encodings = []
    for page in data['entity_pages']:  # Iterate through individual examples
        wiki_contexts = page['wiki_context']
        if not isinstance(wiki_contexts, list):
            wiki_contexts = [wiki_contexts]
        context = " ".join(wiki_contexts)
        encoded_contexts = tokenizer(context, return_tensors="pt", padding=True, truncation=True, max_length=512)
        context_vectors = model(**encoded_contexts).pooler_output.detach().numpy()
        encodings.append(context_vectors)
        #index.add(context_vectors)
    return {'encoding': encodings}

dataset = dataset.map(encode_function, batched=True, batch_size=4)

Map:   0%|          | 0/7993 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
spec = ServerlessSpec(
    cloud="aws",
    region="us-east-1",
)

# check if index already exists, if not we create it
if 'modelplgood' not in pc.list_indexes():
    # Get the embedding dimension from the model's config
    embedding_dimension = model.config.hidden_size
    pc.create_index(
        name='modelplgood', dimension=embedding_dimension, metric='cosine', spec=spec
    )

# Now you can use embedding_dimension for other purposes if needed
print(f"Embedding dimension: {embedding_dimension}")

Embedding dimension: 768


In [6]:
# initialize connection to the new index
index = pc.Index('modelplgood')

from tqdm.auto import tqdm  # progress bar

upserts = []
for v in dataset:
    # Truncate the wiki_context to a maximum of 5000 characters
    truncated_context = str(v['entity_pages']['wiki_context'])[:5000]
    #upserts.append((v['question_id'], [float(x) for x in v['encoding']], {'bytes': truncated_context}))
    upserts.append((v['question_id'], [float(i) for i in v['encoding'][0]], {'bytes': truncated_context}))
# now upsert in chunks
for i in tqdm(range(0, len(upserts), 5)):
    i_end = i + 5
    if i_end > len(upserts): i_end = len(upserts)
    index.upsert(vectors=upserts[i:i_end])

NameError: name 'dataset' is not defined

In [None]:
index

<pinecone.data.index.Index at 0x7924cecf6e90>

In [3]:
from pinecone import Pinecone, ServerlessSpec
from google.colab import userdata

API_KEY = "pcsk_3zVMgZ_Fj5RX3UmosrRwTM5z5Fq4k2VbgBGYYqpYdivwmWR6bXfqjToAGcmX4wVp5xriVz"

pc = Pinecone(
    api_key=API_KEY
)
index = pc.Index('modelplgood')


In [5]:
len(index)

TypeError: object of type 'Index' has no len()