In [2]:
# Notwendige Bibliotheken installieren
!pip install datasets transformers sentence-transformers faiss-cpu

# Bibliotheken importieren
from datasets import load_dataset
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import torch
from concurrent.futures import ThreadPoolExecutor

# Laden des Datasets
dataset = load_dataset("trivia_qa", "rc.wikipedia", split="validation")

# QA-Pipeline laden
qa_pipeline = pipeline('question-answering', model='deepset/roberta-base-squad2')

# SentenceTransformer-Modell laden
sentence_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Funktion: Embeddings für einen Batch erstellen
def generate_batch_embeddings(batch_contexts, model):
    try:
        return model.encode(batch_contexts, convert_to_tensor=True)
    except Exception as e:
        print(f"Fehler beim Verarbeiten eines Batches: {e}")
        return None

# Funktion: Embeddings parallel erstellen
def generate_embeddings_parallel(contexts, model, batch_size=32, max_batches=None):
    embeddings = []
    total_batches = (len(contexts) + batch_size - 1) // batch_size
    print(f"Verarbeite {total_batches} Batches...")

    if max_batches is not None:
        total_batches = min(total_batches, max_batches)

    with ThreadPoolExecutor() as executor:
        futures = []
        for i in range(total_batches):
            batch_contexts = contexts[i * batch_size:(i + 1) * batch_size]
            futures.append(executor.submit(generate_batch_embeddings, batch_contexts, model))

        for i, future in enumerate(futures):
            try:
                result = future.result()
                if result is not None:
                    embeddings.append(result)
                print(f"Batch {i+1}/{total_batches} verarbeitet.")
            except Exception as e:
                print(f"Fehler beim Verarbeiten des Batches {i+1}: {e}")

    if embeddings:
        return torch.cat(embeddings, dim=0)
    else:
        print("Keine Embeddings generiert.")
        return None

# Funktion: Ähnlichsten Kontext finden
def find_most_relevant_context(question, contexts, model, index, stored_embeddings=None):
    question_embedding = model.encode([question], convert_to_tensor=True)

    if stored_embeddings is None:
        context_embeddings = generate_embeddings_parallel(contexts, model)
    else:
        context_embeddings = stored_embeddings

    if context_embeddings is None:
        print("Fehler: Keine Kontext-Embeddings generiert.")
        return None

    # FAISS-Suche für die effizienteste Ähnlichkeitsberechnung
    D, I = index.search(question_embedding.cpu().numpy(), k=1)  # Top-1 Ergebnis
    most_relevant_index = I[0][0]
    return contexts[most_relevant_index]

# Hauptfunktion: Frage beantworten
def answer_question(question, dataset, qa_pipeline, sentence_model, index, stored_embeddings=None):
    contexts = []
    for example in dataset:
        entity_pages = example.get("entity_pages", {})
        wiki_context = entity_pages.get("wiki_context", "")

        if isinstance(wiki_context, list):
            wiki_context = " ".join(wiki_context)

        if wiki_context:
            contexts.append(wiki_context)

    if not contexts:
        print("Fehler: Keine gültigen Kontexte im Dataset.")
        return "Keine gültigen Kontexte gefunden."

    relevant_context = find_most_relevant_context(question, contexts, sentence_model, index, stored_embeddings)
    if relevant_context is None:
        return "Kein relevanter Kontext gefunden."

    result = qa_pipeline({
        "question": question,
        "context": relevant_context
    })

    return result.get('answer', 'Keine Antwort gefunden')

# Hauptablauf
def main():
    # Kontext extrahieren
    contexts = []
    for example in dataset:
        entity_pages = example.get("entity_pages", {})
        wiki_context = entity_pages.get("wiki_context", "")

        if isinstance(wiki_context, list):
            wiki_context = " ".join(wiki_context)

        if wiki_context:
            contexts.append(wiki_context)

    print(f"Anzahl der Kontexte: {len(contexts)}")

    # FAISS-Index vorbereiten
    print("Erstelle Embeddings und FAISS-Index...")
    embeddings = generate_embeddings_parallel(contexts, sentence_model)
    if embeddings is None:
        print("Fehler: Embeddings konnten nicht generiert werden.")
        return

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings.cpu().numpy())

    # Frage stellen
    question = "Who wrote the Harry Potter series?"
    print("Frage wird verarbeitet...")
    answer = answer_question(question, dataset, qa_pipeline, sentence_model, index, embeddings)
    torch.save(embeddings, "embeddings.pt")
    faiss.write_index(index, "faiss_index.index")


    print(f"Antwort: {answer}")

# Programm starten
if __name__ == "__main__":
    main()


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_6

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/26.7k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

train-00000-of-00007.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

train-00001-of-00007.parquet:   0%|          | 0.00/261M [00:00<?, ?B/s]

train-00002-of-00007.parquet:   0%|          | 0.00/319M [00:00<?, ?B/s]

train-00003-of-00007.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00004-of-00007.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

train-00005-of-00007.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

train-00006-of-00007.parquet:   0%|          | 0.00/253M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/61888 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7993 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7701 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Anzahl der Kontexte: 7993
Erstelle Embeddings und FAISS-Index...
Verarbeite 250 Batches...
Batch 1/250 verarbeitet.
Batch 2/250 verarbeitet.
Batch 3/250 verarbeitet.
Batch 4/250 verarbeitet.
Batch 5/250 verarbeitet.
Batch 6/250 verarbeitet.
Batch 7/250 verarbeitet.
Batch 8/250 verarbeitet.
Batch 9/250 verarbeitet.
Batch 10/250 verarbeitet.
Batch 11/250 verarbeitet.
Batch 12/250 verarbeitet.
Batch 13/250 verarbeitet.
Batch 14/250 verarbeitet.
Batch 15/250 verarbeitet.
Batch 16/250 verarbeitet.
Batch 17/250 verarbeitet.
Batch 18/250 verarbeitet.
Batch 19/250 verarbeitet.
Batch 20/250 verarbeitet.
Batch 21/250 verarbeitet.
Batch 22/250 verarbeitet.
Batch 23/250 verarbeitet.
Batch 24/250 verarbeitet.
Batch 25/250 verarbeitet.
Batch 26/250 verarbeitet.
Batch 27/250 verarbeitet.
Batch 28/250 verarbeitet.
Batch 29/250 verarbeitet.
Batch 30/250 verarbeitet.
Batch 31/250 verarbeitet.
Batch 32/250 verarbeitet.
Batch 33/250 verarbeitet.
Batch 34/250 verarbeitet.
Batch 35/250 verarbeitet.
Batch 36



Antwort: J. K. Rowling


In [7]:
# Bibliotheken importieren
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
import faiss
import torch

# Generatives Modell laden (z.B. Flan-T5 oder T5)
model_name = "google/flan-t5-base"  # Modell wählen
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
gen_tokenizer = AutoTokenizer.from_pretrained(model_name)

# SentenceTransformer-Modell laden
sentence_model = SentenceTransformer('all-mpnet-base-v2')  # Präziseres Modell für semantische Ähnlichkeit

# Funktion: Ähnlichsten Kontext finden
def find_top_k_contexts(question, model, index, contexts, k=3):
    """
    Findet die k relevantesten Kontexte für eine Frage.

    Parameters:
    - question: Die Frage als String.
    - model: Das SentenceTransformer-Modell.
    - index: Der FAISS-Index mit gespeicherten Embeddings.
    - contexts: Die Liste der ursprünglichen Kontexte.
    - k: Anzahl der Top-Ergebnisse.

    Returns:
    - Eine Liste der k relevantesten Kontexte.
    """
    # Embedding der Frage erstellen
    question_embedding = model.encode([question])

    # FAISS-Suche durchführen
    distances, indices = index.search(question_embedding, k=k)  # Top-k Ergebnisse

    # Kontexte extrahieren und überprüfen
    relevant_contexts = []
    for idx in indices[0]:
        if 0 <= idx < len(contexts):  # Validierung, um Out-of-Bounds-Fehler zu vermeiden
            relevant_contexts.append(contexts[idx])
    return relevant_contexts

# Funktion: Kontext kürzen
def preprocess_context(context, max_length=512):
    """
    Kürzt den Kontext auf eine maximale Länge (in Tokens).
    """
    return " ".join(context.split()[:max_length])

# Funktion: Fragen beantworten
def answer_first_100_questions(dataset, contexts, model, index, gen_model, gen_tokenizer):
    """
    Beantwortet die ersten 100 Fragen aus dem rc.wikipedia-Dataset.

    Parameters:
    - dataset: Das rc.wikipedia Validation-Dataset.
    - contexts: Die Liste der ursprünglichen Kontexte.
    - model: Das SentenceTransformer-Modell.
    - index: Der FAISS-Index mit gespeicherten Embeddings.
    - gen_model: Das generative Modell für QA.
    - gen_tokenizer: Der Tokenizer des generativen Modells.

    Returns:
    - Eine Liste mit Ergebnissen (Frage, relevante Kontexte, Antwort).
    """
    results = []

    print("Beantworte die ersten 100 Fragen...")

    for i in range(100):  # Schleife durch die ersten 100 Elemente
        example = dataset[i]  # Hole das i-te Element als Dictionary
        question = example["question"]  # Extrahiere die Frage

        print(f"Verarbeite Frage {i+1}: {question}")

        # Top-3 relevante Kontexte finden
        top_contexts = find_top_k_contexts(question, model, index, contexts, k=3)
        if not top_contexts:
            print(f"Warnung: Kein relevanter Kontext für Frage {i+1} gefunden.")
            continue

        # Eingabe für das generative Modell erstellen
        combined_context = " ".join(preprocess_context(c) for c in top_contexts)
        input_text = f"question: {question} context: {combined_context}"
        inputs = gen_tokenizer.encode(input_text, return_tensors="pt", truncation=True)

        # Antwort generieren
        outputs = gen_model.generate(inputs, max_length=150, num_beams=5, early_stopping=True)
        answer = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Ergebnis speichern
        results.append({
            "question": question,
            "contexts": top_contexts,
            "answer": answer
        })

    print("Verarbeitung abgeschlossen.")
    return results

# Hauptablauf
if __name__ == "__main__":
    print("Lade gespeicherte Embeddings und FAISS-Index...")

    # Gespeicherte Embeddings und Index laden
    embeddings = torch.load("embeddings.pt")
    index = faiss.read_index("faiss_index.index")

    # Dataset laden
    print("Lade rc.wikipedia Validation-Dataset...")
    dataset = load_dataset("trivia_qa", "rc.wikipedia", split="validation")

    # Kontexte extrahieren
    print("Extrahiere Kontexte...")
    contexts = []
    for example in dataset:
        entity_pages = example.get("entity_pages", {})
        wiki_context = entity_pages.get("wiki_context", "")

        if isinstance(wiki_context, list):
            wiki_context = " ".join(wiki_context)

        if wiki_context:
            contexts.append(wiki_context)

    print(f"Anzahl der Kontexte: {len(contexts)}")

    # Beantworte die ersten 100 Fragen
    results = answer_first_100_questions(dataset, contexts, sentence_model, index, gen_model, gen_tokenizer)

    # Ergebnisse ausgeben
    for i, result in enumerate(results):
        print(f"Frage {i+1}: {result['question']}")
        print(f"Antwort: {result['answer']}")
        print(f"Relevante Kontexte: {' '.join(result['contexts'])[:200]}...\n")


Lade gespeicherte Embeddings und FAISS-Index...
Lade rc.wikipedia Validation-Dataset...


  embeddings = torch.load("embeddings.pt")


Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Extrahiere Kontexte...
Anzahl der Kontexte: 7993
Beantworte die ersten 100 Fragen...
Verarbeite Frage 1: Which Lloyd Webber musical premiered in the US on 10th December 1993?
Verarbeite Frage 2: Who was the next British Prime Minister after Arthur Balfour?
Verarbeite Frage 3: Who had a 70s No 1 hit with Kiss You All Over?
Verarbeite Frage 4: What claimed the life of singer Kathleen Ferrier?
Verarbeite Frage 5: Which actress was voted Miss Greenwich Village in 1942?
Verarbeite Frage 6: What was the name of Michael Jackson's autobiography written in 1988?
Verarbeite Frage 7: Which volcano in Tanzania is the highest mountain in Africa?
Verarbeite Frage 8: The flag of Libya is a plain rectangle of which color?
Verarbeite Frage 9: Of which African country is Niamey the capital?
Verarbeite Frage 10: Which musical featured the song The Street Where You Live?
Verarbeite Frage 11: "Who was the target of the failed ""Bomb Plot"" of 1944?"
Verarbeite Frage 12: Who had an 80s No 1 hit with Hold On

In [4]:
for i, result in enumerate(results):
        print(f"Frage {i+1}: {result['question']}")
        print(f"Antwort: {result['answer']}")
        print(f"Kontext: {result['context'][:200]}...\n")

Frage 1: Which Lloyd Webber musical premiered in the US on 10th December 1993?
Antwort: A Christmas Carol
Kontext: A Christmas Carol is a musical with music by Alan Menken, lyrics by Lynn Ahrens, and book by Mike Ockrent and Lynn Ahrens. The musical is based on Charles Dickens' 1843 novella of the same name. The s...

Frage 2: Who was the next British Prime Minister after Arthur Balfour?
Antwort: Maimonides
Kontext: In Jewish eschatology the term mashiach, or "Messiah", came to refer to a future Jewish king from the Davidic line, who is expected to be anointed with holy anointing oil and rule the Jewish people du...

Frage 3: Who had a 70s No 1 hit with Kiss You All Over?
Antwort: Astrud Gilberto
Kontext: "Garota de Ipanema" ("The Girl from Ipanema") is a Brazilian bossa nova jazz song.  It was a worldwide hit in the mid-1960s and won a Grammy for Record of the Year in 1965. It was written in 1962, wit...

Frage 4: What claimed the life of singer Kathleen Ferrier?
Antwort: rape
Kontext

In [None]:
print(dataset)

Dataset({
    features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'],
    num_rows: 7993
})


In [None]:
# Bibliotheken importieren
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import torch

# Embeddings und FAISS-Index laden
print("Lade gespeicherte Embeddings und FAISS-Index...")
embeddings = torch.load("embeddings.pt")  # Die generierten Embeddings
index = faiss.read_index("faiss_index.index")  # Der FAISS-Index
print("Embeddings und Index erfolgreich geladen.")

# SentenceTransformer-Modell laden
sentence_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# QA-Pipeline laden
qa_pipeline = pipeline('question-answering', model='deepset/roberta-base-squad2')

# Funktion, um den relevantesten Kontext zu finden
def find_relevant_context(question, model, index, contexts):
    # Embedding der Frage erstellen
    question_embedding = model.encode([question], convert_to_tensor=True)

    # FAISS-Suche durchführen
    distances, indices = index.search(question_embedding.cpu().numpy(), k=1)  # Top-1 Ergebnis

    # Index des relevantesten Kontextes zurückgeben
    most_relevant_index = indices[0][0]
    return contexts[most_relevant_index]

# Neue Frage stellen
def ask_new_question(question, contexts, model, index, qa_pipeline):
    print(f"Verarbeite neue Frage: {question}")
    relevant_context = find_relevant_context(question, model, index, contexts)

    # Frage-Antwort-Pipeline auf den relevanten Kontext anwenden
    result = qa_pipeline({
        "question": question,
        "context": relevant_context
    })

    return result.get('answer', 'Keine Antwort gefunden')

# Kontexte laden oder definieren
print("Extrahiere Kontexte...")
contexts = []  # Fülle diese Liste mit den ursprünglichen Kontexten, die du für die Embeddings verwendet hast.
for example in dataset:
    entity_pages = example.get("entity_pages", {})
    wiki_context = entity_pages.get("wiki_context", "")
    if isinstance(wiki_context, list):
        wiki_context = " ".join(wiki_context)
    if wiki_context:
        contexts.append(wiki_context)

# Neue Frage stellen
new_question = "What are the names of the continents?"
answer = ask_new_question(new_question, contexts, sentence_model, index, qa_pipeline)
print(f"Antwort auf die Frage '{new_question}': {answer}")




Lade gespeicherte Embeddings und FAISS-Index...


  embeddings = torch.load("embeddings.pt")  # Die generierten Embeddings


FileNotFoundError: [Errno 2] No such file or directory: 'embeddings.pt'

In [None]:
def ask_first_100_questions(dataset, contexts, model, index, qa_pipeline):
    """
    Processes the first 100 questions from the rc.wikipedia validation dataset.

    Parameters:
    - dataset: The rc.wikipedia validation dataset.
    - contexts: List of all contexts used for generating embeddings.
    - model: The SentenceTransformer model for encoding questions.
    - index: The FAISS index for retrieving relevant contexts.
    - qa_pipeline: The question-answering pipeline.

    Returns:
    - results: A list of dictionaries with the question, context, and answer.
    """
    results = []

    print("Processing the first 100 questions from the dataset...")

    for i, example in enumerate(dataset[:100]):
        question = example.get("question", "")

        if not question:
            print(f"Skipping example {i} due to missing question.")
            continue

        print(f"Processing question {i + 1}: {question}")

        # Find the relevant context for the question
        relevant_context = find_relevant_context(question, model, index, contexts)

        # Apply the QA pipeline
        result = qa_pipeline({
            "question": question,
            "context": relevant_context
        })

        # Collect the result
        results.append({
            "question": question,
            "context": relevant_context,
            "answer": result.get('answer', 'Keine Antwort gefunden')
        })

    print("Processing completed.")
    return results

# Example of running the function
# Assuming `rc_wikipedia_dataset` is the validation dataset loaded into memory
# results = ask_first_100_questions(rc_wikipedia_dataset, contexts, sentence_model, index, qa_pipeline)

# You can then analyze or save `results` for further use, such as writing to a JSON file.


In [None]:
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from tqdm import tqdm


  from tqdm.autonotebook import tqdm, trange


In [None]:
!pip install faiss-gpu sentence-transformers datasets

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.2.0-py3-none-any.whl (480 kB