Imports und Setup

In [2]:
from openai import OpenAI
import numpy as np
import faiss
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
import requests
from bs4 import BeautifulSoup

# IMPORTANT add your api key here
api_key = "your-api-key"


# website of the check24 Tippspiel Teilnahmebedingungen
website_url = "https://tippspiel.check24.de/ul/champions-league-24-25/teilnahmebedingungen"

embedding_model = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=api_key)
client = OpenAI(api_key=api_key)

Websitetext einlesen und für Embedding vorbereiten

In [19]:
def read_website(url):
    # read the text from the website
    response = requests.get(url)
    html_content = response.text

    # remove unwanted tags
    soup = BeautifulSoup(html_content, "html.parser")

    for tag in soup.select("nav, footer, .c24-cookie-consent-notice, .ads, .sidebar"):
        tag.decompose()

    text = soup.get_text(separator="\n")  # Extract text while keeping structure

    return text

In [20]:
def remove_cookies_info(text):
    lines = text.split("\n")
    cleaned_lines = []
    cookies_over = False

    for line in lines:
        if line.strip() == "Alle akzeptieren": #search for end of cookie info which appears at beginning of scraped text
            cookies_over = True
        if cookies_over:
            cleaned_lines.append(line)
    return "\n".join(cleaned_lines)

def remove_empty_lines(text):
    lines = text.split("\n")
    cleaned_lines = []

    for line in lines:
        if line.strip() == "":
            continue
        cleaned_lines.append(line)
    return "\n".join(cleaned_lines)

def remove_urls(text):
    lines = text.split("\n")
    cleaned_lines = []
    skip_next = 0

    for i, line in enumerate(lines):
        if i < len(lines) - 2 and (lines[i+1][0:4] == "http" or lines[i+1] == "hier" or lines[i+1] == "Link" ): # skip urls and remove corresponding line brakes
            skip_next = 3
            cleaned_lines.append(lines[i] + lines[i+2])
        if skip_next > 0:
            skip_next -= 1
            continue
        cleaned_lines.append(line)
    return "\n".join(cleaned_lines)


def remove_top_level_headers(text):
    lines = text.split("\n")
    cleaned_lines = []
    
    for i in range(len(lines)):
        # Skip first two lines (they only contain headers etc)
        if i < 2:
            continue

        # Detect a top-level section header (e.g., "3") followed by a short title
        if re.match(r"^\d+$", lines[i].strip()) and i + 1 < len(lines):
            continue  # Do not include the number
        elif re.match(r"^\d+$", lines[i - 1].strip()):
            continue  # Skip title (line after number)

        # Add the rest of the content
        cleaned_lines.append(lines[i])

    return "\n".join(cleaned_lines)


def propagate_parent_sections(text: str) -> str:
    """
    Propagates parent sections to their respective child sections in a given text.
    This function processes a text where sections are denoted by hierarchical numbering (e.g., "4.2.", "4.2.1.", "4.2.1.1.").
    It appends the parent section titles to their respective child sections to provide context.
    Args:
        text (str): The input text containing sections and sub-sections.
    Returns:
        str: The processed text with parent sections propagated to their child sections.
    """

    lines = text.split("\n")
    cleaned_lines = []
    current_parent = ""
    current_child = ""
    parent_coming = False
    child_coming = False
    childchild_coming = False

    for line in lines:
        stripped = line.strip()

        if parent_coming:
            current_parent = stripped
            parent_coming = False

        if child_coming:
            current_child = stripped
            cleaned_lines.append(current_parent + " " + line)
            child_coming = False
        elif childchild_coming:
            cleaned_lines.append(current_parent + " " + current_child + " " + line)
            childchild_coming = False
        else:
            cleaned_lines.append(line)
        
        # Detect major section (e.g., "4.2.") but NOT "4.2.1."
        if re.match(r"^\d+\.\d+\.$", stripped):
            parent_coming = True
        
        # Detect sub-sections (e.g., "4.2.1.") but not top-level sections
        elif re.match(r"^\d+\.\d+\.\d+\.$", stripped):
            child_coming = True

            # Detect sub-sections (e.g., "4.2.1.") but not top-level sections
        elif re.match(r"^\d+\.\d+\.\d+\.\d+\.$", stripped):
            childchild_coming = True

    return "\n".join(cleaned_lines)


def remove_numbers(text):
    lines = text.split("\n")
    cleaned_lines = []

    for line in lines:
        if re.match(r"^\d+\.\d+\.$", line) or re.match(r"^\d+\.\d+\.\d+\.$", line) or re.match(r"^\d+\.\d+\.\d+\.\d+\.$", line):
            continue
        cleaned_lines.append(line)

    return "\n".join(cleaned_lines)

In [21]:
# Read the text from the website
text = read_website(website_url)

# Preprocess the text by removing unnecessary information and improving formatting
text = remove_empty_lines(text)
text = remove_cookies_info(text)
text = remove_urls(text)
text = remove_top_level_headers(text)
text = propagate_parent_sections(text)
text = remove_numbers(text)

Text in Chunks aufteilen und Embedding-Vectors berechnen

In [22]:
# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
chunks = text_splitter.split_text(text)

# Generate embeddings using OpenAI
embeddings = embedding_model.embed_documents(chunks)

# Convert to FAISS format with euclidian (L2) distance measure for similarity search
dimension = len(embeddings[0])  # Get embedding size
faiss_index = faiss.IndexFlatL2(dimension)  # L2 distance
faiss_index.add(np.array(embeddings, dtype=np.float32))

# Store chunk text with same indices as in 
chunk_metadata = {i: chunks[i] for i in range(len(chunks))}

print("Vectorization complete! Stored", len(chunks), "chunks in FAISS.")

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: your-api-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

Für Prompt relevanteste Chunks heraussuchen

In [23]:
def get_context(query, number_of_chunks_to_retrieve=5):
    query_embedding = embedding_model.embed_query(query)

    # Search for similar chunks
    _, similar_indices = faiss_index.search(np.array([query_embedding], dtype=np.float32), 
                                            number_of_chunks_to_retrieve)

    # Get the text of the similar chunks
    similar_chunks = [chunk_metadata[i] for i in similar_indices[0]]

    return similar_chunks

Modell Testen

In [24]:
def get_answer(prompt):
    answer = ""
    stream = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        stream=True, 
    )
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            answer += chunk.choices[0].delta.content

    return answer

In [25]:
# Hier eine Frage zu den Teilnahmebedingungen eingeben um das Modell zu testen
query = "Was kann ich gewinnen?"



print("Frage:\n", query)


# Get context
print("\nKontext wird gesucht...\n")

context_chunks = get_context(query, number_of_chunks_to_retrieve=5)

print("\n".join(["Kontext " + str(i + 1) + ":\n" + context for i, context in enumerate(context_chunks)]))

context = "\n".join(context_chunks) # Combine chunks into one string


# Create final prompt
print("\nAntwort wird generiert...\n")

prompt = f"""
Du bist ein KI-Assistent, der Fragen zu den Teilnahmebedingungen eines Gewinnspiels beantwortet. Zur Beantwortung der Frage hast du folgenden Kontext:

Kontext:
{context}

Die Frage des Benutzers lautet:
{query}
"""

answer = get_answer(prompt)

print("Antwort:\n", answer)


Frage:
 Was kann ich gewinnen?

Kontext wird gesucht...



AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: your-api-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}