Imports und Setup

In [23]:
import os
import re

import faiss
import numpy as np
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI

# IMPORTANT: provide your OpenAI API key through the OPENAI_API_KEY environment
# variable so that no real key is ever saved inside the notebook. The placeholder
# is only used as a fallback when the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY", "your-api-key")


# website of the check24 Tippspiel Teilnahmebedingungen (terms of participation)
website_url = "https://tippspiel.check24.de/ul/champions-league-24-25/teilnahmebedingungen"

# Embedding model used for both the document chunks and the user queries
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=api_key)
# Chat client used to generate the final answers
client = OpenAI(api_key=api_key)

In [2]:
# List the package versions installed in the current environment (output below).
! pip freeze

aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.8.0
appnope==0.1.4
asgiref==3.8.1
asttokens==3.0.0
attrs==25.1.0
backoff==2.2.1
bcrypt==4.2.1
beautifulsoup4==4.13.3
blinker==1.9.0
bs4==0.0.2
build==1.2.2.post1
cachetools==5.5.1
certifi==2025.1.31
charset-normalizer==3.4.1
chroma-hnswlib==0.7.6
chromadb==0.6.3
click==8.1.8
coloredlogs==15.0.1
comm==0.2.2
contourpy==1.3.1
cycler==0.12.1
dash==2.18.2
dash-core-components==2.0.0
dash-html-components==2.0.0
dash-table==5.0.0
dataclasses-json==0.6.7
debugpy==1.8.12
decorator==5.1.1
Deprecated==1.2.18
distro==1.9.0
durationpy==0.9
executing==2.2.0
faiss-cpu==1.10.0
fastapi==0.115.8
fastjsonschema==2.21.1
filelock==3.17.0
Flask==3.0.3
flatbuffers==25.1.24
fonttools==4.55.8
frozenlist==1.5.0
fsspec==2025.2.0
google-auth==2.38.0
googleapis-common-protos==1.66.0
grpcio==1.70.0
h11==0.14.0
httpcore==1.0.7
httptools==0.6.4
httpx==0.28.1
httpx-sse==0.4.0
huggingface-hub==0.28.1
humanfriendly==10.0
idna==3.10

Websitetext einlesen und für Embedding vorbereiten

In [24]:
def read_website(url, timeout=30):
    """
    Download a web page and return its visible text.

    Args:
        url (str): Address of the page to download.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str: The page text, with one line per text node ("\\n" separator).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    # A timeout prevents the cell from hanging forever on an unresponsive server
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently scraping (and later embedding) an error page
    response.raise_for_status()
    html_content = response.text

    soup = BeautifulSoup(html_content, "html.parser")

    # Remove page furniture that carries no information about the conditions
    for tag in soup.select("nav, footer, .c24-cookie-consent-notice, .ads, .sidebar"):
        tag.decompose()

    # Extract the text; the newline separator keeps each text node on its own line
    return soup.get_text(separator="\n")

In [25]:
def remove_cookies_info(text):
    """
    Drop the cookie banner text that precedes the actual page content.

    The banner ends with a line reading "Alle akzeptieren"; everything before
    that line is discarded. The marker line itself is kept.

    Args:
        text (str): Scraped page text, one entry per line.

    Returns:
        str: The text starting at the marker line. If the marker does not occur
        (for example because the banner was already removed from the HTML), the
        text is returned unchanged instead of being discarded entirely.
    """
    lines = text.split("\n")

    for start, line in enumerate(lines):
        # The marker is compared without surrounding whitespace
        if line.strip() == "Alle akzeptieren":
            return "\n".join(lines[start:])

    # No banner found: keep the whole document rather than returning nothing
    return text

def remove_empty_lines(text):
    """
    Remove every line that is empty or contains only whitespace.

    Args:
        text (str): Input text with "\\n" line breaks.

    Returns:
        str: The remaining lines, unchanged and in order, joined by "\\n".
    """
    return "\n".join(row for row in text.split("\n") if row.strip() != "")

def remove_urls(text):
    """
    Remove link lines and re-join the sentence that the link had split.

    A link line is a line that starts with "http" or is exactly "hier" or
    "Link". When such a line sits between two other lines, it is dropped and
    the following line is appended directly to the preceding one.

    A link on the very first or very last line has no neighbour on one side
    and is therefore left in place.

    Args:
        text (str): Input text with "\\n" line breaks.

    Returns:
        str: The text without the enclosed link lines.
    """

    def is_link(line):
        return line.startswith("http") or line in ("hier", "Link")

    lines = text.split("\n")
    last = len(lines) - 1
    cleaned_lines = []
    glue_next = False  # True while the next kept line must be appended to the previous one

    # Single pass: every line is consumed exactly once, so several links in a
    # row (or in the same sentence) can no longer duplicate text.
    for i, line in enumerate(lines):
        if is_link(line) and cleaned_lines and i < last:
            glue_next = True
            continue
        if glue_next:
            cleaned_lines[-1] += line
            glue_next = False
        else:
            cleaned_lines.append(line)

    return "\n".join(cleaned_lines)


def remove_top_level_headers(text):
    """
    Strip the page header and the top-level section headings.

    The first two lines are always dropped. After that, a line consisting only
    of digits (a top-level section number such as "3") is dropped unless it is
    the very last line, and any line directly following a digits-only line
    (the section title) is dropped as well.

    Args:
        text (str): Input text with "\\n" line breaks.

    Returns:
        str: The remaining lines joined by "\\n".
    """
    rows = text.split("\n")
    total = len(rows)
    kept = []

    # Start at index 2: the first two lines only contain headers
    for pos in range(2, total):
        is_section_number = re.match(r"^\d+$", rows[pos].strip()) and pos + 1 < total
        follows_number = re.match(r"^\d+$", rows[pos - 1].strip())

        if is_section_number or follows_number:
            continue

        kept.append(rows[pos])

    return "\n".join(kept)


def propagate_parent_sections(text: str) -> str:
    """
    Prefix sub-section texts with the text of their parent sections.

    Sections are introduced by a line that holds only a hierarchical number:
    "4.2." (parent), "4.2.1." (child) or "4.2.1.1." (grandchild). The line
    right after such a number is treated as that section's text:

    * after a parent number the line is remembered and emitted unchanged,
    * after a child number the line is remembered and emitted as
      "<parent> <line>",
    * after a grandchild number the line is emitted as
      "<parent> <child> <line>".

    The number lines themselves are passed through unchanged.

    Args:
        text (str): The input text containing sections and sub-sections.
    Returns:
        str: The text with parent context prepended to the sub-section lines.
    """
    # Numbering patterns, checked against the stripped line, with the role of
    # the line that follows a match.
    numbering = (
        (r"^\d+\.\d+\.$", "parent"),
        (r"^\d+\.\d+\.\d+\.$", "child"),
        (r"^\d+\.\d+\.\d+\.\d+\.$", "grandchild"),
    )

    parent_text = ""
    child_text = ""
    pending = None  # role of the current line, decided by the previous line
    result = []

    for raw in text.split("\n"):
        bare = raw.strip()

        if pending == "child":
            child_text = bare
            result.append(parent_text + " " + raw)
        elif pending == "grandchild":
            result.append(parent_text + " " + child_text + " " + raw)
        else:
            if pending == "parent":
                parent_text = bare
            result.append(raw)

        # Decide what the NEXT line is; at most one pattern can match
        pending = None
        for pattern, role in numbering:
            if re.match(pattern, bare):
                pending = role
                break

    return "\n".join(result)


def remove_numbers(text):
    """
    Remove lines that consist only of a hierarchical section number.

    A section number has two to four dot-terminated levels, e.g. "4.2.",
    "4.2.1." or "4.2.1.1.". Surrounding whitespace is ignored when matching,
    which is the same rule propagate_parent_sections uses to detect these
    lines, so a padded number line is no longer left behind.

    Args:
        text (str): Input text with "\\n" line breaks.

    Returns:
        str: The text without the section-number lines.
    """
    # \d+ followed by one to three ".\d+" groups and a closing dot
    section_number = re.compile(r"^\d+(?:\.\d+){1,3}\.$")

    return "\n".join(
        line for line in text.split("\n") if not section_number.match(line.strip())
    )

In [26]:
# Download the raw page text
text = read_website(website_url)

# Clean the text step by step; the order matters because later steps rely on
# the line layout produced by the earlier ones.
for cleanup_step in (
    remove_empty_lines,
    remove_cookies_info,
    remove_urls,
    remove_top_level_headers,
    propagate_parent_sections,
    remove_numbers,
):
    text = cleanup_step(text)

Text in Chunks aufteilen und Embedding-Vectors berechnen

In [27]:
# Break the cleaned text into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
chunks = text_splitter.split_text(text)

# Embed every chunk with the OpenAI embedding model
embeddings = embedding_model.embed_documents(chunks)

# Build a flat FAISS index that ranks by Euclidean (L2) distance
dimension = len(embeddings[0])  # length of a single embedding vector
embedding_matrix = np.array(embeddings, dtype=np.float32)
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embedding_matrix)

# Look-up table from a chunk's position in `chunks` (the order in which the
# vectors were added to the FAISS index) to the chunk text
chunk_metadata = dict(enumerate(chunks))

print("Vectorization complete! Stored", len(chunks), "chunks in FAISS.")

Vectorization complete! Stored 70 chunks in FAISS.


Die für den Prompt relevantesten Chunks heraussuchen

In [28]:
def get_context(query, number_of_chunks_to_retrieve=5):
    """
    Retrieve the chunks whose embeddings are closest to the query embedding.

    Args:
        query (str): The user question.
        number_of_chunks_to_retrieve (int): Maximum number of chunks to return.
            The value is capped at the number of vectors stored in the index.

    Returns:
        tuple: (similar_chunks, similar_indices) where similar_chunks is the
        list of chunk texts ordered as returned by the index search, and
        similar_indices is the index array returned by FAISS with shape (1, k).
    """
    # FAISS pads the result with -1 when asked for more neighbours than it
    # holds, which would fail in the chunk_metadata lookup below.
    k = min(number_of_chunks_to_retrieve, faiss_index.ntotal)
    if k <= 0:
        return [], np.empty((1, 0), dtype=np.int64)

    query_embedding = embedding_model.embed_query(query)

    # Search for similar chunks (distances are not needed)
    _, similar_indices = faiss_index.search(np.array([query_embedding], dtype=np.float32), k)

    # Get the text of the similar chunks
    similar_chunks = [chunk_metadata[int(i)] for i in similar_indices[0]]

    return similar_chunks, similar_indices

Modell testen

In [29]:
def get_answer(prompt):
    """
    Send the prompt to the chat model and return the complete answer text.

    The response is requested as a stream; the text fragments of all stream
    events are collected and concatenated in arrival order.

    Args:
        prompt (str): The full prompt, sent as a single user message.

    Returns:
        str: The concatenated answer.
    """
    stream = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )

    fragments = []
    for event in stream:
        fragment = event.choices[0].delta.content
        # Events without text content carry None here
        if fragment is not None:
            fragments.append(fragment)

    return "".join(fragments)

In [30]:
# Enter a question about the terms of participation here to test the model
query = "Was kann ich gewinnen?"

print("Frage:\n", query)

# Retrieve the chunks that are closest to the question
print("\nKontext wird gesucht...\n")
context_chunks, _ = get_context(query, number_of_chunks_to_retrieve=5)

# Show every retrieved chunk under a numbered heading
labelled_chunks = []
for position, chunk_text in enumerate(context_chunks):
    labelled_chunks.append("Kontext " + str(position + 1) + ":\n" + chunk_text)
print("\n".join(labelled_chunks))

# Combine the chunks into the single context string used in the prompt
context = "\n".join(context_chunks)

# Build the augmented prompt and ask the model
print("\nAntwort wird generiert...\n")

prompt = f"""
Du bist ein KI-Assistent, der Fragen zu den Teilnahmebedingungen eines Gewinnspiels beantwortet. Zur Beantwortung der Frage hast du folgenden Kontext:

Kontext:
{context}

Die Frage des Benutzers lautet:
{query}
"""

answer = get_answer(prompt)

print("Antwort:\n", answer)


Frage:
 Was kann ich gewinnen?

Kontext wird gesucht...

Kontext 1:
Gutscheine aus der Gesamtwertung und den jeweiligen Spieltagen im Wert von 240 € (in Worten zweihundertvierzig) und weniger werden passend zur Gewinneranzahl aufgestockt, damit jeder mit entsprechender Platzierung einen Gutschein erhält.
Für die zu gewinnenden Gutscheine gelten die folgenden separaten Gutscheinbedingungen der CHECK24 GmbH ().
Ein Anspruch auf Barauszahlung eines Gewinnes (nach Ziffer 4) besteht seitens des Gewinners nicht.
Kontext 2:
In der Gesamtwertung der CHECK24 Tipprunde vergibt der Veranstalter für die Nutzer mit den meisten Punkten (gem. Ziffer 2) die folgenden Gewinne:
1 x 2.400 € Reise-Guthaben
Zusätzlich gibt es mehrere vom Veranstalter definierte Spieltage. Ein Spieltag bezeichnet einen festgelegten Termin, an dem eine Runde von Spielen innerhalb des Wettbewerbs, wie in der Gruppenphase oder K.-o.-Runde, ausgetragen wird. An einem Spieltag finden mehrere Spiele gleichzeitig statt.
Kontext 3:

Visualisierung

In [31]:
from sklearn.decomposition import PCA

# Project the chunk embeddings onto two components for plotting.
# A fixed random_state keeps the projection identical between runs in case
# scikit-learn selects a randomized SVD solver for this data shape.
print("doing PCA")
pca = PCA(n_components=2, random_state=42)
pca_embeddings = pca.fit_transform(embeddings)
print("PCA done")

doing PCA
PCA done


In [38]:
def prompt_model(query):
    """
    Answer a question using the five most similar chunks as context.

    Args:
        query (str): The user question.

    Returns:
        tuple: (answer, context_indices, prompt) where answer is the model's
        reply, context_indices is the index array returned by get_context, and
        prompt is the full augmented prompt that was sent to the model.
    """
    retrieved_chunks, context_indices = get_context(query, number_of_chunks_to_retrieve=5)

    # Chunks are separated by a blank line inside the prompt
    context = "\n\n".join(retrieved_chunks)

    prompt = f"""
    Du bist ein KI-Assistent, der Fragen zu den Teilnahmebedingungen eines Gewinnspiels beantwortet. Zur Beantwortung der Frage hast du folgenden Kontext:

    Kontext:
    {context}

    Die Frage des Benutzers lautet:
    {query}
    """

    return get_answer(prompt), context_indices, prompt

In [48]:
import plotly.graph_objs as go
import dash
from dash import dcc, html
from dash.dependencies import Input, Output, State

# Shared plot layout
layout = go.Layout(
    hovermode='closest',
    xaxis={'title': {'text': 'PCA Component 1'}},
    yaxis={'title': {'text': 'PCA Component 2'}},
    showlegend=True
)


def _chunk_trace(chunk_ids, color, name):
    """
    Build a scatter trace for the chunks with the given indices.

    Each point carries its index into `chunks` as customdata, so a click can
    be mapped back to the right chunk no matter which trace the point is in.
    """
    return go.Scatter(
        x=pca_embeddings[chunk_ids, 0],
        y=pca_embeddings[chunk_ids, 1],
        mode='markers',
        marker=dict(size=12, color=color, opacity=0.7),
        text=[chunks[i] for i in chunk_ids],
        customdata=chunk_ids,
        hoverinfo='none',
        name=name
    )


# Initial trace shown before any query is submitted: all chunks, unhighlighted.
# Defined here so the cell also works on a freshly restarted kernel.
trace = _chunk_trace(list(range(len(chunks))), 'blue', 'Info-Chunks')

# Create the Dash app
app = dash.Dash(__name__)

app.layout = html.Div([
    html.H2("PCA of Text Chunks", style={"text-align": "center"}),
    
    # Text Input for query
    html.Div([
        dcc.Input(id='query-input', type='text', placeholder='Enter your query here', style={'width': '80%', 'border': '1px solid black'}),
        html.Button('Submit', id='submit-button', n_clicks=0)
    ], style={'text-align': 'center', 'padding-top': '20px'}),
    
    # Model Answer
    html.Div([
        html.H4("Model Answer"),
        html.Div(id='model-answer', style={'white-space': 'pre-line', 'padding': '10px'})
    ], style={'padding-top': '20px'}),
    
    # Plot and Text of Selected Chunk
    html.Div([
        html.Div([
            dcc.Graph(
                id='scatter-plot',
                figure={'data': [trace], 'layout': layout},
                config={'displayModeBar': False}  # Turn off the toolbar
            ),
            html.Div([
                html.H4("Text of Selected Chunk"),
                html.Div(id='chunk-text', style={'white-space': 'pre-line', 'padding': '10px'})
            ], style={'padding-top': '20px'})
        ], style={'width': '60%', 'display': 'inline-block', 'vertical-align': 'top', 'padding-top': '5px'}),
        
        # Augmented Prompt
        html.Div([
            html.H4("Augmented Prompt"),
            html.Div(id='augmented-prompt', style={'white-space': 'pre-line', 'padding': '10px', 'font-size': 'small'})
        ], style={'width': '35%', 'display': 'inline-block', 'vertical-align': 'top', 'padding-left': '20px'})
    ], style={'padding-top': '5px', 'text-align': 'left'})  # Reduced padding-top here
])

# Define callback to update the text box when a point is clicked
@app.callback(
    Output('chunk-text', 'children'),
    [Input('scatter-plot', 'clickData')]
)
def display_chunk_text(clickData):
    """Return the text of the clicked chunk, or a hint if no chunk was clicked."""
    if clickData is None:
        return "Click on a point to see the chunk text."
    
    # The position of a point inside its trace ('pointIndex') is not the chunk
    # index once the chunks are split over several traces, so the chunk index
    # is read from the point's customdata instead.
    chunk_id = clickData['points'][0].get('customdata')
    
    # Points without customdata (the query marker) do not belong to a chunk
    if chunk_id is None:
        return "Click on a point to see the chunk text."
    
    # Return the text of the corresponding chunk
    return chunks[int(chunk_id)]

# Define callback to update the plot with the query embedding, model answer, and augmented prompt
@app.callback(
    [Output('scatter-plot', 'figure'), Output('model-answer', 'children'), Output('augmented-prompt', 'children')],
    [Input('submit-button', 'n_clicks')],
    [State('query-input', 'value')]
)
def update_plot(n_clicks, query):
    """
    Answer the submitted query and highlight the retrieved chunks in the plot.

    Returns the figure, the model answer and the augmented prompt. Before a
    query is submitted, the initial figure and two empty strings are returned.
    """
    if n_clicks > 0 and query:
        # Get the model answer, context indices, and augmented prompt
        model_answer, context_indices, augmented_prompt = prompt_model(query)
        
        # Create a new trace for the query embedding, projected with the PCA
        # that was fitted on the chunk embeddings
        query_embedding = embedding_model.embed_query(query)
        query_pca = pca.transform([query_embedding])[0]
        query_trace = go.Scatter(
            x=[query_pca[0]],
            y=[query_pca[1]],
            mode='markers',
            marker=dict(size=12, color='red', opacity=0.7),
            text=[query],  # Add the query text to display on hover
            hoverinfo='text',  # Enable hover info
            name='Query'  # Add legend name
        )
        
        # Separate the chunks into relevant (retrieved) and non-relevant
        retrieved_ids = set(np.asarray(context_indices).ravel().tolist())
        relevant_chunks = [i for i in range(len(chunks)) if i in retrieved_ids]
        non_relevant_chunks = [i for i in range(len(chunks)) if i not in retrieved_ids]
        
        # Create traces for relevant and non-relevant chunks
        relevant_trace = _chunk_trace(relevant_chunks, 'orange', 'Relevant Chunks')
        non_relevant_trace = _chunk_trace(non_relevant_chunks, 'blue', 'Info-Chunks')
        
        # Update the figure with the new traces
        figure = {
            'data': [non_relevant_trace, relevant_trace, query_trace],
            'layout': layout
        }
        
        return figure, model_answer, augmented_prompt
    
    # Return the original figure, empty model answer, and empty augmented prompt if no query is submitted
    return {'data': [trace], 'layout': layout}, "", ""

# Run the Dash app (app.run replaces the deprecated app.run_server)
if __name__ == '__main__':
    app.run(debug=True)


In [34]:
! pip install dash

