# Web Scraping

In [None]:
import requests
from bs4 import BeautifulSoup

# Root of the help section: the crawl starts here, and scrape_page() compares
# every discovered link against it.
base_url = "https://www.telekom.de/hilfe"

# Crawl state shared with scrape_page(): the queue of pending URLs (seeded
# with the root) and the set of URLs that have already been scraped.
to_visit_urls, visited_urls = [base_url], set()

# Function to scrape a single page
def scrape_page(url):
    """Fetch one page, queue its in-scope links, and return its text.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The text content of the page as produced by ``soup.get_text()``.

    Raises:
        requests.RequestException: If the request fails or times out. HTTP
            error statuses are not checked here, so the body of an error
            page is returned like that of any other page.

    Side effects:
        Appends each newly discovered link that starts with ``base_url`` to
        the module-level ``to_visit_urls`` queue.
    """
    # Imported locally so this function carries its own dependency.
    from urllib.parse import urldefrag, urljoin

    # Without a timeout a single stalled connection would block the crawl
    # indefinitely.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract content, links, etc.
    page_content = soup.get_text()  # Or any specific part of the page

    # Seeded with the page itself so that self-links are never re-queued.
    seen_on_page = {url, response.url}
    for anchor in soup.find_all('a', href=True):
        # A relative href is relative to the page it appears on (after any
        # redirect), not to the crawl root.
        absolute = urljoin(response.url, anchor['href'])
        # Drop "#fragment" so in-page anchors are not treated as new pages.
        full_url = urldefrag(absolute)[0]
        if full_url in seen_on_page:
            continue
        seen_on_page.add(full_url)
        # Prefix match rather than substring match: an off-site URL that
        # merely contains the root (e.g. in a query string) is out of scope.
        if full_url not in visited_urls and full_url.startswith(base_url):
            to_visit_urls.append(full_url)

    return page_content

# Breadth-first crawl: keep scraping until the queue of pending URLs is
# empty. scrape_page() feeds newly discovered links back into to_visit_urls.
all_pages_content = []
while to_visit_urls:
    url = to_visit_urls.pop(0)
    if url in visited_urls:
        continue

    # Mark the URL before fetching it, so a link from the page back to itself
    # is not queued again and a failing URL is not fetched a second time.
    visited_urls.add(url)
    try:
        content = scrape_page(url)
    except requests.RequestException as exc:
        # One unreachable page must not discard everything collected so far.
        print(f"Skipping {url}: {exc}")
        continue
    all_pages_content.append(content)

# Text Embeddings

In [None]:
import openai
import os
import chromadb

from dotenv import load_dotenv
from chromadb.utils import embedding_functions

# Load the .env file so the API keys it defines can be read from the environment.
load_dotenv()

openai.api_key = os.environ.get("OPENAI_API_KEY")

def get_embedding(text):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The input passed to ``openai.Embedding.create``.

    Returns:
        The ``'embedding'`` entry of the first item in the response's
        ``'data'`` list.
    """
    api_result = openai.Embedding.create(
        model="text-embedding-ada-002",  # Model is subject to change
        input=text,
    )
    first_item = api_result['data'][0]
    return first_item['embedding']

# Embed every scraped page; embeddings[i] belongs to all_pages_content[i].
embeddings = [get_embedding(content) for content in all_pages_content]


# Initialize ChromaDB client
client = chromadb.Client()

# get_or_create_collection() instead of create_collection(): re-running this
# cell must not fail just because the collection already exists.
collection = client.get_or_create_collection("telekom_help")

# Store each page together with its vector, keyed by its crawl index.
# upsert() overwrites an existing id, so a re-run refreshes the stored pages.
for i, (embedding, document) in enumerate(zip(embeddings, all_pages_content)):
    collection.upsert(
        embeddings=[embedding],
        documents=[document],
        ids=[str(i)],
    )

# Langchain

In [None]:
# NOTE: written against the langchain 0.0.x API, which matches the pre-1.0
# openai client (openai.Embedding) used elsewhere in this file.
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Wrap the existing Chroma collection as a LangChain vector store. Queries
# must be embedded with the same model that produced the stored vectors,
# otherwise similarity search compares unrelated vector spaces.
vector_store = Chroma(
    client=client,
    collection_name=collection.name,
    embedding_function=OpenAIEmbeddings(
        model="text-embedding-ada-002",
        openai_api_key=openai.api_key,
    ),
)

# gpt-4 is a chat model, so it needs the chat wrapper rather than the
# completion-style OpenAI class.
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=openai.api_key)

# RetrievalQA is built through its factory and takes a retriever, not the
# vector store itself.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vector_store.as_retriever(),
)

# Example query
query = "Telekom internet tarifeleri hakkında bilgi ver"
result = qa_chain.run(query)
print(result)


# Streamlit

In [None]:
import streamlit as st

st.title("Telekom.de Yardım Sorgu Arayüzü")

# Surrounding whitespace is dropped so that a blank entry (spaces only) is
# not sent to the QA chain as if it were a question.
query = st.text_input("Sorgu Girin:").strip()

# Only run the retrieval chain once there is an actual question to answer.
if query:
    result = qa_chain.run(query)
    st.write(result)
