In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def is_valid_url(url):
    """Return True when ``url`` has an ``http`` or ``https`` scheme, else False."""
    scheme = urlparse(url).scheme
    return scheme == "http" or scheme == "https"

def crawl(url, depth, max_depth=5, visited=None):
    """Recursively collect the text of ``url`` and of the pages it links to.

    Args:
        url: Absolute URL of the page to fetch. Any ``#fragment`` is ignored.
        depth: Current recursion depth; pass 0 for the starting page.
        max_depth: Pages whose depth exceeds this value are not fetched.
        visited: Set of URLs that have already been fetched. It is mutated in
            place and shared with recursive calls. When omitted, a fresh set
            is created so independent top-level calls do not share state.

    Returns:
        A list with one text string per successfully fetched page, in
        depth-first order. Pages that fail to download or parse are reported
        with ``print`` and contribute nothing.
    """
    # A mutable default (``visited=set()``) would be shared by every call and
    # make a re-run of the crawl return nothing; create the set per call.
    if visited is None:
        visited = set()

    # "page.html#a" and "page.html#b" are the same document; strip the
    # fragment so it is fetched and stored only once.
    url = url.split("#", 1)[0]

    if depth > max_depth or url in visited:
        return []

    visited.add(url)
    try:
        # requests never times out by default; bound the wait so a single
        # unresponsive host cannot stall the whole crawl.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
    except Exception as e:
        # Best-effort crawl: report the failure and skip this page.
        print(f"Failed to fetch {url}: {e}")
        return []

    data = [soup.get_text()]
    for link in soup.find_all('a', href=True):
        # Resolve relative hrefs against the page they were found on.
        next_url = urljoin(url, link['href'])
        if is_valid_url(next_url):
            data.extend(crawl(next_url, depth + 1, max_depth, visited))

    return data

# Kick off the crawl at the CUDA documentation landing page (depth 0)
data = crawl("https://docs.nvidia.com/cuda/", depth=0)

# Show at most the first five collected page texts as a sanity check
preview = data[:5]
for i, text in enumerate(preview):
    print(f"Data {i+1}:\n{text}\n")


In [None]:
from sentence_transformers import SentenceTransformer
import pymilvus

# Load the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Connect to MILVUS
from pymilvus import connections, utility, Collection
connections.connect("default", host='localhost', port='19530')

# Define the collection schema
from pymilvus import FieldSchema, CollectionSchema, DataType

fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    # dim must equal the size of the vectors produced by `model`
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
    FieldSchema(name="metadata", dtype=DataType.JSON)
]
schema = CollectionSchema(fields)
collection = Collection("web_data", schema)

# Process data and insert into MILVUS.
# Every crawled page becomes one row: a unique integer id, one embedding
# vector, and a JSON document holding the page text.
texts = list(data)
if texts:
    # Encoding the whole list in one call returns one vector per text;
    # encoding a single string would return a bare vector whose length is the
    # embedding dimension, not a row count.
    embeddings = model.encode(texts)
    ids = list(range(len(texts)))
    # NOTE(review): Milvus limits the size of a JSON field; very long pages
    # may need to be truncated or split before insertion - confirm.
    metadata = [{"content": text} for text in texts]
    # Column-ordered insert: one list per schema field, in schema order.
    collection.insert([ids, embeddings.tolist(), metadata])


In [None]:
from elasticsearch import Elasticsearch
from sentence_transformers import CrossEncoder

# Initialize Elasticsearch. The node URL is passed explicitly because recent
# client versions refuse to build a client without hosts or a cloud id.
# NOTE(review): assumes a local, unsecured node on the default port - confirm.
es = Elasticsearch("http://localhost:9200")

# Query expansion and retrieval
def query_expansion(query):
    """Return the list of query strings to search for.

    Placeholder implementation: no expansion is performed yet, so the result
    is a one-element list holding the original ``query``.
    """
    # TODO: implement query expansion logic
    expanded = [query]
    return expanded

# The original question is kept in its own variable: it is needed both for
# expansion and, later, for scoring each hit against it.
query = "Your query here"
queries = query_expansion(query)

# A `match` clause takes a single query string, so every expanded query gets
# its own clause and the clauses are OR-ed together.
results = es.search(
    index="web_data",
    body={
        "query": {
            "bool": {
                "should": [{"match": {"content": q}} for q in queries],
                "minimum_should_match": 1,
            }
        }
    },
)
hits = results['hits']['hits']

# Re-rank using a cross-encoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
pairs = [(query, hit['_source']['content']) for hit in hits]
# Skip scoring when the search returned nothing.
scores = cross_encoder.predict(pairs) if pairs else []
# List of (hit, score) tuples, best score first.
re_ranked_results = sorted(zip(hits, scores), key=lambda x: x[1], reverse=True)


In [None]:
from transformers import pipeline

# Question-answering pipeline built from the deepset/roberta-base-squad2 checkpoint
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")


def answer_question(question, context):
    """Run ``qa_pipeline`` on ``question`` against ``context`` and return its result."""
    qa_inputs = {"question": question, "context": context}
    return qa_pipeline(**qa_inputs)

# Use the highest-ranked context for answering. The ranking is empty when the
# search matched nothing, so guard the [0] lookup instead of raising IndexError.
if re_ranked_results:
    best_hit, _best_score = re_ranked_results[0]
    answer = answer_question("Your question here", best_hit['_source']['content'])
else:
    answer = None
    print("No documents were retrieved, so there is no context to answer from.")
print(answer)


In [None]:
import streamlit as st

st.title("Question Answering System")
user_query = st.text_input("Enter your query:")
if st.button("Get Answer"):
    question = user_query.strip()
    if not question:
        st.warning("Please enter a query first.")
    else:
        # Retrieve context for the question the user actually typed; reusing
        # the notebook-level ranking would answer every question from the
        # documents found for the placeholder query.
        search_results = es.search(
            index="web_data",
            body={
                "query": {
                    "bool": {
                        "should": [
                            {"match": {"content": q}}
                            for q in query_expansion(question)
                        ],
                        "minimum_should_match": 1,
                    }
                }
            },
        )
        candidates = search_results['hits']['hits']
        if not candidates:
            st.warning("No matching documents were found.")
        else:
            # Score every candidate against the question and keep the best one.
            pair_scores = cross_encoder.predict(
                [(question, doc['_source']['content']) for doc in candidates]
            )
            best_doc, _best_score = max(
                zip(candidates, pair_scores), key=lambda pair: pair[1]
            )
            answer = answer_question(question, best_doc['_source']['content'])
            st.write(answer)
