In [4]:
from data_loader import DataLoader
from data_retriever import DataRetriever
from response_generator import ResponseGenerator

import pinecone
from elasticsearch import Elasticsearch, helpers
from pinecone import Pinecone, ServerlessSpec

# load environment variables
import os
from dotenv import load_dotenv

load_dotenv()

True

In [None]:
# read dspy-docs.txt file as text
with open('dspy-docs.txt', 'r') as file:
    text = file.read()
    

# print the length of the words in the text
print(len(text.split()))

In [None]:
# Initialize components
# es_index_name = 'test-rag-index'
es_index_name = 'test-rag-index-meta'
es_index_for_pinecone_meta = 'test-rag-index-meta'
pinecone_index_name = 'test-rag-index'
pinecone_api_key = os.getenv('PINECONE_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')

In [None]:
data_loader = DataLoader(es_index_name, es_index_for_pinecone_meta, pinecone_index_name, pinecone_api_key)

In [None]:
data_loader.delete_indexes()

In [None]:
data_loader.create_indexes()

In [None]:
# data_loader.save_embeddings_and_documents("New Paris is in Lahore")
data_loader.save_embeddings_and_documents(text)

In [None]:
data_retriever = DataRetriever(es_index_name, pinecone_index_name, pinecone_api_key)

In [None]:
query = """
write me a sample dspy code
"""

In [None]:
context = data_retriever.blended_retrieval(query)

In [None]:
from response_generator import ResponseGenerator

In [None]:
response_generator = ResponseGenerator(openai_api_key)

In [None]:
print(response_generator.generate_response(query, context))

In [None]:
text = "Here's a simple example in DSPy that demonstrates how to parse leads from raw text. In this example, we assume that \"leads\" are lines in the text that start with a specific keyword (e.g., \"Lead:\"). We will create a function to extract these lines.\n\n```python\nimport dspy\n\nclass ParseLeads(dspy.Signature):\n    \"\"\"Extracts leads from raw text.\"\"\"\n    raw_text = dspy.InputField(desc=\"Raw text input containing leads\")\n    leads = dspy.OutputField(desc=\"List of extracted leads\")\n\ndef extract_leads(raw_text):\n    \"\"\"Function to parse leads from raw text.\"\"\"\n    # Split the text into lines and filter lines that start with 'Lead:'\n    lines = raw_text.split('\\n')\n    leads = [line for line in lines if line.startswith('Lead:')]\n    return leads\n\n# Create a DSPy program for lead extraction\nlead_parser_program = dspy.ProgramOfThought(ParseLeads)\n\n# Example raw text containing leads\nraw_text_example = \"\"\"Lead: Customer interested in product A\nNot a lead: Follow up with marketing\nLead: Prospective client for project B\nLead: Request for proposal from client C\"\"\"\n\n# Running the lead parser program\nresult = lead_parser_program(raw_text=raw_text_example)\n\n# Output the extracted leads\nprint(\"Extracted Leads:\")\nfor lead in result.leads:\n    print(lead)\n```\n\nIn this example, `ParseLeads` defines the input and output structure. The `extract_leads` function extracts lines starting with \"Lead:\" from the input text. Finally, we run the program with a sample raw text, and it outputs the extracted leads."

In [None]:
print(text)

## Crawler

In [2]:
import sys

import requests
import traceback
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# define depth limit
depth_limit = 1

def crawl(url, depth, visited_urls=set()):
    # Check if the depth limit has been reached
    if depth > depth_limit:
        print("Depth limit reached")
        return ""

    # Check if the URL has already been visited
    if url in visited_urls:
        print("URL already visited")
        return ""

    # Add the URL to the set of visited URLs
    visited_urls.add(url)

    # ignore social media links
    if any(domain in url for domain in ["twitter.com", "facebook.com", "linkedin.com"]):
        print("Ignoring social media link")
        return ""

    try:
        # Make a request to the website
        r = requests.get(url)
        r.raise_for_status()
        r_html = r.text

        # Create a BeautifulSoup object and specify the parser
        soup = BeautifulSoup(r_html, "html.parser")

        # Find all the text on the page
        text = soup.find_all(text=True)

        # Remove unnecessary whitespace
        output = ""
        blacklist = [
            "[document]",
            "noscript",
            "header",
            "html",
            "meta",
            "head",
            "input",
            "script",
            "style",
        ]

        for t in text:
            if t.parent.name not in blacklist:
                output += "{} ".format(t)

        # Find all the links on the page
        links = [a["href"] for a in soup.find_all("a", href=True)]

        # Recursively crawl the linked pages
        for link in links:
            absolute_url = urljoin(url, link)
            output += crawl(absolute_url, depth + 1, visited_urls)

        return output
    except Exception as e:
        print(f"ERROR while parsing URL: {url} - {e}")
        print("Line NO:", sys.exc_info()[-1].tb_lineno)
        return ""

def parse_url_and_get_text(url):
    try:
        # Initialize a set to store the visited URLs
        visited_urls = set()
        raw_text = crawl(url, 0, visited_urls)

        # clean the output
        text = " ".join(raw_text.split())
        text = text.replace("\n", " ")

        return text
    except Exception as e:
        print(f"ERROR while parsing URL: {url} - {e}")  
        # print line number of error
        print("Line NO:", sys.exc_info()[-1].tb_lineno)
        return ""



In [3]:
# Example usage
url = "https://aliirz.substack.com/p/all-things-dev-8"
text = parse_url_and_get_text(url)

print("Length of text:", len(text.split()))


  text = soup.find_all(text=True)


Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
URL already visited
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
ERROR while parsing URL: javascript:void(0) - No connection adapters were found for 'javascript:void(0)'
Line NO: 32
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Depth limit reached
Dep

In [None]:
r = requests.get(url)

In [None]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.5; rv:128.0) Gecko/20100101 Firefox/128.0'
}

r = requests.get(url, headers=headers)

In [None]:
print(r.text)

In [None]:
print(r.status_code)

## Creating indexes if not present already

In [6]:

pinecone_api_key = os.getenv('PINECONE_API_KEY')
elastic_search_url = os.getenv('ELASTICSEARCH_URL')


In [7]:
pc = Pinecone(api_key=pinecone_api_key, environment="us-west1-gcp")

es = Elasticsearch(elastic_search_url, timeout=30, max_retries=10)

pc.create_index(
    name="blended-rag-pinecone-index-baygata",
    dimension=1536,
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)


es.indices.create(index="blended-rag-es-index-baygata", ignore=400)

  es = Elasticsearch(elastic_search_url, timeout=30, max_retries=10)
