In [1]:
import sys
sys.path.append('/Users/cohlem/Projects/OkProfessor/')

In [58]:
import asyncio
import time

from researcher.config import Config
from researcher.search.duckduckgo import Duckduckgo
from researcher.utils.functions import * 
from researcher.retriever.langchain_hybrid_retriever import HybridRetriever
from researcher.scraping.scrape import Scraper
from researcher.context.chunking import Chunking


class Researcher:
    """Web-research pipeline for a single user query.

    The query is expanded into sub-queries; for each of them the web is
    searched, pages that were not visited before are scraped, the scraped text
    is chunked and the chunks returned by the hybrid retriever are kept.
    """

    def __init__(self, query ):
        self.query = query
        self.cfg = Config()
        self.agent = None            # filled in by run() from choose_agent()
        self.role = None             # filled in by run() from choose_agent()
        self.visited_urls = set()    # every URL already handed to the scraper
        self.context = []            # one retriever result per processed query

    async def run(self):
        """
        Run the researcher.

        Chooses an agent/role for the query, asks for sub-queries (the original
        query is appended as the last one) and processes them one after another.

        Returns:
            ``self.context`` -- a list holding one retrieval result per
            processed query. Results accumulate across calls because the list
            is never reset.
        """
        print(f'📘 Starting research for query: {self.query}')
        self.agent, self.role = await choose_agent(self.query, self.cfg.llm )
        print(f'Running {self.agent} ...')

        # query modification
        sub_queries = await get_sub_queries(self.query, self.role, self.cfg) + [self.query]

        # Queries are handled strictly in order, so the "already visited" URL
        # filter sees the URLs of all earlier queries.
        for each_query in sub_queries:
            await self.process_query(each_query)

        return self.context

    async def process_query(self, query):
        """Search, scrape and retrieve the context for one query.

        The retrieved context is appended to ``self.context`` and also
        returned, so callers that collect results themselves get the value
        instead of ``None``.
        """
        print(f'🔍 Searching web with query: {query}')
        content = await self.get_content_using_query(query)
        context = await self.get_similar_context(query, content)

        self.context.append(context)
        return context

    async def get_content_using_query(self,query):
        """Search for ``query`` and scrape the result pages not visited before.

        Returns:
            Whatever ``Scraper.run()`` returns for the new URLs. Note that the
            scraper call is synchronous here (it is not awaited).
        """
        search_engine = Duckduckgo(query = query)
        search_results = search_engine.search(max_results = self.cfg.max_search_results_per_query)

        # `.get('href')` yields None for a result without a link; such entries
        # cannot be scraped, so they are dropped instead of being passed on.
        search_urls = [
            href
            for href in (result.get('href') for result in search_results)
            if href
        ]

        new_search_urls = await self.get_unique_urls(search_urls) #filter out the same urls

        content_scraper = Scraper(new_search_urls)
        return content_scraper.run()

    async def get_chunks(self, content):
        """Split every scraped page into chunks tagged with the page URL.

        Args:
            content: iterable of mappings with ``'raw_content'`` and ``'url'`` keys.

        Returns:
            A flat list with the chunks of all pages, in page order.
        """
        chunking = Chunking(self.cfg.chunk_size ,self.cfg.chunk_overlap)

        chunks = []
        for each_content in content:
            chunks += chunking.run(content=each_content['raw_content'], metadatas= {'url': each_content['url'] })

        return chunks

    async def get_unique_urls(self, urls):
        """Return the URLs not seen so far and remember them as visited.

        Order is preserved; a URL repeated inside ``urls`` is returned once.
        """
        new_urls = []
        for url in urls:
            if url not in self.visited_urls:

                print(f'✅ Adding url {url} to our research')

                new_urls.append(url)
                self.visited_urls.add(url)

        return new_urls

    async def get_similar_context(self, query, content):
        """Return the chunks of ``content`` that the hybrid retriever selects for ``query``.

        Returns an empty list when ``content`` produces no chunks (for example
        when every search hit had been visited already), because there is
        nothing to build a retriever from.
        """
        chunks = await self.get_chunks(content)
        if not chunks:
            return []

        hybrid_retriever = HybridRetriever(chunks ,max_results = self.cfg.max_chunks_per_query)
        return hybrid_retriever.get_context(query)

In [59]:
parallel_r = Researcher('Investment strategies and buying rental properties')

In [57]:
start_time = time.time()
parallel_ans = await parallel_r.run()
end_time = time.time()

print('time taken by the async parallel method is', end_time - start_time)

📘 Starting research for query: Investment strategies and buying rental properties
Running 💰 Finance Agent ...
🔍 Searching web with query: Best investment strategies for rental properties
✅ Adding url https://www.forbes.com/advisor/investing/rental-property-real-estate-investing/ to our research
✅ Adding url https://smartasset.com/investing/investing-in-rental-property-for-beginners to our research
✅ Adding url https://www.investopedia.com/articles/investing/090815/buying-your-first-investment-property-top-10-tips.asp to our research
✅ Adding url https://www.mashvisor.com/blog/rental-property-investment-strategy-definitive-guide/ to our research
✅ Adding url https://www.moneygeek.com/mortgage/resources/rental-property-investing/ to our research
🔍 Searching web with query: Pros and cons of buying rental properties as an investment
✅ Adding url https://learn.roofstock.com/blog/pros-and-cons-of-owning-rental-property to our research
✅ Adding url https://www.investopedia.com/articles/invest

In [60]:
r = Researcher('Investment strategies and buying rental properties')

In [61]:
x = await r.run()

📘 Starting research for query: Investment strategies and buying rental properties
Running 💰 Finance Agent ...
🔍 Searching web with query: Best investment strategies for rental properties
✅ Adding url https://www.forbes.com/advisor/investing/rental-property-real-estate-investing/ to our research
✅ Adding url https://smartasset.com/investing/investing-in-rental-property-for-beginners to our research
✅ Adding url https://www.investopedia.com/articles/investing/090815/buying-your-first-investment-property-top-10-tips.asp to our research
✅ Adding url https://www.mashvisor.com/blog/rental-property-investment-strategy-definitive-guide/ to our research
✅ Adding url https://www.moneygeek.com/mortgage/resources/rental-property-investing/ to our research
🔍 Searching web with query: Pros and cons of buying rental properties as an investment
✅ Adding url https://learn.roofstock.com/blog/pros-and-cons-of-owning-rental-property to our research
✅ Adding url https://www.investopedia.com/articles/invest

In [38]:
x

[[Document(page_content="Rental Properties: Pros and Cons\nRental Properties: Pros and Cons\nPete Rathburn is a copy editor and fact-checker with expertise in economics and personal finance and over twenty years of experience in the classroom.\nOwning a rental property can be financially rewarding. If you're exploring this type of real estate as an investment, be aware of the risks and responsibilities.\nRental Properties: An Overview\nThe idea of buying a home or apartment to rent out for profit may sound alluring. But buying a rental property for income and long-term capital appreciation can have its ups and downs. For example, the housing market can fluctuate depending on location, supply and demand, and the economy.", metadata={'url': 'https://www.investopedia.com/articles/investing/051515/pros-cons-owning-rental-property.asp'}),
  Document(page_content='The Pros and Cons of Investing in Rental Properties\nLike any investment, it’s important that you understand the benefits and lim

In [37]:
len(x[4])

IndexError: list index out of range

In [19]:
something = await r.get_content_using_query('Pros and cons of investing in rental properties')

✅ Adding url https://www.sofi.com/learn/content/the-pros-and-cons-of-owning-rental-property/ to our research


In [20]:
something

[{'url': 'https://www.sofi.com/learn/content/the-pros-and-cons-of-owning-rental-property/',
  'raw_content': 'We need to confirm you’re human.\nThanks for helping us keep the bots away. Check the box and let’s get you where you’re going.\nChecking your browser before accessing "www.sofi.com".\nRedirecting...\nPlease turn JavaScript on and reload the page.\nPlease enable Cookies and reload the page.\nRequest ID: 8301ed34ab939e82\nIP: 27.34.65.52\n© 2022 Social Finance, Inc.'}]

In [23]:
response

['Real estate market trends November 2023',
 'Property value forecast 2024',
 'Rental income potential in [specific location]',
 'investment in real estate and buying properties']

In [10]:
from researcher.search.duckduckgo import Duckduckgo

In [139]:
search_engine = Duckduckgo('investment in real estate and buying properties')


In [140]:
search_urls = list(search_engine.search(max_results=r.cfg.max_search_results_per_query))

In [141]:
search_urls = [url['href'] for url in search_urls]

In [142]:
search_urls

['https://www.businessinsider.com/how-to-find-good-deal-property-buy-real-estate-investing-2023-11?op=1',
 'https://www.investopedia.com/mortgage/real-estate-investing-guide/',
 'https://smartasset.com/investing/how-to-buy-your-first-investment-property',
 'https://time.com/personal-finance/article/how-to-invest-in-real-estate/',
 'https://money.usnews.com/investing/real-estate-investments/articles/rules-for-buy-and-hold-real-estate-investing']

In [27]:
response

['Real estate market trends November 2023',
 'Property value forecast 2024',
 'Rental income potential in [specific location]',
 'investment in real estate and buying properties']

In [56]:
from researcher.scraping.scrape import Scraper

In [None]:
def get_unique_urls(urls, visited_urls=None):
    """Return the URLs from ``urls`` that have not been seen before, in order.

    Args:
        urls: iterable of URL strings.
        visited_urls: optional set of URLs that count as already seen. It is
            updated in place with every URL that is returned. When omitted, a
            fresh set is used, so only duplicates inside ``urls`` are removed.

    Returns:
        A list with the first occurrence of every not-yet-seen URL.
    """
    if visited_urls is None:
        visited_urls = set()

    new_urls = []
    for url in urls:
        if url not in visited_urls:
            new_urls.append(url)
            visited_urls.add(url)

    return new_urls

In [150]:
async def get_content_using_query(query, max_results=None):
    """Search the web for ``query`` and scrape the pages that were found.

    Args:
        query: search string passed to ``Duckduckgo``.
        max_results: maximum number of search results to request. When left
            as ``None`` the limit is read from the notebook-level researcher
            ``r`` (``r.cfg.max_search_results_per_query``), as before.

    Returns:
        Whatever ``Scraper.run()`` returns for the result URLs; the recorded
        outputs in this notebook show a list of dicts with ``'url'`` and
        ``'raw_content'`` keys.
    """
    if max_results is None:
        # Only touch the global `r` when the caller did not supply a limit.
        max_results = r.cfg.max_search_results_per_query

    search_engine = Duckduckgo(query = query)
    search_results = search_engine.search(max_results = max_results)

    # A result without an 'href' gives None, which cannot be scraped -- skip it.
    search_urls = [
        href
        for href in (result.get('href') for result in search_results)
        if href
    ]

    scraper = Scraper(search_urls)
    return scraper.run()
    
    

In [206]:
ans = await get_content_using_query('Property value forecast 2024')

In [208]:
ans

[{'url': 'https://www.realtor.com/research/2024-national-housing-forecast/',
  'raw_content': 'Articles\n2024 Housing Market Forecast and Predictions: Housing Affordability Finally Begins to Turnaround\nAs we look ahead to 2024, we see a mix of continuity and change in both the housing market and economy. Against a backdrop of modest economic growth, slightly higher unemployment, and easing inflation longer term interest rates including mortgage rates begin a slow retreat. The shift from climbing to falling mortgage rates improves housing affordability, but saps some of the urgency home shoppers had previously sensed. Less frenzied housing demand and plenty of rental home options keep home sales relatively stable at low levels in 2024, helping home prices to adjust slightly lower even as the number of for-sale homes continues to dwindle.\nRealtor.com® 2024 Forecast for Key Housing Indicators\nHome Prices Dip, Improving Affordability\nHome prices grew at a double-digit annual clip for t

In [53]:


for each_query in response:
    
    content = await get_content_using_query(each_query)
    
    relevant_context = 
    self.context.append() #all relevant context
    
    

In [41]:
len(results)

5

In [25]:
search_urls

{'https://smartasset.com/investing/how-to-buy-your-first-investment-property',
 'https://time.com/personal-finance/article/how-to-invest-in-real-estate/',
 'https://www.businessinsider.com/how-to-find-good-deal-property-buy-real-estate-investing-2023-11?op=1',
 'https://www.forbes.com/sites/forbesbusinesscouncil/2023/10/30/exploring-the-pros-and-cons-of-real-estate-investment/',
 'https://www.investopedia.com/mortgage/real-estate-investing-guide/'}

In [29]:
gg = {'a': 1, 'b': 2}

In [32]:
ans = gg.get('c')

## LangChain's hybrid search

In [202]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

In [134]:

# initialize the bm25 retriever and faiss retriever
# NOTE(review): the recorded run of this cell failed with
# "AttributeError: 'Document' object has no attribute 'split'", so `context`
# presumably held Document objects rather than plain strings at that point --
# confirm what `context` is before re-running.
bm25_retriever = BM25Retriever.from_texts(context)
bm25_retriever.k = 2

embedding = OpenAIEmbeddings()
faiss_vectorstore = FAISS.from_texts(context, embedding) 

AttributeError: 'Document' object has no attribute 'split'

In [103]:


# initialize the bm25 retriever and faiss retriever
# `context` is defined outside this cell; it is passed to `from_texts`, so it is
# presumably a list of plain strings here -- TODO confirm.
bm25_retriever = BM25Retriever.from_texts(context)
bm25_retriever.k = 2

embedding = OpenAIEmbeddings()
faiss_vectorstore = FAISS.from_texts(context, embedding)


# k=2 for the vector side as well; the recorded FAISS-only query below returns 2 documents.
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 2})



# initialize the ensemble retriever
# Both retrievers get the same weight (0.5 / 0.5).
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5]
)

In [107]:
docs = ensemble_retriever.get_relevant_documents("i very much love apples")
docs

[Document(page_content='I like apples'),
 Document(page_content='I like oranges'),
 Document(page_content='Apples and oranges are fruits')]

In [106]:
faiss_retriever.get_relevant_documents('i very much love apples')

[Document(page_content='I like apples'),
 Document(page_content='I like oranges')]

In [166]:
import os
from enum import Enum
from typing import Any, Dict, List, Optional

from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from langchain.schema import Document
from langchain.schema.retriever import BaseRetriever


class SearchAPIRetriever(BaseRetriever):
    """Retriever that exposes already-fetched pages as ``Document`` objects."""
    pages: List[Dict] = []

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Wrap every entry of ``self.pages`` in a ``Document``.

        ``query`` and ``run_manager`` are accepted but not used: every page is
        returned, in the order stored in ``self.pages``. A missing
        ``"raw_content"`` or ``"url"`` key falls back to an empty string.
        """
        documents: List[Document] = []
        for page in self.pages:
            text = page.get("raw_content", "")
            # Only the page URL is kept as metadata (no title).
            origin = page.get("url", "")
            documents.append(Document(page_content=text, metadata={"source": origin}))

        return documents

In [116]:
base_retriever = SearchAPIRetriever(pages = ans)

In [203]:
from researcher.context.chunking import Chunking

In [205]:
chunking = Chunking(1000,100)

In [122]:
len(ans)

5

In [153]:
ans

[{'url': 'https://www.realtor.com/research/2024-national-housing-forecast/',
  'raw_content': 'Articles\n2024 Housing Market Forecast and Predictions: Housing Affordability Finally Begins to Turnaround\nAs we look ahead to 2024, we see a mix of continuity and change in both the housing market and economy. Against a backdrop of modest economic growth, slightly higher unemployment, and easing inflation longer term interest rates including mortgage rates begin a slow retreat. The shift from climbing to falling mortgage rates improves housing affordability, but saps some of the urgency home shoppers had previously sensed. Less frenzied housing demand and plenty of rental home options keep home sales relatively stable at low levels in 2024, helping home prices to adjust slightly lower even as the number of for-sale homes continues to dwindle.\nRealtor.com® 2024 Forecast for Key Housing Indicators\nHome Prices Dip, Improving Affordability\nHome prices grew at a double-digit annual clip for t

In [211]:
# Flatten all scraped pages in `ans` into one list of chunks; every chunk is
# tagged with the URL of the page it came from.
sub_query_chunks = []
for each_ans in ans:
    page_chunks = chunking.run(
        content=each_ans['raw_content'],
        metadatas={'url': each_ans['url']},
    )
    sub_query_chunks.extend(page_chunks)

In [212]:
len(sub_query_chunks)

68

In [214]:
len(ans)

4

In [197]:
def get_similar_context(query, content, chunk_size=1000, chunk_overlap=100, max_results=8):
    """Chunk the scraped ``content`` and return the chunks relevant to ``query``.

    Args:
        query: text the chunks are matched against.
        content: iterable of mappings with ``'raw_content'`` and ``'url'`` keys.
        chunk_size: first argument passed to ``Chunking`` (default 1000).
        chunk_overlap: second argument passed to ``Chunking`` (default 100).
        max_results: second argument passed to ``hybrid_retriever`` (default 8).

    Returns:
        The result of ``get_relevant_documents(query)`` on the retriever, or an
        empty list when ``content`` yields no chunks at all.
    """
    chunking = Chunking(chunk_size, chunk_overlap)

    chunks = []
    for each_content in content:
        chunks += chunking.run(content=each_content['raw_content'], metadatas= {'url': each_content['url'] })

    # Nothing was scraped or chunked: there is nothing to build a retriever from.
    if not chunks:
        return []

    # NOTE(review): `hybrid_retriever` is not defined in any visible cell of this
    # notebook -- confirm where it comes from (the star import, or a deleted cell).
    retriever = hybrid_retriever(chunks, max_results)

    return retriever.get_relevant_documents(query)

In [191]:
final_gg = get_similar_context('investment in 2024', ans)

In [192]:
len(final_gg)

12

In [193]:
final_gg

[Document(page_content='Many companies featured on Money advertise with us. Opinions are our own, but compensation and in-depth research may determine where and how companies appear. Learn more about how we make money.\nHere’s How Far Home Prices and Mortgage Rates Could Drop in 2024\nhttps://money.com/mortgage-rates-home-prices-predictions-2024/\nThe new year could finally bring good tidings for homebuyers — or at least the beginning of improved housing affordability.\nHistorically high mortgage rates and housing prices that stymied homebuyers this year are expected to ebb a bit in 2024, according to real estate brokerage Realtor.com’s new housing market forecast. While home shoppers shouldn’t expect major relief from today’s crushing homeownership costs, small gains in affordability are expected to help some buyers get a foot in the door.\nRenters can also look forward to somewhat better conditions as new construction hits the market in the coming months.', metadata={'url': 'https://

In [215]:
class HybridRetriever:
    """Keyword (BM25) + vector (FAISS) ensemble retriever over a fixed document set."""

    def __init__(self, documents, embeddings=None, max_results=5):
        """Build the ensemble retriever for ``documents``.

        Args:
            documents: documents to index (passed to ``from_documents``).
            embeddings: embedding model for the FAISS index. ``None`` (the
                default) means an ``OpenAIEmbeddings()`` instance is created
                when the index is built, instead of once at definition time.
            max_results: ``k`` used for both underlying retrievers.
        """
        self.documents = documents
        self.embeddings = embeddings
        self.max_results = max_results
        self.retriever = self._hybrid_retriever()

    def _hybrid_retriever(self):
        """Create the ensemble retriever, or ``None`` when there are no documents."""
        # With no documents there is nothing to index; get_context() then
        # returns an empty list.
        if not self.documents:
            return None

        if self.embeddings is None:
            self.embeddings = OpenAIEmbeddings()

        # initialize the bm25 retriever and faiss retriever
        bm25_retriever = BM25Retriever.from_documents(self.documents)
        bm25_retriever.k = self.max_results

        faiss_vectorstore = FAISS.from_documents(self.documents, self.embeddings)
        faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": self.max_results })

        # initialize the ensemble retriever; the vector side is weighted 0.8, BM25 0.2
        ensemble_retriever = EnsembleRetriever(
            retrievers=[bm25_retriever, faiss_retriever], weights=[0.2, 0.8]
        )

        return ensemble_retriever

    def get_context(self, query):
        """Return the documents the ensemble retriever finds for ``query``.

        Returns an empty list when the retriever was built without documents.
        """
        if self.retriever is None:
            return []

        return self.retriever.get_relevant_documents(query)
        

In [217]:
len(sub_query_chunks)

68

In [218]:
H = HybridRetriever(sub_query_chunks)

In [220]:
retrieved_context = H.get_context('investment strategies')

In [222]:
len(retrieved_context)

9

## Playing with asyncio

In [79]:
import asyncio

async def gg():
    """Start ``haha`` as a background task, print 'A', pause 1 s, print 'B', then wait for the task."""
    # The task is only scheduled here; it gets to run once gg() yields at its
    # first await, which is why the recorded output is A, 1, B, 2.
    background = asyncio.create_task(haha())
    print('A')
    await asyncio.sleep(1)
    print('B')
    await background


async def haha():
    """Print '1', pause 2 s, then print '2'."""
    print('1')
    await asyncio.sleep(2)
    print('2')

In [80]:
await gg()

A
1
B
2
