In [1]:
import requests
from newspaper import Article
import time

# Browser-like User-Agent sent with every article request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

# Articles to ingest into the vector store.
article_urls = [
    "https://www.artificialintelligence-news.com/2023/05/16/openai-ceo-ai-regulation-is-essential/",
    "https://www.artificialintelligence-news.com/2023/05/15/jay-migliaccio-ibm-watson-on-leveraging-ai-to-improve-productivity/",
    "https://www.artificialintelligence-news.com/2023/05/15/iurii-milovanov-softserve-how-ai-ml-is-helping-boost-innovation-and-personalisation/",
    "https://www.artificialintelligence-news.com/2023/05/11/ai-and-big-data-expo-north-america-begins-in-less-than-one-week/",
    "https://www.artificialintelligence-news.com/2023/05/02/ai-godfather-warns-dangers-and-quits-google/",
    "https://www.artificialintelligence-news.com/2023/04/28/palantir-demos-how-ai-can-used-military/"
]

session = requests.Session()
# One {'url': ..., 'text': ...} dict per article that was fetched and parsed.
pages_content = []

for url in article_urls:
    try:
        # Pause before each request so the site is not hit in a tight loop.
        time.sleep(2)
        response = session.get(url, headers=headers, timeout=10)

        if response.status_code == 200:
            article = Article(url)
            # Hand newspaper the HTML we already fetched instead of letting it
            # download the page a second time (without our headers, timeout or
            # status check).
            article.download(input_html=response.text)
            article.parse()
            pages_content.append({'url': url, 'text': article.text})
        else:
            print(f"Failed to fetch article at {url}")

    except Exception as e:
        # Best-effort ingestion: report the failure and continue with the next URL.
        print(f"Error occurred while fetching article at {url}: {e}")

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

# Deep Lake dataset location, built as hub://<user>/<dataset>.
user = "veaceslavcalestru"
dataset = "langchain_course_qabot_with_source"
dataset_path = f"hub://{user}/{dataset}"

# Embedding function the vector store is configured with.
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

# Vector store handle used by the later cells.
db = DeepLake(
    embedding_function=embeddings,
    dataset_path=dataset_path,
)



Your Deep Lake dataset has been successfully created!
The dataset is private so make sure you are logged in!




In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split every scraped article into chunks of up to 1000 characters with a
# 100-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Parallel lists: all_texts[i] is a chunk, all_metadatas[i] records the URL it came from.
all_texts, all_metadatas = [], []
for page in pages_content:
    page_chunks = text_splitter.split_text(page['text'])
    all_texts.extend(page_chunks)
    # A fresh dict per chunk, so no two entries share the same metadata object.
    all_metadatas.extend({'source': page['url']} for _ in page_chunks)

# Left as the final expression so the notebook displays the value it returns.
db.add_texts(all_texts, all_metadatas)

|

Dataset(path='hub://veaceslavcalestru/langchain_course_qabot_with_source', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (49, 1536)  float32   None   
    id        text      (49, 1)      str     None   
 metadata     json      (49, 1)      str     None   
   text       text      (49, 1)      str     None   


 

['f4449547-728e-11ee-b60b-cc4740c98b6b',
 'f4449548-728e-11ee-98a7-cc4740c98b6b',
 'f4449549-728e-11ee-b082-cc4740c98b6b',
 'f444954a-728e-11ee-b8f9-cc4740c98b6b',
 'f444954b-728e-11ee-9aea-cc4740c98b6b',
 'f444954c-728e-11ee-8297-cc4740c98b6b',
 'f444954d-728e-11ee-9092-cc4740c98b6b',
 'f444954e-728e-11ee-bdca-cc4740c98b6b',
 'f444954f-728e-11ee-a0b8-cc4740c98b6b',
 'f4449550-728e-11ee-9f19-cc4740c98b6b',
 'f4449551-728e-11ee-99ea-cc4740c98b6b',
 'f4449552-728e-11ee-bac4-cc4740c98b6b',
 'f4449553-728e-11ee-b229-cc4740c98b6b',
 'f4449554-728e-11ee-b51c-cc4740c98b6b',
 'f4449555-728e-11ee-9369-cc4740c98b6b',
 'f4449556-728e-11ee-bdf0-cc4740c98b6b',
 'f4449557-728e-11ee-9e80-cc4740c98b6b',
 'f4449558-728e-11ee-96ea-cc4740c98b6b',
 'f4449559-728e-11ee-a556-cc4740c98b6b',
 'f444955a-728e-11ee-b910-cc4740c98b6b',
 'f444955b-728e-11ee-ba1e-cc4740c98b6b',
 'f444955c-728e-11ee-8dfa-cc4740c98b6b',
 'f444955d-728e-11ee-a04c-cc4740c98b6b',
 'f444955e-728e-11ee-88e9-cc4740c98b6b',
 'f444955f-728e-

In [5]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import OpenAI

# temperature=0 is passed to the LLM wrapper for the answer generation.
# NOTE(review): 'text-davinci-003' may no longer be served by OpenAI — confirm
# availability before re-running this cell.
llm = OpenAI(model_name='text-davinci-003', temperature=0)

# Question-answering chain that uses the Deep Lake store as its retriever and
# the 'stuff' chain type.
chain = RetrievalQAWithSourcesChain.from_chain_type(llm=llm,
                                                    chain_type='stuff',
                                                    retriever=db.as_retriever())

d_response = chain({'question': "What does Geoffrey Hinton think about recent trends in AI?"})

print("Response:")
print(d_response['answer'])
print("Sources:")
# 'sources' is split on ", " and printed one entry per line.
# The stray `S` that followed the colon has been removed: it became the loop's
# inline body and made the indented print below an IndentationError.
for source in d_response['sources'].split(', '):
    print("- " + source)

Response:
 Geoffrey Hinton believes that the rapid development of generative AI products is "racing towards danger" and that false text, images, and videos created by AI could lead to a situation where average people "would not be able to know what is true anymore." He also expressed concerns about the impact of AI on the job market, as machines could eventually replace roles such as paralegals, personal assistants, and translators.

Sources:
- https://www.artificialintelligence-news.com/2023/05/02/ai-godfather-warns-dangers-and-quits-google/
