## Q and A module for a news dataset using OpenAI

In [1]:
from dotenv import load_dotenv

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain import OpenAI

# Load environment variables from a local .env file before any client is built.
load_dotenv()
embeddings = OpenAIEmbeddings()

# Recursively pick up every .txt file below the news/ directory.
# Single-file alternative: loader = TextLoader('news/summary.txt')
loader = DirectoryLoader('news', glob="**/*.txt")
documents = loader.load()
print(len(documents))

# Split the loaded documents with a chunk size of 2500 and no overlap.
# (Uncomment `print(texts)` below the split to inspect the chunks.)
text_splitter = CharacterTextSplitter(chunk_size=2500, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Embed the chunks into a Chroma store, then wire a "stuff" retrieval-QA chain
# on top of it. `qa` is used by the query() helper defined below.
docsearch = Chroma.from_documents(texts, embeddings)
qa_llm = OpenAI()
qa_retriever = docsearch.as_retriever()
qa = RetrievalQA.from_chain_type(llm=qa_llm, chain_type="stuff", retriever=qa_retriever)

def query(q):
    """Print the question `q`, then the answer returned by the module-level `qa` chain.

    The question is echoed before the chain is run, so it appears above any
    output produced while the chain executes. Nothing is returned.
    """
    print("Query: ", q)
    answer = qa.run(q)
    print("Answer: ", answer)

# Sample questions for the news corpus, asked one after another in this order.
for sample_question in (
    "What are the effects of legislations surrounding emissions on the Australia coal market?",
    "What are China's plans with renewable energy?",
    "Is there an export ban on Coal in Indonesia? Why?",
    "Who are the main exporters of Coal to China? What is the role of Indonesia in this?",
):
    query(sample_question)

KeyboardInterrupt: 

## English to SQL translator

In [None]:
from langchain import HuggingFaceHub, LLMChain
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate

load_dotenv()

# Model reached through the Hugging Face Hub; the repo id names a T5 checkpoint
# fine-tuned on WikiSQL.
hub_llm = HuggingFaceHub(repo_id="mrm8488/t5-base-finetuned-wikiSQL")

# The prompt has a single slot, {question}, that receives the English request.
sql_template = "Translate English to SQL for me please: {question}"
prompt = PromptTemplate(input_variables=["question"], template=sql_template)

# verbose=True makes the chain echo the formatted prompt before the result.
hub_chain = LLMChain(prompt=prompt, llm=hub_llm, verbose=True)
sql_answer = hub_chain.run("What is the average age of the respondents using a mobile device?")
print(sql_answer)





[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mTranslate English to SQL for me please: What is the average age of the respondents using a mobile device?[0m

[1m> Finished chain.[0m
SELECT AVG Age (years) FROM table WHERE Device = mobile


## Text generation plugin

In [None]:
from langchain import HuggingFaceHub, LLMChain
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate

load_dotenv()

# gpt2 served through the Hugging Face Hub, with the generation settings below.
generation_kwargs = {'temperature': 0.2, 'max_length': 100}
hub_llm = HuggingFaceHub(repo_id="gpt2", model_kwargs=generation_kwargs)

# Open-ended prompt: the model continues the sentence after "You certainly are".
compliment_template = "Your doing so well. 👍 You are {adjective} and it shows in your work. You certainly are"
prompt = PromptTemplate(input_variables=["adjective"], template=compliment_template)

hub_chain = LLMChain(prompt=prompt, llm=hub_llm, verbose=True)

# Generate one continuation per adjective, in this order.
for adjective in ("passionate", "careful", "so smart", "enlightened"):
    print(hub_chain.run(adjective))





[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYour doing so well. 👍 You are passionate and it shows in your work. You certainly are[0m

[1m> Finished chain.[0m
 a great person.

I'm not saying you should be a professional, but I'm saying that you should be a professional.

You are a great person.

You are a great person.

You are a great person.

You are a great person.

You are a great person.

You are a great person.

You are a


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYour doing so well. 👍 You are careful and it shows in your work. You certainly are[0m

[1m> Finished chain.[0m
 not perfect. You may be right, but you are not perfect. You may be wrong, but you are not perfect. You may be right, but you are not perfect. You may be wrong, but you are not perfect. You may be wrong, but you are not perfect. You may be wrong, but you are not perfect. You may be wrong, but you are not perfect.


[1m> Entering n

## On Embeddings

In [None]:
from dotenv import load_dotenv
# StorageContext is imported once, from the top-level package. The former second
# import from `llama_index.storage.docstore` re-bound the same name from a module
# that does not appear to export it (NOTE(review): confirm against the installed
# llama_index version).
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, load_index_from_storage, StorageContext
from llama_index.vector_stores import ChromaVectorStore

load_dotenv()

# One directory for both saving and reloading, so the two steps cannot drift apart.
PERSIST_DIR = 'storage'

# Build a vector index over every document found in news/.
documents = SimpleDirectoryReader('news').load_data()
index = GPTVectorStoreIndex.from_documents(documents)

# save to disk
index.storage_context.persist(persist_dir=PERSIST_DIR)

# load from disk
# from_defaults(persist_dir=...) rebuilds the stores from the persisted files,
# so the individual Simple*Store classes (which were never imported in this
# cell) are not needed.
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)

# Query the reloaded index.
query_engine = index.as_query_engine()
r = query_engine.query("Who are the main exporters of Coal to China? What is the role of Indonesia in this?")
print(r)


The main exporters of coal to China are Indonesia, Russia, and Mongolia. Indonesia is the largest exporter of coal to China, accounting for 58.3% of total imports. Indonesia plays a significant role in supplying coal to China, as it exports a substantial amount of thermal coal to meet China's energy demands.


### Embeddings with Chroma

Create a Chroma collection.

In [1]:
from dotenv import load_dotenv
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, load_index_from_storage, StorageContext
from llama_index.vector_stores import ChromaVectorStore
import chromadb

load_dotenv()

#  https://docs.trychroma.com/embeddings
# create a Chroma vector store, by default operating purely in-memory
chroma_client = chromadb.Client()

# create a collection
chroma_collection = chroma_client.create_collection("newspieces")
# https://docs.trychroma.com/api-reference
# Number of items in the collection before anything is indexed.
print(chroma_collection.count())
documents = SimpleDirectoryReader('news').load_data()

# Build the index on top of the Chroma collection.
vector_store = ChromaVectorStore(chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = GPTVectorStoreIndex.from_documents(documents=documents, storage_context=storage_context)

# Persist relative to the current working directory instead of a machine-specific
# absolute path, so the cell also runs on other machines.
PERSIST_DIR = '.'
index.storage_context.persist(persist_dir=PERSIST_DIR,
                              docstore_fname="chroma_index1.json")




0


In [2]:
# Ask the index built in the previous cell a question and print the response.
coal_question = "Who are the main exporters of Coal to China? What is the role of Indonesia in this?"
query_engine = index.as_query_engine()
r = query_engine.query(coal_question)
print(r)

The main exporters of coal to China are Indonesia, Russia, and Mongolia. Indonesia is the largest exporter of coal to China, accounting for 58.3% of total imports. Indonesia plays a significant role in supplying coal to China, as it exports a substantial amount of coal to meet China's energy demands.


In [7]:
# Print the document store object attached to the index (shows its repr only).
print(index.docstore)

<llama_index.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x00000267F9177B50>


In [8]:
# Bare last expression: the notebook displays whatever peek() returns
# (the recorded output shows ids and embeddings from the collection).
chroma_collection.peek()

{'ids': ['55c24a02-ea56-4cc1-ac58-b7cd190f9473',
  'a879cfb6-d7b3-4e16-9688-b2d9a4ba8e48',
  '4878502a-b747-4320-8ad4-dbaf21eec670',
  '8b3da225-92f0-483b-8344-d0e3156c416f',
  '2d059e6c-f644-474c-9689-a8dfb44d49c6',
  'e9a10b43-008d-4c41-92ff-c4b69e789dde',
  'aad90eeb-6247-44c0-abed-13d404c1a1a4',
  'c260bb7f-3240-4d90-8267-61c9f14e8291'],
 'embeddings': [[-0.0058056991547346115,
   -0.01717888005077839,
   -0.01551816426217556,
   -0.025958893820643425,
   -0.016443809494376183,
   0.002416205359622836,
   -0.028558867052197456,
   -0.016212398186326027,
   -0.018499284982681274,
   -0.024720164015889168,
   0.0251693744212389,
   0.026639515534043312,
   0.016171561554074287,
   -0.009065874852240086,
   -0.025509683415293694,
   0.00852137804031372,
   0.02951173670589924,
   0.005125077906996012,
   0.009630790911614895,
   0.002458744216710329,
   -0.02506047487258911,
   0.03544675558805466,
   0.002028251299634576,
   0.021494019776582718,
   -0.003709385870024562,
   0.015858

Sentence transformer with Chroma. See what the embeddings look like.

In [11]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example inputs; the last two describe the same fish in different words.
sentences = [
    'This framework generates embeddings for senteces (not words)!',
    'Each sentence will be encoded or mapped to a 384 dimension vector space',
    'A fish named Fred embraces its oddity and goes along with the flow, scared as it may be',
    'A fish named Fred worries and dreads but ultimately this is overwhelmed by the trill of the journey',
]
embeddings = model.encode(sentences=sentences)

# Print each sentence followed by the vector at the same position.
for idx, sentence in enumerate(sentences):
    embedding = embeddings[idx]
    print('sentence:', sentence)
    print('embedding:', embedding)
    print('\n')

sentence: This framework generates embeddings for senteces (not words)!
embedding: [-2.08490379e-02 -6.87446445e-02  1.26606775e-02 -6.88610692e-03
  2.95728985e-02  5.70758767e-02  7.78200030e-02 -4.09870222e-02
  1.26689645e-02 -7.70674832e-03  4.67152484e-02 -1.79939382e-02
  7.52381980e-02  4.60417755e-02 -8.77869874e-02  7.59569928e-02
  4.42981906e-02  7.31140077e-02 -4.36860174e-02 -5.01276702e-02
  3.03778332e-03  6.02110624e-02  5.21502309e-02 -5.83877340e-02
  2.46321298e-02 -4.10804152e-02 -1.02450535e-01  1.35456212e-02
  7.14781731e-02  1.49093326e-02  8.36374164e-02  4.80726287e-02
  6.98614342e-04  9.48225185e-02 -1.70592370e-03  8.24206248e-02
  2.87968223e-03  9.78000276e-03 -5.86907975e-02 -8.07239674e-03
 -5.92945851e-02  2.25108285e-02 -4.17630337e-02  7.52435699e-02
  7.99609348e-02 -2.74345130e-02 -3.98333557e-02  3.15873437e-02
 -5.30536585e-02  1.27168028e-02 -1.08491100e-01 -3.79101075e-02
 -1.93019174e-02  2.55927369e-02 -4.49507907e-02 -4.83445972e-02
 -1.121

## Chatbot to query a website

In [1]:
from dotenv import load_dotenv
load_dotenv()

from llama_index import GPTVectorStoreIndex, TrafilaturaWebReader
import chromadb

def create_embedding_store(name):
    """Create a collection called `name` on a newly constructed Chroma client and return it."""
    return chromadb.Client().create_collection(name)

def query_page(collection, urls, questions):
    """Index the given web pages in a Chroma collection and answer questions about them.

    Args:
        collection: Chroma collection that backs the index's vector store.
        urls: list of page URLs, loaded with TrafilaturaWebReader.
        questions: iterable of question strings; each is printed with its answer.

    Returns:
        None. Questions and answers are printed.
    """
    # Imported locally because this cell's import line only brings in
    # GPTVectorStoreIndex and TrafilaturaWebReader.
    from llama_index import StorageContext
    from llama_index.vector_stores import ChromaVectorStore

    docs = TrafilaturaWebReader().load_data(urls)

    # Attach the collection the same way the "Embeddings with Chroma" cell does:
    # through a ChromaVectorStore wrapped in a StorageContext.
    # NOTE(review): the earlier `from_documents(docs, chroma_collection=collection)`
    # call appears to ignore the collection - confirm against the installed
    # llama_index version.
    vector_store = ChromaVectorStore(collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = GPTVectorStoreIndex.from_documents(docs, storage_context=storage_context)
    query_engine = index.as_query_engine()

    for question in questions:
        print(f"Question: {question} \n")
        print(f"Answer: {query_engine.query(question)}")


if __name__ == "__main__":
    # What to ask about the pages.
    questions = [
        "Who are the members of Supertype.ai",
        "What problems are they trying to solve?",
        "What are the important values at the company?",
    ]
    # Pages to index.
    url_list = [
        "https://supertype.ai",
        "https://supertype.ai/about-us",
    ]
    collection = create_embedding_store("supertype")

    query_page(collection, url_list, questions)

Question: Who are the members of Supertype.ai 

Answer: The members of Supertype.ai include Samuel Chan, Nick Gibbons, Nayoko Wicaksono, Vincentius Christopher Calvin, Stane Aurelius, Geraldus Wilsen, Vito Ghifari, and Aurellia Christie.
Question: What problems are they trying to solve? 

Answer: The team at Supertype is focused on solving challenges in the field of data science and analytics. They work on a range of topics including text processing, time series analysis, forecasting, machine learning, and natural language processing. They aim to extract actionable insights from data and develop software applications in the realm of data science and artificial intelligence. Additionally, they are interested in the intersection of healthcare and technology. Overall, their goal is to solve complex problems using their expertise in data analysis and programming.
Question: What are the important values at the company? 

Answer: The important values at the company are taking ownership, taki

## Working with OpenAI models 

Text completion

In [4]:
import openai
from dotenv import load_dotenv
load_dotenv()

# List all models (the result is stored in `models` but not displayed here).
models = openai.Model.list()

# OpenAI text completion: let the "ada" model continue the prompt and print
# the text of the first returned choice.
completion = openai.Completion.create(model="ada", prompt="Bill Gates is a")
first_choice = completion.choices[0]
print(first_choice.text)



 longtime audience member at Harvard Square. Another West Coast philanthropist comes to our talk


Image generation

In [5]:
import webbrowser
import openai
from dotenv import load_dotenv
load_dotenv()

# Image generation with dall-e 2: request two 512x512 images for the prompt.
image_gen = openai.Image.create(prompt="Зелено дърво", n=2, size="512x512")

# Collect the URL of each generated image, then open each in a new browser tab.
image_urls = [img.url for img in image_gen.data]
for image_url in image_urls:
    webbrowser.open_new_tab(image_url)

Audio recognition

In [7]:
import webbrowser
import openai
from dotenv import load_dotenv
load_dotenv()

# Open the recording in binary mode inside a `with` block so the file handle is
# closed once the transcription request has finished.
with open("audio/donda.mp3", "rb") as audio:
    transcribe = openai.Audio.transcribe("whisper-1", audio)
print(transcribe)

{
  "text": "That she wrote, speech to the young, speech to the progress-toward, say to them, say to the down-keepers, the sun-slappers, the self-soilers, the harmony-ushers, even if you are not ready for the day, it cannot always be night."
}


## Caching

Caching the responses of a Large Language Model lets you store them so that they can later be fetched from the cache instead of sending a new request to the LLM for a repeated prompt.

#### Caching answers to queries

In [1]:
import time
import langchain
from langchain.llms import OpenAI
from langchain.cache import InMemoryCache, SQLiteCache
from langchain.callbacks import get_openai_callback
from dotenv import load_dotenv
load_dotenv()

# There are two options to store your answers:
#   * in memory, which is faster but uses more memory;
#   * in a SQLite db, which is more memory efficient but has a slight increase
#     in execution time.
# langchain.llm_cache = InMemoryCache()
langchain.llm_cache = SQLiteCache(database_path='.langchain.db')

llm = OpenAI(model='text-davinci-002')

# First request: time the call and report the usage collected by the callback.
with get_openai_callback() as cb:
    start = time.time()
    result = llm("What doesn't fall far from the tree?")
    end = time.time()
    print(result)
    print('----- cb')
    # `.2f` rounds the elapsed time to two decimals; the earlier `2f` was a
    # width-2 spec and printed six decimals.
    print(str(cb) + f'({end - start:.2f} seconds)')

# Same prompt twice more: compare the reported usage and timing with the first
# block to see the effect of the cache.
with get_openai_callback() as cb2:
    start = time.time()
    result2 = llm("What doesn't fall far from the tree?")
    result3 = llm("What doesn't fall far from the tree?")
    end = time.time()
    print(result2)
    print(result3)
    print('----- cb')
    print(str(cb2) + f'({end - start:.2f} seconds)')




The saying is "like father, like son." It means that the son is similar to the father in character or behavior.
----- cb
Tokens Used: 36
	Prompt Tokens: 9
	Completion Tokens: 27
Successful Requests: 1
Total Cost (USD): $0.0007199999999999999(0.833076 seconds)


The saying is "like father, like son." It means that the son is similar to the father in character or behavior.


The saying is "like father, like son." It means that the son is similar to the father in character or behavior.
----- cb
Tokens Used: 0
	Prompt Tokens: 0
	Completion Tokens: 0
Successful Requests: 0
Total Cost (USD): $0.0(0.023442 seconds)


#### Cache summaries

In [2]:
import time
import langchain
from langchain.llms import OpenAI
from langchain.cache import InMemoryCache, SQLiteCache
from langchain.text_splitter import CharacterTextSplitter
from langchain.callbacks import get_openai_callback
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from dotenv import load_dotenv
load_dotenv()

# There are two options to store your answers:
#   * in memory, which is faster but uses more memory;
#   * in a SQLite db, which is more memory efficient but has a slight increase
#     in execution time.
# langchain.llm_cache = InMemoryCache()
langchain.llm_cache = SQLiteCache(database_path='.langchain.db')

text_splitter = CharacterTextSplitter()

# `llm` uses the global cache configured above; `llm_no_cache` opts out of it.
llm = OpenAI(model='text-davinci-002')
llm_no_cache = OpenAI(model='text-davinci-002', cache = False)

with open('news/summary.txt') as f:
    news = f.read()

# Split the raw text and wrap every piece in a Document for the chain.
text = text_splitter.split_text(news)
docs = [
    Document(page_content=t) for t in text
]

# map_reduce summarisation: the cached `llm` is passed as the chain's main llm,
# while the reduce step is given the non-cached one.
chain = load_summarize_chain(llm, chain_type='map_reduce', reduce_llm = llm_no_cache)

# First run: time it and report the usage collected by the callback.
with get_openai_callback() as cb:
    start = time.time()
    result = chain.run(docs)
    end = time.time()
    print(result)
    print('----- cb')
    # `.2f` rounds the elapsed time to two decimals; the earlier `2f` was a
    # width-2 spec and printed six decimals.
    print(str(cb) + f'({end - start:.2f} seconds)')

# Second run over the same documents, for comparison with the first.
with get_openai_callback() as cb2:
    start = time.time()
    result2 = chain.run(docs)
    end = time.time()
    print(result2)
    print('----- cb2')
    print(str(cb2) + f'({end - start:.2f} seconds)')





- Australia's coal and gas exports are expected to decline sharply in the next five years, due to the country's peak production and the efforts of Asian countries to reduce greenhouse gas emissions.
- Coal producers in Australia are in talks with the government, following the announcement that coal miners should reserve up to 10% of production for domestic supply.
- China's coal industry is expected to see stronger trade as the country ramps up energy supplies and stabilizes prices. In July 2022, China's coal and lignite exports rose by 171.6% YoY to 230,000 tons.
- Indonesia's new ban on coal exports is expected to disrupt Supramax and Panamax markets in the Pacific region.
----- cb
Tokens Used: 2858
	Prompt Tokens: 2303
	Completion Tokens: 555
Successful Requests: 4
Total Cost (USD): $0.05716(607.735022 seconds)


- Australia's coal and gas exports are expected to decline sharply over the next five years.
- Coal producers in Australia are in talks with the government about reservin

In [5]:
import os
from dotenv import load_dotenv
load_dotenv()

from langchain.llms import OpenAI
from langchain.agents import load_tools, initialize_agent, AgentType
from langchain.tools import DuckDuckGoSearchRun
from langchain.callbacks import StreamingStdOutCallbackHandler, StreamlitCallbackHandler
import streamlit as st

# Streaming LLM; the API key is read from the environment loaded above.
llm = OpenAI(temperature=0, streaming=True, openai_api_key = os.getenv('OPENAI_API_KEY'))

# The agent's only tool is DuckDuckGo search.
tools = load_tools(
    ['ddg-search']
)

agent = initialize_agent(
  tools = tools,
  llm = llm,
  agent =  AgentType.ZERO_SHOT_REACT_DESCRIPTION
)

# NOTE(review): st.chat_input() presumably only yields a prompt when this code
# is served by Streamlit rather than run as a notebook cell - confirm.
if prompt := st.chat_input():
    st.chat_message('user').write(prompt)
    with st.chat_message('assistant'):
        st.write("🧠 thinking...")
        # StreamlitCallbackHandler renders the agent's steps into the given
        # container. The stdout handler used before takes no constructor
        # arguments and cannot be given a container.
        st_callback = StreamlitCallbackHandler(st.container())
        response = agent.run(prompt, callbacks = [st_callback])
        st.write(response)

