# Rational Future Prediction
**Goal:** build an LLM system that uses retrieval-augmented generation (RAG) to predict the next innovation in a particular field by means of time-series forecasting.

## Step 1: Gather data for the vector database from a website containing "timeline" data.

In [35]:
from bs4 import BeautifulSoup
import requests

# Download the telephone-history timeline page. The timeout keeps the cell from
# hanging forever, and raise_for_status() fails fast on an HTTP error instead of
# silently parsing an error page.
r = requests.get("https://www.telephonetribute.com/timeline.html", timeout=30)
r.raise_for_status()
html = r.text

# Flatten the page to plain text: every non-empty text node, joined by single spaces.
clean_text = ' '.join(BeautifulSoup(html, "html.parser").stripped_strings)

# Keep the segment that follows the fourth occurrence of "1881" and put the
# year back in front of it (split() removes the separator).
segments = clean_text.split("1881")
if len(segments) <= 4:
    raise IndexError(
        'expected at least four occurrences of "1881" in the page text; '
        'the page layout may have changed'
    )
text = "1881" + segments[4]
text

'1881: First long-distance line, between\nBoston and Providence, R.I. *[Note 5] 1908: AT&T President Theodore Vail sets a goal of Universal Service: providing\nhigh-quality telephone service to any American who wants it. *[Note 5] 1915: AT&T engineers transmit speech across the Atlantic Ocean by wireless\nradio. *[Note 5] 1917: Bell System engineers demonstrate oneway radiotelephone transmission from\nairplane to ground. *[Note 5] 1919: AT&T introduces dial telephones. *[Note 5] 1923: AT&T longdistance lines connect stations in the first network-radio\nbroadcast. *[Note 5] 1925: The Research and Engineering Departments of AT&T and the Western Electric\nCompany are incorporated as ell Telephone Laboratories. *[Note 5] 1927: Wireless telephone service between New York and London. *[Note 5] 1927: Don Juan, the first fulllength movie with sound uses AT&T equipment and\nthe next year, Warner Bros. produces The Jazz Singer with AT&T equipment. *[Note\n5] 1947: Bell Labs scientists Shockley, 

### Extract the data as `'year': 'sentence describing the innovation'` pairs using a GPT model.

In [23]:
from langchain_openai import ChatOpenAI
import getpass
import os

# Prompt for the key only when it is not already in the environment, so the
# notebook can be re-run (or run non-interactively) without retyping the secret.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

# Chat model used to extract the year/sentence pairs from the scraped text.
llm = ChatOpenAI(model="gpt-3.5-turbo-0613")

In [39]:
# Build the extraction prompt around the scraped timeline text, then send it
# to the chat model in a single call.
extraction_prompt = (
    f"Filter out data in the form of 'year':'sentence' from the following text document: {text}"
)
filteredText = llm.invoke(extraction_prompt)

In [56]:
# Text of the model's reply (the extracted year/sentence lines).
data = filteredText.content

# Write through a context manager so the file is flushed and closed before a
# later cell reads it back.
with open('data.txt', 'w') as out_file:
    n_chars_written = out_file.write(data)

# Display the number of characters written, as the original cell did.
n_chars_written

2550

## Step 2: add the data to a vector database.

In [2]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
import chromadb

# Chroma client and the collection that will hold the timeline entries.
# get_or_create_collection makes the cell safe to re-run in the same kernel;
# create_collection raises if the collection already exists.
client = chromadb.Client()
tp = client.get_or_create_collection(name="telephoneinnovation")

# Load one timeline entry per line, dropping trailing newlines and blank lines
# so that empty strings are never embedded or stored.
with open('data.txt') as f:
    data = [line.strip() for line in f if line.strip()]

In [34]:
# Embed every timeline entry and store it in the Chroma collection.
embeddings = OpenAIEmbeddings()
dataEmbed = embeddings.embed_documents(data)

# One string id per document: "1", "2", ..., str(len(data)).
# The original built a single string from str(list(...)) and stopped one id short.
ids = [str(i) for i in range(1, len(data) + 1)]

tp.add(
    ids=ids,
    documents=data,
    # embed_documents already returns one vector per document; wrapping it in
    # another list would break the one-embedding-per-id correspondence.
    embeddings=dataEmbed,
)

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27']