In [43]:
## Pre-Creating knowledge of LOS
import snowflake.connector
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
import os
load_dotenv(override=True)
from openai import OpenAI
import os
import openai
import pinecone
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [7]:

# Snowflake connection settings, read from environment variables.
# Each value is None when the corresponding variable is not set.
snowflake_user = os.environ.get('SNOWFLAKE_USER')
snowflake_password = os.environ.get('SNOWFLAKE_PASS')
snowflake_account = os.environ.get('SNOWFLAKE_ACCOUNTID')

# Target database objects and the compute warehouse used for queries.
database = os.environ.get('SNOWFLAKE_DATABASE')
schema = os.environ.get('SNOWFLAKE_SCHEMA')
warehouse = os.environ.get('SNOWFLAKE_WAREHOUSE')

def get_3_data():
    """Fetch topic, introduction, learning outcomes and summary from Snowflake.

    Connects with the module-level Snowflake settings, selects four columns
    from the CFA_COURSES table and returns them as a DataFrame.

    Returns:
        pandas.DataFrame with columns 'Topic', 'Introduction',
        'Learning outcomes' and 'Summary' (zero rows if the table is empty).
        An empty dict is returned if the query yields no result object or a
        NameError is caught.

    Raises:
        Any exception other than NameError raised while connecting or
        querying propagates to the caller unchanged.
    """
    df = {}
    # Pre-bind both handles so the cleanup in `finally` cannot itself fail
    # (and mask the real error) when connect() or cursor() raises.
    conn = None
    cursor = None
    try:
        conn = snowflake.connector.connect(
            user=snowflake_user,
            password=snowflake_password,
            account=snowflake_account,
            warehouse=warehouse,
            database=database,
            schema=schema,
        )

        # run the sql query to get data from snowflake
        cursor = conn.cursor()
        sql = "SELECT NAMEOFTOPIC, INTRODUCTION, LEARNINGOUTCOME, SUMMARY FROM CFA_COURSES ;"

        results = cursor.execute(sql)
        if results is not None:
            # Naming the columns in the constructor also works for an empty
            # result set, where assigning df.columns afterwards would raise.
            df = pd.DataFrame(
                results.fetchall(),
                columns=['Topic', 'Introduction', 'Learning outcomes', 'Summary'],
            )
        else:
            print('Fail to get data. Try again')
    except NameError as e:
        # Kept from the original: only NameError is reported and swallowed.
        print(f'Program fail: {e}')
    finally:
        # Close only what was actually opened.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

    return df

# Run the Snowflake query once; later cells read rows from `df`
# (a DataFrame on success, or the empty-dict fallback returned by get_3_data).
df = get_3_data()


In [37]:

def creat_knowledge(topic, introduction, summary, los):
    """Ask the OpenAI chat model for a Markdown technical note on one topic.

    Args:
        topic: Topic name, inserted under the "## Topic" heading.
        introduction: Introduction text, inserted under "## Introduction".
        summary: Summary text, inserted under "## Summary".
        los: Learning-outcome statements, inserted under "## Learning Outcomes".

    Returns:
        The chat-completion response object; the generated note is at
        ``response.choices[0].message.content``.
    """
    # The imports cell binds `OpenAI` twice (openai client first, then the
    # LangChain LLM class), so the module-level name ends up pointing at the
    # LangChain class, which has no `.chat.completions`. Import the client
    # here under an unambiguous alias.
    from openai import OpenAI as OpenAIClient

    system_prompt = """You are a helpful assistant to generate techincal note for CFA Refresher Readings articles. Based on the information provided in the prompt by user, create a technical note in Markdown format that summarizes only the key Learning outcomes
          under "Summary of Key Learning Outcome" heading. Be sure to include any tables or equations necessary. Do not provide any introduction or conclusion. The "Summary of Learning outcomes" should have bullets only format which will be used as Knowledge Base for future prompts."""
    user_prompt = f"""## Topic\n{topic}\n\n## Introduction\n{introduction}\n\n## Summary\n{summary}\n\n## Learning Outcomes\n{los}\n"""

    client = OpenAIClient()
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    return response

In [39]:
# Number of topics to summarise in this run; each one triggers one call to
# creat_knowledge.
N_TOPICS = 4

# Flat list that alternates, per topic: the raw learning outcomes, then the
# generated note text.
response_list = []

# head() caps at the rows actually present (no KeyError on a short frame or a
# non 0..n index); itertuples yields the values in column order:
# Topic, Introduction, Learning outcomes, Summary.
for topic, intro, los, summary in df.head(N_TOPICS).itertuples(index=False, name=None):
    resp = creat_knowledge(topic, intro, summary, los)
    response_list.append(los)
    response_list.append(resp.choices[0].message.content)

print(len(response_list))

8


In [40]:
# Dump the collected learning outcomes and generated notes for a visual check.
print(response_list)

['interpret interest rates as required rates of return, discount rates, or opportunity costs;\nexplain an interest rate as the sum of a real risk-free rate and premiums that compensate investors for bearing distinct types of risk;\ncalculate and interpret the effective annual rate, given the stated annual interest rate and the frequency of compounding;\nsolve time value of money problems for different frequencies of compounding;\ncalculate and interpret the future value (FV) and present value (PV) of a single sum of money, an ordinary annuity, an annuity due, a perpetuity (PV only), and a series of unequal cash flows;\ndemonstrate the use of a time line in modeling and solving time value of money problems.', '## Summary of Key Learning Outcome\n\n- Interpret interest rates as required rates of return, discount rates, or opportunity costs.\n- Explain an interest rate as the sum of a real risk-free rate and premiums that compensate investors for bearing distinct types of risk.\n- Calcula

In [45]:
import tiktoken

# Module-level tiktoken encoder used by tiktoken_len to count tokens.
# NOTE(review): confirm 'p50k_base' is the encoding used by the models in this
# notebook; the counts here only drive chunk sizing.
tokenizer = tiktoken.get_encoding('p50k_base')

# token-based length function for the text splitter
def tiktoken_len(text):
    """Return how many tokens the module-level `tokenizer` produces for `text`.

    Special-token checking is turned off (``disallowed_special=()``), so text
    that happens to contain special-token strings is encoded like any other.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [55]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to cut each text into chunks before embedding.
# Sizes are measured with tiktoken_len, i.e. in tokens rather than characters:
# chunks target at most 100 tokens, with 20 tokens of overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=tiktoken_len,
    # Candidate separators, from paragraph breaks down to single characters.
    separators=["\n\n", "\n", " ", ""]
)

In [56]:
from uuid import uuid4
from tqdm.auto import tqdm

# One record per text chunk: a random id, the chunk text, and its position
# within the source item.
chunks = []
for idx, text in enumerate(tqdm(response_list)):
    # Only strings can be split; report anything else and move on.
    if not isinstance(text, str):
        print(f"Skipping item at index {idx} as it is not a string")
        continue

    texts = text_splitter.split_text(text)
    chunks.extend(
        {
            'id': str(uuid4()),
            'text': piece,
            'chunk': position,
            'url': None,  # No URL available for this case
        }
        for position, piece in enumerate(texts)
    )


100%|██████████| 8/8 [00:00<00:00, 2060.32it/s]


In [57]:
# Sanity check: total number of chunk records produced from response_list.
print(len(chunks))

22


In [None]:
!pip install tiktoken -q

In [None]:
!pip install pinecone-client -q

In [None]:
# LangChain wrapper around the OpenAI embeddings endpoint.
# NOTE(review): confirm the installed langchain version accepts the
# `model_name` keyword and the "ada" alias.
embeddings = OpenAIEmbeddings(model_name="ada")

# Smoke test: embed one short string and display the length of the result.
query_result = embeddings.embed_query("Hello world")
len(query_result)

In [None]:
# Credentials come from the environment (for example a .env file) instead of
# being written into the notebook.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_environment = os.getenv("PINECONE_ENVIRONMENT")
if not pinecone_api_key or not pinecone_environment:
    raise RuntimeError(
        "Set PINECONE_API_KEY and PINECONE_ENVIRONMENT before running this cell"
    )

pinecone.init(
    api_key=pinecone_api_key,
    environment=pinecone_environment
)

index_name = "langchain-demo"

# `docs` is not defined anywhere in the visible notebook, so the index is built
# from the `chunks` records instead: chunk text is embedded, the generated uuid
# becomes the vector id, and the remaining fields go into metadata.
# None-valued fields (e.g. 'url') are dropped from the metadata.
# NOTE(review): confirm that indexing `chunks` is what `docs` was meant to hold.
index = Pinecone.from_texts(
    [record['text'] for record in chunks],
    embeddings,
    metadatas=[
        {
            key: value
            for key, value in record.items()
            if key not in ('id', 'text') and value is not None
        }
        for record in chunks
    ],
    ids=[record['id'] for record in chunks],
    index_name=index_name,
)