## Creating notes based on LOS, Summary, and Introduction

In [None]:
## Pre-Creating knowledge of LOS
import snowflake.connector
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
import os
load_dotenv(override=True)
from openai import OpenAI


In [2]:

# Snowflake connection settings, each read from an environment variable
# (None when the variable is not set).
snowflake_user = os.environ.get('SNOWFLAKE_USER')
snowflake_password = os.environ.get('SNOWFLAKE_PASSWORD')
snowflake_account = os.environ.get('SNOWFLAKE_ACCOUNT')
database = os.environ.get('SNOWFLAKE_DATABASE')
schema = os.environ.get('SNOWFLAKE_SCHEMA')
warehouse = os.environ.get('SNOWFLAKE_WAREHOUSE')

def get_3_data():
    """Fetch topic, introduction, learning outcomes and summary from CFA_COURSES.

    Opens a Snowflake connection using the module-level credentials, runs a
    single SELECT, and always closes whatever was opened.

    Returns:
        A pandas DataFrame with the columns ``Topic``, ``Introduction``,
        ``Learning outcomes`` and ``Summary`` (empty, but with those columns,
        when the table has no rows). An empty dict is returned when
        ``execute`` yields ``None`` or when a ``NameError`` is caught.

    Raises:
        Any exception other than ``NameError`` raised while connecting or
        querying propagates to the caller unchanged.
    """
    columns = ['Topic', 'Introduction', 'Learning outcomes', 'Summary']
    sql = "SELECT NAMEOFTOPIC, INTRODUCTION, LEARNINGOUTCOME, SUMMARY FROM CFA_COURSES ;"

    df = {}
    # Pre-bind both handles so the cleanup below never touches an unbound
    # name when connect() or cursor() fails part-way through.
    conn = None
    cursor = None
    try:
        conn = snowflake.connector.connect(
            user=snowflake_user,
            password=snowflake_password,
            account=snowflake_account,
            warehouse=warehouse,
            database=database,
            schema=schema,
        )

        # run the sql query to get data from snowflake
        cursor = conn.cursor()
        results = cursor.execute(sql)
        if results is not None:
            # Passing the column names to the constructor also works for an
            # empty result set, where assigning df.columns afterwards fails.
            df = pd.DataFrame(results.fetchall(), columns=columns)
        else:
            print('Fail to get data. Try again')
    except NameError as e:
        print(f'Program fail: {e}')
    finally:
        # Close only what was actually opened.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

    return df

# Run the Snowflake query once and keep the result in `df` for the cells below.
df = get_3_data()


In [34]:

def creat_knowledge(topic, introduction, summary, los):
    """Ask the chat model for a Markdown technical note covering one topic.

    Args:
        topic: Topic name, placed under the ``## Topic`` heading of the user prompt.
        introduction: Text placed under ``## Introduction``.
        summary: Text placed under ``## Summary``.
        los: Learning outcome statements, placed under ``## Learning Outcomes``.

    Returns:
        The response object returned by ``client.chat.completions.create``;
        callers read the note from ``response.choices[0].message.content``.
    """
    # System instructions: fix the output layout to a single bullet-only section.
    system_prompt = (
        'You are a helpful assistant to generate techincal note for CFA Refresher Readings articles. Based on the information provided in the prompt by user, create a technical note in Markdown format that summarizes only the key Learning outcomes\n'
        'under "Summary of Key Learning Outcome" heading. Be sure to include any tables or equations necessary. Do not provide any introduction or conclusion. The "Summary of Learning outcomes" should have bullets only format which will be used as Knowledge Base for QA prompts.\n'
        'Follow the md file structure below:\n'
        '## Summary of Key Learning Outcome\n'
        '- **bulet points for eachlearning outcomes:** \n'
        '    parapgh for summarizing this learning outcome.\n'
        '- **bulet points for eachlearning outcomes:** \n'
        '    parapgh for summarizing this learning outcome.'
    )

    # User message: the four source fields, each under its own Markdown heading.
    user_prompt = (
        f"## Topic\n{topic}\n\n"
        f"## Introduction\n{introduction}\n\n"
        f"## Summary\n{summary}\n\n"
        f"## Learning Outcomes\n{los}\n"
    )

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=conversation,
    )
    return response

In [43]:

# topic, intro, los, summary = df.loc[0]
# response = creat_knowledge(topic, intro, summary, los)

In [99]:
# Generate one technical note per course row (first four rows only).
response_list = []
for row_idx in range(4):
    # Row values arrive in column order: topic, introduction, learning outcomes, summary.
    topic, intro, los, summary = df.loc[row_idx]
    resp = creat_knowledge(topic, intro, summary, los)
    note_record = dict(
        topic=topic,
        los=los,
        note=resp.choices[0].message.content,
    )
    response_list.append(note_record)
print(response_list)


[{'topic': 'Time Value of Money in Finance', 'los': 'interpret interest rates as required rates of return, discount rates, or opportunity costs;\nexplain an interest rate as the sum of a real risk-free rate and premiums that compensate investors for bearing distinct types of risk;\ncalculate and interpret the effective annual rate, given the stated annual interest rate and the frequency of compounding;\nsolve time value of money problems for different frequencies of compounding;\ncalculate and interpret the future value (FV) and present value (PV) of a single sum of money, an ordinary annuity, an annuity due, a perpetuity (PV only), and a series of unequal cash flows;\ndemonstrate the use of a time line in modeling and solving time value of money problems.', 'note': '## Summary of Key Learning Outcome\n- **Interpret interest rates as required rates of return, discount rates, or opportunity costs:** \n    This learning outcome emphasizes the different interpretations of interest rates a

In [104]:
# Show the learning outcome statements stored for the first generated note.
print(response_list[0]['los'])

interpret interest rates as required rates of return, discount rates, or opportunity costs;
explain an interest rate as the sum of a real risk-free rate and premiums that compensate investors for bearing distinct types of risk;
calculate and interpret the effective annual rate, given the stated annual interest rate and the frequency of compounding;
solve time value of money problems for different frequencies of compounding;
calculate and interpret the future value (FV) and present value (PV) of a single sum of money, an ordinary annuity, an annuity due, a perpetuity (PV only), and a series of unequal cash flows;
demonstrate the use of a time line in modeling and solving time value of money problems.


In [101]:
# Gather the generated notes and write them to a single Markdown file.
# Iterating over response_list itself (instead of a hard-coded range(4)) keeps
# this cell in step with however many notes were generated; each note is
# followed by a blank line.
md_lines = ''.join(record['note'] + '\n\n' for record in response_list)

with open("../data/LOS_summary.md", "w", encoding="utf-8") as file:
    # md_lines is one string, so write() is the matching call; writelines()
    # would iterate over it character by character.
    file.write(md_lines)


## Summary of Key Learning Outcome
- **Interpret interest rates as required rates of return, discount rates, or opportunity costs:** 
    This learning outcome emphasizes the different interpretations of interest rates as required rates of return for investments, discount rates for valuation, or opportunity costs for decision-making.
- **Explain an interest rate as the sum of a real risk-free rate and premiums that compensate investors for bearing distinct types of risk:** 
    Understanding that interest rates comprise a real risk-free rate and premiums for inflation, default, liquidity, and maturity risks helps in assessing the components that make up the overall interest rate.
- **Calculate and interpret the effective annual rate, given the stated annual interest rate and the frequency of compounding:** 
    Being able to compute the effective annual rate from the stated annual interest rate and compounding frequency provides a better understanding of the actual growth of an investm

## Splitting the notes based on token length

**Split LOS with ';'**

**Split the technical note by token count**

In [102]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
from uuid import uuid4
from tqdm.auto import tqdm

In [103]:
# Look up which tiktoken encoding gpt-3.5-turbo uses; the result is shown as the cell output.
tiktoken.encoding_for_model('gpt-3.5-turbo')

<Encoding 'cl100k_base'>

In [67]:
# Tokenizer used to measure text length in tokens rather than characters
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return how many cl100k_base tokens ``text`` encodes to.

    ``disallowed_special=()`` lets text that happens to contain special-token
    strings be encoded as ordinary text instead of raising.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# def split_text_by_semicolon(data):# -> list[Any]:
#     list_chunks = []
#     # split LOS by ;
#     for i,text in enumerate(data):
#         chunks = [chunk.strip() for chunk in text['los'].split(';') if chunk.strip()]
#         list_chunks.append(chunks)
#     return list_chunks


In [75]:

# Splitter for the generated notes. Lengths are measured with tiktoken_len
# (passed as length_function), so chunk_size and chunk_overlap are token
# counts rather than character counts.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [106]:
# Split the combined Markdown notes into chunks and preview the first one.
note_chunks = text_splitter.split_text(md_lines)
print(note_chunks[0])

# los_chunks = split_text_by_semicolon(response_list)

## Summary of Key Learning Outcome
- **Interpret interest rates as required rates of return, discount rates, or opportunity costs:** 
    This learning outcome emphasizes the different interpretations of interest rates as required rates of return for investments, discount rates for valuation, or opportunity costs for decision-making.
- **Explain an interest rate as the sum of a real risk-free rate and premiums that compensate investors for bearing distinct types of risk:**


## Creating Embeddings

In [81]:
import os

# get openai api key from platform.openai.com
# NOTE(review): when OPENAI_API_KEY is unset or empty this falls back to the
# literal placeholder 'YOUR_API_KEY' — presumably later API calls then fail
# with an authentication error; confirm whether failing here would be better.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or 'YOUR_API_KEY'

In [84]:
from langchain_openai import OpenAIEmbeddings

# Name of the embedding model passed to the client below
model_name = 'text-embedding-ada-002'

# Embedding client reused by the later cells (embed.embed_documents)
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY  # type: ignore
)

In [107]:

# Embed the note chunks; the cell displays how many results came back.
note_res = embed.embed_documents(note_chunks)
# los_res = embed.embed_documents(los_chunks)
len(note_res) # number of embeddings returned

15

## Upload to Pinecone

In [94]:
from pinecone import Pinecone
from dotenv import load_dotenv
load_dotenv()
import os

# initialize connection to pinecone (get API key at app.pinecone.io)
# The key is read from the PINECONE_API_KEY environment variable; os.getenv
# returns None when it is not set.
api_key = os.getenv("PINECONE_API_KEY")

# configure client
pc = Pinecone(api_key=api_key)

In [95]:
from pinecone import ServerlessSpec

# Serverless deployment settings, passed as `spec` when the index is created below
spec = ServerlessSpec(
    cloud="aws", region="us-east-1" 
)

In [98]:
import time

index_name = 'damg-group3-assignment5-step1'

# names of the indexes Pinecone currently lists
existing_indexes = [info["name"] for info in pc.list_indexes()]

# create the index only when it is not there yet (expected on the first run)
if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='dotproduct',
        spec=spec
    )
    # poll once per second until the index reports itself ready
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# open a handle to the index, pause briefly, then display its stats
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [108]:
from typing import List
from tqdm.auto import tqdm
from uuid import uuid4

# Flush to Pinecone once at least this many chunks have accumulated.
batch_limit = 100


def _upsert_batch(batch_texts, batch_metadatas):
    """Embed one batch of chunk texts and upsert them under fresh UUID ids."""
    ids = [str(uuid4()) for _ in batch_texts]
    embeds = embed.embed_documents(batch_texts)
    # Materialise the (id, vector, metadata) tuples as a list rather than
    # handing the client a one-shot zip iterator.
    index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))  # type: ignore


texts = []
metadatas = []

for record in tqdm(response_list):
    # metadata fields shared by every chunk of this record
    metadata = {
        'topic': record['topic'],
        'LOS': record['los'],
        'Note': record['note']
    }
    # Chunk the LOS text followed by the note. The combined text is kept in a
    # local variable so response_list is not modified: writing it back into
    # record['note'] would prepend the LOS again on every re-run of this cell.
    combined_text = record['los'] + record['note']
    record_texts: List[str] = text_splitter.split_text(combined_text)
    # one metadata dict per chunk: chunk number and chunk text plus the shared fields
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    # append these to the current batch
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # flush once the batch limit is reached
    if len(texts) >= batch_limit:
        _upsert_batch(texts, metadatas)
        texts = []
        metadatas = []

# flush whatever is left after the loop
if texts:
    _upsert_batch(texts, metadatas)

100%|██████████| 4/4 [00:00<00:00, 745.29it/s]


In [109]:
# Re-check the index stats after the upsert; the vector count is shown in the cell output.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 25}},
 'total_vector_count': 25}