In [3]:
import pandas as pd
import os
import openai
from ragas import evaluate
from datasets import load_dataset
from datasets import Dataset

In [2]:
corpus_dataset = load_dataset("explodinggradients/fiqa", "corpus")
corpus_dataset = corpus_dataset['corpus'].to_pandas()

Downloading and preparing dataset fiqa/corpus to /Users/deathscope/.cache/huggingface/datasets/explodinggradients___fiqa/corpus/1.0.0/3dc7b639f5b4b16509a3299a2ceb78bf5fe98ee6b5fee25e7d5e4d290c88efb8...


Downloading data:   0%|          | 0.00/44.6M [00:00<?, ?B/s]

Generating corpus split: 0 examples [00:00, ? examples/s]

/Users/deathscope/.cache/huggingface/datasets/downloads/19473702797a0b8c21a50f1826b68ba291f86ce9e03cb467b4003149fa4a2bfb
Dataset fiqa downloaded and prepared to /Users/deathscope/.cache/huggingface/datasets/explodinggradients___fiqa/corpus/1.0.0/3dc7b639f5b4b16509a3299a2ceb78bf5fe98ee6b5fee25e7d5e4d290c88efb8. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
corpus_dataset.head()

Unnamed: 0,doc
0,I'm not saying I don't like the idea of on-the...
1,So nothing preventing false ratings besides ad...
2,You can never use a health FSA for individual ...
3,Samsung created the LCD and other flat screen ...
4,Here are the SEC requirements: The federal sec...


In [14]:
openai.api_key = os.getenv("OPENAI_API_KEY")

In [7]:
import pinecone

# get API key from app.pinecone.io and environment from console
pinecone.init(
    api_key='97892a63-64e0-41bd-a60c-f7bc053951bd',
    environment='gcp-starter'
)

In [8]:
import time

index_name = 'llama-2-rag'

if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=1536,
        metric='cosine'
    )
    # wait for index to finish initialization
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

index = pinecone.Index(index_name)

In [9]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [15]:
from langchain.embeddings.openai import OpenAIEmbeddings

embed_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=openai.api_key)

In [20]:
from tqdm.auto import tqdm  # for progress bar

batch_size = 100

for i in tqdm(range(0, len(corpus_dataset), batch_size)):
    i_end = min(len(corpus_dataset), i + batch_size)
    # get batch of data
    batch = corpus_dataset.iloc[i:i_end]
    # generate string ids for each chunk based on the DataFrame index
    ids = [str(id_num) for id_num in range(i, i_end)]
    # get text to embed
    texts = [x['doc'] for _, x in batch.iterrows()]
    # embed text
    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'text': x['doc']} for _, x in batch.iterrows()
    ]

  0%|          | 0/577 [00:00<?, ?it/s]

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIError: The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 363b00e9b2e873d44879cf8181ebcf5e in your message.) {
  "error": {
    "message": "The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 363b00e9b2e873d44879cf8181ebcf5e in your message.)",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 363b00e9b2e8

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIError: The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 30ebd611a22b77645e7fe457f35920de in your message.) {
  "error": {
    "message": "The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 30ebd611a22b77645e7fe457f35920de in your message.)",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 30ebd611a22b

In [21]:
# add to Pinecone
index.upsert(vectors=zip(ids, embeds, metadata))

{'upserted_count': 38}

In [22]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [26]:
from langchain.vectorstores import Pinecone

text_field = "text"  # the metadata field that contains our text

# initialize the vector store object
vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

In [27]:
query = "Income tax exemptions for small business?"

vectorstore.similarity_search(query, k=3)

[Document(page_content="You are in business for yourself. You file Schedule C with your income tax return, and can deduct the business expenses and the cost of goods sold from the gross receipts of your business. If you have inventory (things bought but not yet sold by the end of the year of purchase), then there are other calculations that need to be done.  You will have to pay income tax as well as Social Security and Medicare taxes (both the employee's share and the employer's share) on the net profits from this business activity."),
 Document(page_content='"What they are doing is wrong. The IRS and the state might not be happy with what they are doing.  One thing you can ask for them to do is to give you a credit card for business and travel expenses. You will still have to submit receipts for expenses, but it will also make it clear to the IRS that these checks are not income. Keep the pay stubs for the year, or the pdf files if they don\'t give you a physical stub. Pay attention 

In [40]:
def augment_prompt(query: str):
    # get top 3 results from knowledge base
    results = vectorstore.similarity_search(query, k=5)
    contexts = [x.page_content for x in results]
    # get the text from the results
    source_knowledge = "\n".join(contexts)
    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.

    Contexts:
    {source_knowledge}

    Query: {query}"""
    return contexts, augmented_prompt

In [1]:
from langchain.chat_models import ChatOpenAI

In [6]:
chat = ChatOpenAI(
    openai_api_key=openai.api_key,
    model='gpt-4-1106-preview'
)

In [7]:
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)

In [8]:
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    HumanMessage(content="I'd like to understand string theory.")
]

In [None]:
res = chat(messages)
res

In [58]:
query_dataset = load_dataset("explodinggradients/fiqa", "main")
query_dataset = query_dataset['validation'].to_pandas()
query_dataset_sampled = query_dataset.sample(n=100, random_state=1)

Found cached dataset fiqa (/Users/deathscope/.cache/huggingface/datasets/explodinggradients___fiqa/main/1.0.0/3dc7b639f5b4b16509a3299a2ceb78bf5fe98ee6b5fee25e7d5e4d290c88efb8)


  0%|          | 0/3 [00:00<?, ?it/s]

In [60]:
# Initialize lists to store the new columns' data.
contexts_list = []
answers_list = []

# Iterate over each question in the DataFrame.
for question in tqdm(query_dataset_sampled['question'], desc="Processing questions"):
    contexts, prompt = augment_prompt(question)  # Get contexts and prompt for the question.
    contexts_list.append(contexts)  # Append contexts to the list.

    # Create a new user prompt and add to messages.
    messages = [SystemMessage(content="You are a helpful assistant."), HumanMessage(content=prompt)]

    # Simulate a chat response.
    res = chat(messages)  # In practice, this would call the actual chat function.
    answers_list.append(res.content)  # Append the content of the response to the list.

Processing questions:   0%|          | 0/100 [00:00<?, ?it/s]

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retr

In [61]:
# Add the new columns to the DataFrame.
query_dataset_sampled['contexts'] = contexts_list
query_dataset_sampled['answer'] = answers_list

In [71]:
query_dataset_sampled.to_csv('ragas_dataset.csv')

In [77]:
query_dataset_hf = query_dataset_sampled.sample(n=30, random_state=1)
query_dataset_hf = Dataset.from_pandas(query_dataset_hf)

In [80]:
from ragas.metrics import AnswerRelevancy
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en')
answer_relevancy = AnswerRelevancy(
    embeddings=embeddings
)

# init_model to load models used
answer_relevancy.init_model()

results = answer_relevancy.score(query_dataset_hf)

print(results.to_pandas())

100%|█████████████████████████████████████████████| 2/2 [02:14<00:00, 67.03s/it]

                                             question  \
0   How can I get the car refinanced under my name...   
1   Isn't an Initial Coin Offering (ICO) a surefir...   
2   Why do people sell when demand pushes share pr...   
3   Stopping Payment on a Check--How Long Does it ...   
4   Borrow money to invest in a business venture w...   
5   Is forward P/E calculated using current price(...   
6   Is there a term for the risk of investing in a...   
7   What kind of value do retail investors look fo...   
8   Is it possible to influence a company's action...   
9   What is the best way to learn investing techni...   
10  Could there be an interest for a company to ma...   
11                        Placing bid in market price   
12       What is the Blue Line in these stock Charts?   
13         Personal credit card for business expenses   
14  Home (re)Finance and Providing Additional Info...   
15  Ideas on how to invest a relatively small amou...   
16               What is the pu




In [81]:
from ragas.metrics.faithfulness import Faithfulness
faithfulness = Faithfulness(
    batch_size = 10
)
faith_results = faithfulness.score(query_dataset_hf)
print(faith_results)

 33%|██████████████▋                             | 1/3 [05:00<10:00, 300.17s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..
100%|████████████████████████████████████████████| 3/3 [15:53<00:00, 317.93s/it]

Dataset({
    features: ['question', 'ground_truths', 'contexts', 'answer', '__index_level_0__', 'faithfulness'],
    num_rows: 30
})





In [82]:
from ragas.metrics import ContextPrecision
context_precision = ContextPrecision()

cp_results = context_precision.score(query_dataset_hf)
print(cp_results)

  0%|                                                     | 0/2 [00:00<?, ?it/s]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
 50%|██████████████████████                      | 1/2 [11:40<11:40, 700.20s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
100%|███████████████████████████████████████████| 2/2 [34:57<00:00, 1048.58s/it]

Dataset({
    features: ['question', 'ground_truths', 'contexts', 'answer', '__index_level_0__', 'context_precision'],
    num_rows: 30
})





In [83]:
from ragas.metrics import ContextRecall
context_recall = ContextRecall(
    batch_size=10

)

cr_results = context_recall.score(query_dataset_hf)
print(cr_results)

 33%|██████████████▋                             | 1/3 [13:31<27:02, 811.32s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
 67%|███████████████████████████▎             | 2/3 [1:01:24<33:44, 2024.23s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..
100%|█████████████████████████████████████████| 3/3 [1:12:04<00:00, 1441.51s/it]

Dataset({
    features: ['question', 'ground_truths', 'contexts', 'answer', '__index_level_0__', 'context_recall'],
    num_rows: 30
})





In [85]:
results_df = results.to_pandas()
faith_results_df = faith_results.to_pandas()
cp_results_df = cp_results.to_pandas()
cr_results_df = cr_results.to_pandas()

# Calculate the average of the columns
average_answer_relevancy = results_df['answer_relevancy'].mean()
average_faithfulness = faith_results_df['faithfulness'].mean()
average_context_precision = cp_results_df['context_precision'].mean()
average_context_recall = cr_results_df['context_recall'].mean()

# You can print them out or store them as needed
averages = {
    "Average Answer Relevancy": average_answer_relevancy,
    "Average Faithfulness": average_faithfulness,
    "Average Context Precision": average_context_precision,
    "Average Context Recall": average_context_recall
}

averages

{'Average Answer Relevancy': 0.9046364800769946,
 'Average Faithfulness': 0.5152380952380953,
 'Average Context Precision': 0.16111111109861112,
 'Average Context Recall': 0.17370833912449582}

In [86]:
cp_results_df

Unnamed: 0,question,ground_truths,contexts,answer,__index_level_0__,context_precision
0,How can I get the car refinanced under my name...,"[""The best solution is to """"buy"""" the car and ...","[""It depends on when, where and how the accoun...","Based on the given contexts, it is not clear h...",260,0.0
1,Isn't an Initial Coin Offering (ICO) a surefir...,"[""My big gripe with the ICO name and correspon...",[What is the goal of the money? If it is to u...,"No, an Initial Coin Offering (ICO) is not a su...",165,0.0
2,Why do people sell when demand pushes share pr...,[You are assuming the price increase will cont...,[Investopedia has a good explanation of the te...,People may sell when demand pushes share price...,442,0.0
3,Stopping Payment on a Check--How Long Does it ...,[Is this a USA bank to a USA bank transaction?...,"[Honestly, if you're going to restrict the onl...",The provided contexts do not directly address ...,117,0.0
4,Borrow money to invest in a business venture w...,"[It's clearly a risk, but is it any different ...",[Investopedia has a good explanation of the te...,Borrowing money to invest in a business ventur...,6,0.0
5,Is forward P/E calculated using current price(...,[generally Forward P/E is computed as current ...,[Points are index based. Simple take the tota...,Forward P/E is typically calculated using the ...,289,0.0
6,Is there a term for the risk of investing in a...,"[""I'd question whether a guaranteed savings in...","[No I get that. But if you don’t want risk, t...","Yes, the term for the risk of investing in an ...",277,1.0
7,What kind of value do retail investors look fo...,"[""I'm not downvoting you because I can relate,...","[No I get that. But if you don’t want risk, t...",Retail investors typically look for two main t...,187,0.0
8,Is it possible to influence a company's action...,"[To quote Adam Smith, 'Everything is worth wha...",[Investopedia has a good explanation of the te...,"Yes, it is possible to influence a company's a...",162,0.0
9,What is the best way to learn investing techni...,"[All the things you suggest are good, but I th...","[No I get that. But if you don’t want risk, t...",The best way to learn investing techniques is ...,315,0.833333


In [93]:
# Print the first 'question'
print(cp_results_df['question'][13])

print("-"*60)

# Print the first 'question'
print(cp_results_df['ground_truths'][13])

print("-"*60)

# Print the first 'question'
print(cp_results_df['answer'][13])

print("-"*60)

# Print the first 'contexts'
print(cp_results_df['contexts'][13])

Personal credit card for business expenses
------------------------------------------------------------
["Early on, one might not be able to get credit for their business. For convenience, and the card perks, it makes sense to use the personal card. But for sake of a clean paper trail, I'd choose 1 card and use it exclusively, 100% for the business. Not one card here, one card there."
 "Do you have a separate bank account for your business?  That is generally highly recommended. I have a credit card for my single-member LLC.  I prefer it this way because it makes the separation of personal and business expenses very clear. Using a personal credit card, but using it for only business expenses seems to be a reasonable practice.  You may be able to do one better though... For your sole proprietorship, you can file a DBA which establishes the business name.  The details of this depend on your state.  With a DBA, I believe you can open a bank account in the name of your business and you may