# Book 1: Data Extraction from the SEC Website

## Extracting data using BeautifulSoup4
1. Install bs4



In [1]:
%pip install bs4

Note: you may need to restart the kernel to use updated packages.


2. Extract the links from the SEC Government Website


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import BytesIO
from pypdf import PdfReader


url = 'https://www.sec.gov/forms'  
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

pdf_links = []

for row in soup.select('tr'):
    columns = row.find_all('td')
    if columns:
        pdf_link = columns[1].find('a', href=True)
        pdf_info = [column.text.strip() for column in columns]
        if pdf_link and pdf_link['href'].endswith('.pdf'):
            pdf_info.append(pdf_link['href'])
            pdf_links.append(pdf_info)


dfs = []


for pdf_info in pdf_links:
    pdf_url = f'https://www.sec.gov{pdf_info[-1]}'  # The PDF link is in the last position
    response = requests.get(pdf_url)
    pdf_content = response.content


    pdf_file = BytesIO(pdf_content)
    pdf_reader = PdfReader(pdf_file)
    
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()

    if text:
        # Split the PDF text 
        sections = text.split("\n")  

        #(fewer than 40 tokens)
        filtered_sections = [section for section in sections if len(section.split()) > 40]
        

        if filtered_sections:
            
            topics = pdf_info[4].split("Topic(s):")[1].strip()
            number = pdf_info[0].split("Number:")[1].strip()
            sec_number = pdf_info[3].split("SEC Number:")[1].strip()
            last_updated = pdf_info[2].split("Last Updated:")[1].strip()
            data = {
                "Number": number, 
                "Last Updated": last_updated, 
                "SEC Number": sec_number,  
                "Topics": topics,  
                "Context": filtered_sections
            }
            df = pd.DataFrame(data)
            dfs.append(df)
            print(f"Filtered Sections from {pdf_url}:")
            print(df)
            
        else:
            print(f"No filtered sections found in {pdf_url}.")
    else:
        print(f"Unable to extract text from the PDF: {pdf_url}")

if dfs:
    df = pd.concat(dfs, ignore_index=True)
    print("Combined DataFrames:")
    print(df)

Filtered Sections from https://www.sec.gov/files/exam-brochure.pdf:
  Number Last Updated SEC Number Topics  \
0           Jan. 2023                     
1           Jan. 2023                     
2           Jan. 2023                     
3           Jan. 2023                     
4           Jan. 2023                     
5           Jan. 2023                     

                                             Context  
0  Commission may also share information and docu...  
1  conference with the entity to discuss any issu...  
2  being examined or inspected with written notif...  
3  voluntarily come forward with high-quality, or...  
4  whistleblower must comply with the procedures ...  
5  Company Accounting Oversight Board, or any sel...  
No filtered sections found in https://www.sec.gov/files/form1.pdf.
Filtered Sections from https://www.sec.gov/files/form1-a.pdf:
   Number Last Updated SEC Number                                    Topics  \
0     1-A   Sept. 2021     SEC486  Se

3. Save the file as SEC- PDF- Data

In [3]:
df.to_csv('sec_pdfs_data.csv', index=False)

In [4]:
if not df.empty:
    df.drop(df.index, inplace=True)

In [11]:
df.head(5)

Unnamed: 0,Number,Last Updated,SEC Number,Topics,Context
0,,Jan. 2023,,,Commission may also share information and docu...
1,,Jan. 2023,,,conference with the entity to discuss any issu...
2,,Jan. 2023,,,being examined or inspected with written notif...
3,,Jan. 2023,,,"voluntarily come forward with high-quality, or..."
4,,Jan. 2023,,,whistleblower must comply with the procedures ...


# Book 2: Install Open AI and Create a Q/A of all the PDFs

In [6]:
%pip install openai

Collecting openai
  Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 KB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tqdm
  Using cached tqdm-4.66.1-py3-none-any.whl (78 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 KB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting frozenlist>=1.1.1
  Downloading frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (225 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

### Using Da-Vinci Model for Q/A Generation

In [8]:
import openai
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()

# api_key = os.getenv('openapi_key') #Uncomment this and run the command.
# openai.api_key = api_key


df = pd.read_csv('sec_pdfs_data.csv')


qa_df =[]

def generate_question(context):
    try:
        response = openai.Completion.create(
            engine="gpt-3.5-turbo",
            prompt=f"Generate a question based on the following text:\n\n{context}\n\nQuestion:",
            temperature= 0.5,
            max_tokens=50
        )
        return response.choices[0].text.strip()
    except Exception as e:
        print(f"Error generating question: {e}")
        return ""

def generate_answer(context, question):
    try:
        response = openai.Completion.create(
            engine="gpt-3.5-turbo",
            prompt=f"Answer the following question based on the text:\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer:",
            max_tokens=100
        )
        return response.choices[0].text.strip()
    except Exception as e:
        print(f"Error generating answer: {e}")
        return ""

qa_list = []


for index, row in df.iterrows():
    context = row['Context']

    question = generate_question(context)
    if question:
    
        answer = generate_answer(context, question)
        if answer:
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            qa_list.append({'Question': question, 'Answer': answer})

qa_df = pd.DataFrame(qa_list)


Question: What are the possible uses of information and documents provided to the Commission?
Answer: The possible uses of information and documents provided to the Commission are that the Commission may share the information and documents with other regulators or authorities, and that the Commission may use the information and documents to investigate possible violations.
Question: What is typically discussed at an exit conference?
Answer: The staff typically discusses any issues that were raised during the examination and gives the entity an opportunity to provide additional relevant information, including any actions that the entity has taken or plans to take to address the issues raised.
Question: What happens if an entity does not hear back from the staff within 180 days from the completion of the on-site examination or inspection?
Answer: If an entity does not hear back from the staff within 180 days from the completion of the on-site examination or inspection, the examination or

### Save the data as a SEC- Q/A

In [10]:
qa_df.to_csv('sec_pdfs_qa.csv', index=False)

## Search File using Embedding


1. Prepare the data for Embeddings


Imports

In [38]:
%pip install spacy

Collecting spacy
  Downloading spacy-3.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting spacy-legacy<3.1.0,>=3.0.11
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting pathy>=0.10.0
  Downloading pathy-0.10.2-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 KB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.10-py3-none-any.whl (17 kB)
Collecting wasabi<1.2.0,>=0.9.1
  Downloading wasabi-1.1.2-py3-none-any.whl (27 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.1/46.1 KB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting murmurhash<

In [39]:
!python3 -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.0/en_core_web_md-3.7.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [40]:
import spacy
nlp = spacy.load("en_core_web_md")

In [33]:
%pip install scipy

Collecting scipy
  Downloading scipy-1.11.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: scipy
Successfully installed scipy-1.11.3
Note: you may need to restart the kernel to use updated packages.


In [34]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
import openai  # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
from scipy import spatial  # for calculating vector similarities for search


# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

## Created Embedding for whole data using Spacy and Vector

In [1]:
import pandas as pd
import spacy

# Load a pre-trained spaCy model
nlp = spacy.load("en_core_web_md")

# Load your dataset into a DataFrame (assuming you already have it)
df = pd.read_csv('sec_pdfs_data.csv')

def question_answering(question, df):
    # Compute the question embedding
    question_embedding = nlp(question)

    # Compute embeddings for each row in the DataFrame
    df['text_embedding'] = df['Context'].apply(lambda text: nlp(text))

    # Calculate cosine similarity between the question and text embeddings
    df['similarity'] = df['text_embedding'].apply(lambda text_embedding: question_embedding.similarity(text_embedding))

    # Find the text with the highest similarity as the answer
    best_answer = df.loc[df['similarity'].idxmax()]

    return best_answer['Context']

# Example question
question = "What will happen if a statement schedule is omitted?"

answer = question_answering(question, df)

print(f"Question: {question}")
print(f"Answer: {answer}")


Question: What will happen if a statement schedule is omitted?
Answer: money creates some conflicts with your interest.  You should understand and ask us about these conflicts because they can affect the services and investment advice we provide you.  Here are some examples to help you understand what this means.”   If you are a 


In [3]:
df['text_embedding']

0       (Commission, may, also, share, information, an...
1       (conference, with, the, entity, to, discuss, a...
2       (being, examined, or, inspected, with, written...
3       (voluntarily, come, forward, with, high, -, qu...
4       (whistleblower, must, comply, with, the, proce...
                              ...                        
1209    (Brokers, or, dealers, which, are, exempt, fro...
1210    (or, acquired, or, holds, funds, or, securitie...
1211    (is, any, person, who, works, a, minimum, of, ...
1212    (transactions, by, others, ,, such, as, volunt...
1213    (not, been, issued, ,, enter, the, control, nu...
Name: text_embedding, Length: 1214, dtype: object

## Created Embedding using Open AI Model Text Ada

In [5]:
df = pd.read_csv('sec_pdfs_data.csv')
dfcopy=df.iloc[:5].copy()
dfcopy

Unnamed: 0,Number,Last Updated,SEC Number,Topics,Context
0,,Jan. 2023,,,Commission may also share information and docu...
1,,Jan. 2023,,,conference with the entity to discuss any issu...
2,,Jan. 2023,,,being examined or inspected with written notif...
3,,Jan. 2023,,,"voluntarily come forward with high-quality, or..."
4,,Jan. 2023,,,whistleblower must comply with the procedures ...


In [9]:
import openai
import pandas as pd
import os
import numpy as np
from dotenv import load_dotenv
# imports
import openai  # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
from scipy import spatial  # for calculating vector similarities for search


# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

load_dotenv()

api_key = os.getenv('openapi_key') #Uncomment this and run the command.
openai.api_key = api_key

df = pd.read_csv('sec_pdfs_data.csv')
dfcopy=df.iloc[:5].copy()

def create_embedding(context):
    try:
        response = openai.Embedding.create(
            model=EMBEDDING_MODEL,
            input= context
        )
        embeddings = [item['embedding'] for item in response['data']]
        return embeddings 
    except Exception as e:
        print(f"Error in generating Embedding: {e}")
        return ""

dfcopy['embedding']=dfcopy['Context'].apply(create_embedding)

Error in generating Embedding: Internal server error {
    "error": {
        "message": "Internal server error",
        "type": "auth_subrequest_error",
        "param": null,
        "code": "internal_error"
    }
}
 500 {'error': {'message': 'Internal server error', 'type': 'auth_subrequest_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Fri, 20 Oct 2023 02:55:08 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '166', 'Connection': 'keep-alive', 'vary': 'Origin', 'x-request-id': '649a861b3f2afb94442f11ea8bd19f08', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'CF-Cache-Status': 'DYNAMIC', 'Server': 'cloudflare', 'CF-RAY': '818dfacc8f804d06-BOS', 'alt-svc': 'h3=":443"; ma=86400'}
Error in generating Embedding: Internal server error {
    "error": {
        "message": "Internal server error",
        "type": "auth_subrequest_error",
        "param": null,
        "code": "internal_error"
    }
}
 500 {'error': {'message': 'I

In [53]:
df.to_csv('sec_pdfs_embb.csv', index=False)

2. Search in the Embeddings

In [6]:
%pip install transformers

Collecting transformers
  Using cached transformers-4.34.0-py3-none-any.whl (7.7 MB)
Collecting tokenizers<0.15,>=0.14
  Using cached tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
Collecting safetensors>=0.3.1
  Using cached safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting filelock
  Using cached filelock-3.12.4-py3-none-any.whl (11 kB)
Collecting huggingface-hub<1.0,>=0.16.4
  Using cached huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
Collecting pyyaml>=5.1
  Using cached PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (705 kB)
Collecting regex!=2019.12.17
  Using cached regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
Collecting fsspec>=2023.5.0
  Using cached fsspec-2023.9.2-py3-none-any.whl (173 kB)
Collecting huggingface-hub<1.0,>=0.16.4
  Using cached huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
Installing collected packages: sa

In [15]:
%pip install tiktoken

8319.65s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting tiktoken
  Downloading tiktoken-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.5.1
Note: you may need to restart the kernel to use updated packages.


In [16]:
import os

# Set the TOKENIZERS_PARALLELISM environment variable to false
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Now you can run your Python code that uses the tokenizers library

In [2]:
%pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Collecting joblib>=1.1.1
  Using cached joblib-1.3.2-py3-none-any.whl (302 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.3.1 threadpoolctl-3.2.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Creating the functions for Search and Ask
 

# import pandas as pd
# import numpy as np
# import openai
# from sklearn.metrics.pairwise import cosine_similarity

# # Load your embeddings from the CSV
# df = pd.read_csv('sec_pdfs_embb.csv')

# def search(query_embedding):
#     # Calculate the cosine similarity between the query embedding and all context embeddings
#     similarities = cosine_similarity([query_embedding], np.vstack(df['embedding'].to_list()))
    
#     # Get the index of the most similar context
#     top_idx = np.argmax(similarities)
    
#     # Return the most similar context
#     return df.iloc[top_idx]['Context']

# def ask(question, context):
#     # Use GPT-3.5-turbo to generate an answer based on the context
#     response = openai.Completion.create(
#         model="gpt-turbo-3.5-turbo",
#         prompt=f"Question: {question}\nContext: {context}\nAnswer:",
#         max_tokens=150,
#         n=1,
#         stop=["\n"],
#         temperature=0.7,
#     )
    
#     return response.choices[0].text.strip()

# def get_answer(question, query_embedding):
#     context = search(query_embedding)
#     answer = ask(question, context)
#     return answer

In [11]:
#  Embedding the input



# question = 'What happens if a statement schedule is omitted?'
# from dotenv import load_dotenv
# import os

# # models
# EMBEDDING_MODEL = "text-embedding-ada-002"
# GPT_MODEL = "gpt-3.5-turbo"
# load_dotenv()

# api_key = os.getenv('openapi_key') #Uncomment this and run the command.
# openai.api_key = api_key

# def create_embedding(context):
#     try:
#         response = openai.Embedding.create(
#             model=EMBEDDING_MODEL,
#             input= context
#         )
#         embeddings = [item['embedding'] for item in response['data']]
#         flattened_embeddings = [value for sublist in embeddings for value in sublist]
#         return flattened_embeddings 
#     except Exception as e:
#         print(f"Error in generating Embedding: {e}")
#         return ""

# Search = create_embedding(question)

In [18]:
import pandas as pd
import openai
import tiktoken
import spacy
from scipy import spatial
from transformers import GPT2TokenizerFast


from dotenv import load_dotenv
import os

# # models
# EMBEDDING_MODEL = "text-embedding-ada-002"
# GPT_MODEL = "gpt-3.5-turbo"
load_dotenv()

api_key = os.getenv('openapi_key') #Uncomment this and run the command.

openai.api_key = api_key

nlp = spacy.load("en_core_web_md")


GPT_MODEL = "gpt-3.5-turbo"


tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def num_tokens(text):
    encoding = tokenizer.encode(text, add_special_tokens=False)
    return len(encoding)

def query_message(query, df, model, token_budget):
    strings, relatednesses = strings_ranked_by_relatedness(query, df)
    introduction = 'Use the relevant documents from the SEC government data to answer the subsequent question. If the answer cannot be found in the documents, write "I could not find an answer."'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        next_document = f'\n\nSEC Document Section:\n"""\n{string}\n"""'
        if num_tokens(message + next_document + question) > token_budget:
            break
        else:
            message += next_document
    return message + question

def strings_ranked_by_relatedness(query, df, top_n=100):
    
    query_embedding = nlp(query)

    
    df['text_embedding'] = df['Context'].apply(lambda text: nlp(text))

   
    df['similarity'] = df['text_embedding'].apply(lambda text_embedding: query_embedding.similarity(text_embedding))

    
    df = df.sort_values(by='similarity', ascending=False)

    
    top_strings = df['Context'][:top_n].tolist()
    top_relatednesses = df['similarity'][:top_n].tolist()

    return top_strings, top_relatednesses

def ask(question, df, model=GPT_MODEL, token_budget=4096 - 500, print_message=False):
    message = query_message(question, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "You answer questions about SEC government data."},
        {"role": "user", "content": message},
    ]
    response = openai.ChatCompletion.create(model=model, messages=messages, temperature=0)
    response_message = response["choices"][0]["message"]["content"]
    return response_message


df = pd.read_csv('sec_pdfs_data.csv')


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [19]:
question = 'What happens if a statement schedule is omitted?'
answer = ask(question, df)
print(answer)

Token indices sequence length is longer than the specified maximum sequence length for this model (1032 > 1024). Running this sequence through the model will result in indexing errors


SEC Document Section: "Schedules (or similar attachments) to the exhibits required by this Item are not required to be filed provided that they do not contain information material to an investment or voting decision and that information is not otherwise disclosed in the exhibit or the disclosure document. Each exhibit filed must contain a list briefly identifying the contents of all omitted schedules. Registrants need not prepare a separate list of omitted information if such information is already included within the report."

If a statement schedule is omitted, it is not required to be filed as long as it does not contain material information and that information is not disclosed elsewhere in the exhibit or the disclosure document. The exhibit filed must include a list briefly identifying the contents of all omitted schedules.


# Book 3: Fine Tuning and Creating a negative Feedback

## Create a fine tuning Dataset

In [2]:
import pandas as pd
df = pd.read_csv('sec_pdfs_data.csv')
df_qa= pd.read_csv('sec_pdfs_qa.csv')

In [9]:
result = pd.concat([df, df_qa], axis=1)

In [10]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(result, test_size=0.2, random_state=42)
len(train_df), len(test_df)

(971, 243)

## Create a fine-tuning Dataset for Q/A

In [None]:
import random

def get_random_similar_contexts(question, context, file_id=olympics_search_fileid, search_model='ada', max_rerank=10):
    """
    Find similar contexts to the given context using the search file
    """
    try:
        results = openai.Engine(search_model).search(
            search_model=search_model, 
            query=question, 
            max_rerank=max_rerank,
            file=file_id
        )
        candidates = []
        for result in results['data'][:3]:
            if result['text'] == context:
                continue
            candidates.append(result['text'])
        random_candidate = random.choice(candidates)
        return random_candidate
    except Exception as e:
        print(e)
        return ""

def create_fine_tuning_dataset(df, discriminator=False, n_negative=1, add_related=False):
    """
    Create a dataset for fine tuning the OpenAI model; either for a discriminator model, 
    or a model specializing in Q&A, where it says if no relevant context is found.

    Parameters
    ----------
    df: pd.DataFrame
        The dataframe containing the question, answer and context pairs
    discriminator: bool
        Whether to create a dataset for the discriminator
    n_negative: int
        The number of random negative samples to add (using a random context)
    add_related: bool
        Whether to add the related contexts to the correct context. These are hard negative examples

    Returns
    -------
    pd.DataFrame
        The dataframe containing the prompts and completions, ready for fine-tuning
    """
    rows = []
    for i, row in df.iterrows():
        for q, a in zip(("1." + row.questions).split('\n'), ("1." + row.answers).split('\n')):
            if len(q) >10 and len(a) >10:
                if discriminator:
                    rows.append({"prompt":f"{row.context}\nQuestion: {q[2:].strip()}\n Related:", "completion":f" yes"})
                else:
                    rows.append({"prompt":f"{row.context}\nQuestion: {q[2:].strip()}\nAnswer:", "completion":f" {a[2:].strip()}"})

    for i, row in df.iterrows():
        for q in ("1." + row.questions).split('\n'):
            if len(q) >10:
                for j in range(n_negative + (2 if add_related else 0)):
                    random_context = ""
                    if j == 0 and add_related:
                        # add the related contexts based on originating from the same wikipedia page
                        subset = df[(df.title == row.title) & (df.context != row.context)]
                        
                        if len(subset) < 1:
                            continue
                        random_context = subset.sample(1).iloc[0].context
                    if j == 1 and add_related:
                        # add the related contexts based on the most similar contexts according to the search
                        random_context = get_random_similar_contexts(q[2:].strip(), row.context, search_model='ada', max_rerank=10)
                    else:
                        while True:
                            # add random context, which isn't the correct context
                            random_context = df.sample(1).iloc[0].context
                            if random_context != row.context:
                                break
                    if discriminator:
                        rows.append({"prompt":f"{random_context}\nQuestion: {q[2:].strip()}\n Related:", "completion":f" no"})
                    else:
                        rows.append({"prompt":f"{random_context}\nQuestion: {q[2:].strip()}\nAnswer:", "completion":f" No appropriate context found to answer the question."})

    return pd.DataFrame(rows) 