In [None]:

import json
import os
import re
from random import seed,shuffle
from time import sleep

from langchain.chains.combine_documents import collapse_docs, split_list_of_docs

# from langchain.chat_models import ChatAnthropic
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema import Document, StrOutputParser
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import format_document
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

In [None]:
import pandas as pd


In [None]:
from ragchat.doc_store import DocStore
from ragchat.configs import (DEBUG, REFERENCE_FOLDER, DB_NAME, 
COLLECTION_NAME, Q_DB_NAME, Q_COLLECTION_NAME, TOKENS_PER_SYNTH_QUESTION)
from ragchat.text_embedder import TextEmbedder


In [None]:
# Chat model used both to count tokens and to generate the synthetic questions.
# temperature=0 requests the least random sampling the model offers.
llm = ChatOpenAI(
    model="gpt-3.5-turbo-1106",
    temperature=0,
)


In [None]:
# Store the source documents are read from when generating questions.
ds = DocStore(
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
)
# Second store opened on the question database/collection (not used by the visible cells).
q_ds = DocStore(
    db_name=Q_DB_NAME,
    collection_name=Q_COLLECTION_NAME,
)

In [None]:
# Generate synthetic questions for every document held in `ds`.
# `questions` collects one record per document:
#   {'questions': [<question text>, ...], 'source': {<document fields except 'cleaned'>}}
questions = []

# The prompt and chain do not change between documents, so build them once.
# `n_q` and `doc` are template variables filled in at invoke time; writing "{doc}"
# inside an f-string would instead evaluate it as a Python name.
prompt = ChatPromptTemplate.from_template(
    "Generate a list of {n_q} questions that the below document could be used to answer:\n\n{doc}"
)
chain = prompt | llm | StrOutputParser()
i = 0  # running count of documents processed

for doc_chunk in ds.yield_from_db(query={}, chunk_size=100):
    for text_result in doc_chunk:
        # Scale the number of requested questions with the document length,
        # but always ask for at least one so short documents are not skipped.
        n_tok = llm.get_num_tokens(text_result['cleaned'])
        n_q = max(1, n_tok // TOKENS_PER_SYNTH_QUESTION)

        # The text sent to the model is whatever TextEmbedder.add_title returns
        # for this document (presumably title + body — TODO confirm).
        q = chain.invoke({"n_q": n_q, "doc": TextEmbedder.add_title(text_result)})

        # The reply is expected to be a numbered list ("1. ...\n2. ...").
        # Split on the "\nN. " separators, then strip the leading "1. " that the
        # split cannot remove because no newline precedes it.
        q_list = re.split(r"\n[0-9]+\.\s", q)
        if q_list[0][:3] == "1. ":
            q_list[0] = q_list[0][3:]

        # Keep every document field except the (large) cleaned text as the source record.
        questions.append(
            {
                'questions': q_list,
                'source': {k: v for k, v in text_result.items() if k != 'cleaned'},
            }
        )
        i += 1
        sleep(0.25)  # short pause between LLM calls (presumably for rate limits)
        


In [None]:
# Persist the generated questions as JSON.
# The folder is created when missing and the file is written unconditionally,
# so running this cell again refreshes the file even if the folder already exists.
os.makedirs("questions", exist_ok=True)
with open("questions/synthetic_questions.json", "w") as f:
    # default=str: the source metadata may hold values json cannot encode natively
    # (e.g. database ids — TODO confirm); store those as strings instead of failing.
    json.dump(questions, f, default=str)


In [None]:
# Invert the per-document records into a question -> sources lookup.
# Each distinct question text maps to a list with one entry per occurrence; an entry
# carries the record's fields other than 'questions' plus the question's position
# within that record's list (as 'question_number').
question_sources = {}
for record in questions:
    for position, question in enumerate(record['questions']):
        entry = {key: value for key, value in record.items() if key != 'questions'}
        entry['question_number'] = position
        question_sources.setdefault(question, []).append(entry)


In [None]:
# Look for duplicate questions: (number of distinct questions, total questions generated).
# The two values differ when the same question text was produced more than once.
# `questions` is a list of records, so the total is summed over each record's list.
(len(question_sources), sum(len(q_dict['questions']) for q_dict in questions))


In [None]:
# Embedding model used to vectorize the question texts for the FAISS index.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
)


In [None]:
# Wrap every distinct question in a Document: the question text is the page content
# and the list of its originating sources is attached as metadata under "doc_list".
docs = [
    Document(page_content=question, metadata={"doc_list": sources})
    for question, sources in question_sources.items()
]


In [None]:
# Embed the question documents into a FAISS store in batches of `ch_sz`.
# The first batch creates the store; every later batch is appended after a short pause.
# With no documents the loop never runs and `vector_store` stays None.
vector_store = None
n = len(docs)
ch_sz = 100
for start in range(0, n, ch_sz):
    stop = start + ch_sz
    batch = docs[start:stop]
    if vector_store is not None:
        sleep(0.5)
        vector_store.add_documents(batch)
    else:
        vector_store = FAISS.from_documents(batch, embeddings)


In [None]:
# Write the FAISS question index to disk, in the same folder as the questions JSON.
faiss_index_dir = "questions/questions_FAISS_index"
vector_store.save_local(faiss_index_dir)
