In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pathlib
import os
from dotenv import load_dotenv

In [2]:
# The .env file is expected two directory levels above the current working directory.
path_env = pathlib.Path.cwd().parent.parent / ".env"
path_env

PosixPath('/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/.env')

In [3]:
# Load the variables declared in the .env file into the process environment,
# then read the OpenAI key back from it.
load_dotenv(path_env)
api_key = os.getenv("OPENAI_API_KEY")

# os.getenv returns None when the variable is missing; assigning None into
# os.environ would fail with an opaque "str expected, not NoneType" TypeError,
# so fail early with a message that says what to fix.
if not api_key:
    raise RuntimeError(
        f"OPENAI_API_KEY is not set: define it in {path_env} or export it "
        "in the environment before running this notebook."
    )

os.environ["OPENAI_API_KEY"] = api_key

## GPTCache

In [4]:
# Uncomment to install GPTCache into the kernel's environment if it is missing:
# %pip install gptcache

In [5]:
# Get the content (only the question) from the prompt to cache.
def get_msg_func(data, **_):
    """Return the ``content`` of the last entry under ``data["messages"]``.

    Args:
        data: Mapping with a ``"messages"`` key holding a sequence of
            objects that expose a ``content`` attribute.
        **_: Extra keyword arguments are accepted and ignored.

    Returns:
        The ``content`` attribute of the final message in the sequence.
    """
    messages = data.get("messages")
    last_message = messages[-1]
    return last_message.content

In [9]:
from gptcache import cache
from gptcache.embedding import Onnx
from gptcache.manager import CacheBase, VectorBase, get_data_manager
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation


# Embedding provider: its to_embeddings method is registered below as the
# cache's embedding function, and its dimension sizes the vector store.
onnx = Onnx()
# Backing stores handed to the data manager: a 'sqlite' cache base and a
# 'faiss' vector base created with the embedding dimension.
cache_base = CacheBase('sqlite')
vector_base = VectorBase('faiss', dimension=onnx.dimension)
data_manager = get_data_manager(cache_base, vector_base)
# Configure the imported `cache` object: get_msg_func selects the text to
# embed, onnx.to_embeddings embeds it, data_manager provides storage, and
# SearchDistanceEvaluation is used as the similarity evaluator.
cache.init(
    pre_embedding_func=get_msg_func,
    embedding_func=onnx.to_embeddings,
    data_manager=data_manager,
    similarity_evaluation=SearchDistanceEvaluation(),
    )
# NOTE(review): presumably reads OPENAI_API_KEY from the environment set in an
# earlier cell — confirm against the GPTCache documentation.
cache.set_openai_key()

In [12]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader

# Splitter shared by the QA chain: chunk_size=2000 with chunk_overlap=500
# (lengths are presumably counted in characters — confirm the splitter's default).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=500,
)

In [11]:
# All locations below hang off a single project checkout, so moving the
# project only requires editing this one line.
path_project_root = "/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge"
path_corpus_rosie = f"{path_project_root}/data/source/corpus_rosie"

# Original Rosie corpora
path_orig_corpus_es = f"{path_corpus_rosie}/corpus_strict_v2.0_es_compiled_documents_lang.parquet"
path_orig_corpus_en = f"{path_corpus_rosie}/corpus_strict_v3.0_en_compiled_documents_lang.parquet"

# Processed Rosie corpora
path_df_processed = f"{path_corpus_rosie}/df_0.1.parquet"

# Directory with the CSV files used for the LangChain index
path_to_index = f"{path_corpus_rosie}/to_index"

# Path to models
model_path = f"{path_project_root}/data/models/LDA/rosie_0.1_100"

In [14]:
# Create index
# NOTE(review): test_es is built with the "EN_" prefix despite its name —
# confirm which file was intended.
index_dir = pathlib.Path(path_to_index)
test_en = index_dir / f"EN_{4}.csv"
test_es = index_dir / f"EN_{36}.csv"

# Column names handed to the CSV reader; "doc_id" is also used as the source
# of every loaded document.
# NOTE(review): the document displayed later in this notebook carries the
# whole row under doc_id with text/url set to None, so these column names may
# not match the file's actual layout — confirm.
csv_reader_args = {"fieldnames": ["doc_id", "text", "url"]}
loader_en = CSVLoader(
    file_path=test_en,
    source_column="doc_id",
    csv_args=csv_reader_args,
)

data_en = loader_en.load()

In [24]:
# Pick the third loaded document. Reuse data_en (the result of
# loader_en.load() from the index-creation cell) instead of reading and
# parsing the CSV file a second time.
doc = data_en[2]

In [28]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain
from gptcache.adapter.langchain_models import LangChainChat

# Chat model created with temperature=0. This instance is used directly, i.e.
# it is not wrapped by the GPTCache adapter.
chat = ChatOpenAI(temperature=0)
# To cache with GPTCache, use the wrapped model below instead:
#chat = LangChainChat(chat=ChatOpenAI(temperature=0))

# Build the question/answer generation chain with the shared text_splitter,
# then run it on the text of the selected document.
chain = QAGenerationChain.from_llm(chat, text_splitter=text_splitter)
qa = chain.run(doc.page_content)
# Display the result (a list of dicts with 'question' and 'answer' keys, as
# shown in the cell output).
qa

[{'question': "What is Kostmann's syndrome characterized by?",
  'answer': "Kostmann's syndrome is characterized by low neutrophils, one of the types of infection-fighting cells."}]

In [29]:
# Display the generated question/answer pairs again.
qa

[{'question': "What is Kostmann's syndrome characterized by?",
  'answer': "Kostmann's syndrome is characterized by low neutrophils, one of the types of infection-fighting cells."}]

In [30]:
# Inspect the document whose page_content was passed to the chain.
doc

Document(page_content="doc_id: Kostmann's Syndrome What is Kostmann’s syndrome? Kostmann’s syndrome is one of severe congenital neutropenias. Children are born with this disorder that is characterized by low neutrophils, one on the types of infection-fighting cells. Children with Kostmann’s syndrome usually present with severe and recurrent infections, mainly in the respiratory tract and skin. Diagnosing Kostmann’s syndrome is generally done by a physical exam, medical history and confirmed with blood, genetic and molecular tests.\ntext: None\nurl: None", metadata={'source': "Kostmann's Syndrome What is Kostmann’s syndrome? Kostmann’s syndrome is one of severe congenital neutropenias. Children are born with this disorder that is characterized by low neutrophils, one on the types of infection-fighting cells. Children with Kostmann’s syndrome usually present with severe and recurrent infections, mainly in the respiratory tract and skin. Diagnosing Kostmann’s syndrome is generally done by