In [1]:
import os
import streamlit as st
import pinecone
import json
from pprint import pprint
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
from uuid import uuid4
from streamlit_chat import message
from streamlit_extras.colored_header import colored_header
from streamlit_extras.add_vertical_space import add_vertical_space
from langchain.chat_models import ChatOpenAI
from langchain_streamlit_chatbot.generate_vectorstore import generate_vectorstore
from langchain.chat_models import ChatOpenAI   
from langchain.chains import ConversationalRetrievalChain   
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone


  from tqdm.autonotebook import tqdm


In [31]:
# Load every JSON record found in the quarterly report folders into one list.
# NOTE(review): data_root is a machine-specific absolute path; consider making
# it configurable so the notebook can run elsewhere.
data_root = r"""C:\Users\fangning.zheng\Documents\weekly summary\week05\nlp-experiments-gists\fangzhen\langchain_streamlit_chatbot\src\langchain_streamlit_chatbot\data_ppt"""

data = []
for i in range(4):  # last digit of the year: 2020 .. 2023
    for j in range(5):  # quarter suffix Q0 .. Q4; folders that do not exist are skipped below
        folder_name = r"""\4COffshore_MarketOverviewReport_202"""+str(i)+"_Q"+str(j)
        data_path = data_root + folder_name
        if os.path.exists(data_path):
            print(data_path)
            # List and read with explicit paths instead of os.chdir(), so the
            # kernel's working directory is not changed as a side effect.
            files = [
                f for f in os.listdir(data_path)
                if os.path.isfile(os.path.join(data_path, f))
            ]

            for file in files:
                data.append(json.loads(Path(data_path, file).read_text()))

C:\Users\fangning.zheng\Documents\weekly summary\week05\nlp-experiments-gists\fangzhen\langchain_streamlit_chatbot\src\langchain_streamlit_chatbot\data_ppt\4COffshore_MarketOverviewReport_2021_Q1
C:\Users\fangning.zheng\Documents\weekly summary\week05\nlp-experiments-gists\fangzhen\langchain_streamlit_chatbot\src\langchain_streamlit_chatbot\data_ppt\4COffshore_MarketOverviewReport_2021_Q2
C:\Users\fangning.zheng\Documents\weekly summary\week05\nlp-experiments-gists\fangzhen\langchain_streamlit_chatbot\src\langchain_streamlit_chatbot\data_ppt\4COffshore_MarketOverviewReport_2021_Q3
C:\Users\fangning.zheng\Documents\weekly summary\week05\nlp-experiments-gists\fangzhen\langchain_streamlit_chatbot\src\langchain_streamlit_chatbot\data_ppt\4COffshore_MarketOverviewReport_2021_Q4
C:\Users\fangning.zheng\Documents\weekly summary\week05\nlp-experiments-gists\fangzhen\langchain_streamlit_chatbot\src\langchain_streamlit_chatbot\data_ppt\4COffshore_MarketOverviewReport_2022_Q1
C:\Users\fangning.zh

In [32]:
# Pretty-print the last record that was loaded (data[-1]) to inspect its fields.
last_record = data[-1]
pprint(last_record)

{'doc_name': '4COffshore_MarketOverviewReport_2023_Q2',
 'doc_quarter': '2',
 'doc_year': '2023',
 'page': '61',
 'text': 'Get in touch\n'
         'Contact us should you have any comments on the content of this '
         'report, or suggestions for future content.\n'
         'The team at 4C is always happy to help.\n'
         '\n'
         'Orbis Energy Centre, Wilde Street, Lowestoft, NR32 1XH\n'
         'Tel +44 (0)1502 307037    Email info@4coffshore.com    Web '
         'www.4coffshore.com\n'
         '\n',
 'text_type': 'text'}


# Create a new index using Pinecone

In [37]:
# Name of the Pinecone index used by this notebook.
index_name = 'langchain-retrieval-marketoverview'

# Credentials are looked up in the Streamlit secrets store.
PINECONE_API_KEY = st.secrets["pinecone_pass"]  # API key: see the console at app.pinecone.io
PINECONE_ENVIRONMENT = st.secrets["pinecone_environment"]  # ENV (cloud region), shown next to the API key
OPENAI_API_KEY = st.secrets["openai_pass"]  # OpenAI API key

# Initialise the Pinecone vector database client.
pinecone.init(environment=PINECONE_ENVIRONMENT, api_key=PINECONE_API_KEY)

# Create the index only when no index with this name is listed yet.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(
        name=index_name,
        dimension=1536,  # 1536 dim of text-embedding-ada-002
        metric='cosine',
    )

In [38]:
# Open a handle to the index and display its current statistics.
index = pinecone.Index(index_name)
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

# Text chunking, embedding, and indexing

In [35]:
# Tokenizer used to measure text length in tokens.
# The former bare `tiktoken.encoding_for_model('gpt-3.5-turbo')` call was
# removed: its return value was discarded, so it had no effect on `tokenizer`.
tokenizer = tiktoken.get_encoding('cl100k_base')

# Length function handed to the text splitter.
def tiktoken_len(text):
    """Return how many tokens the module-level `tokenizer` produces for `text`.

    `disallowed_special=()` is forwarded to `tokenizer.encode`; presumably this
    lets special-token text be encoded as plain text -- confirm against the
    tiktoken documentation.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Build the text splitter; chunk sizes are measured with tiktoken_len.
splitter_options = {
    "chunk_size": 400,
    "chunk_overlap": 20,
    "length_function": tiktoken_len,
    "separators": ["\n\n", "\n", " ", ""],
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Build the OpenAI embedding client used to embed the text chunks.
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

In [None]:
# Maximum number of chunks sent in a single embed + upsert round trip.
batch_limit = 100


def _upsert_batch(batch_texts, batch_metadatas):
    """Embed one batch of chunk texts and upsert the vectors into `index`.

    Args:
        batch_texts: list of chunk strings to embed.
        batch_metadatas: list of metadata dicts, aligned one-to-one with
            `batch_texts`.

    Does nothing when the batch is empty. Each vector gets a fresh random
    uuid4 id, so re-running this cell adds new vectors rather than
    overwriting the ones from a previous run.
    """
    if not batch_texts:
        return
    ids = [str(uuid4()) for _ in batch_texts]
    embeds = embed.embed_documents(batch_texts)
    index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


texts = []
metadatas = []

for record in data:
    # Metadata fields shared by every chunk of this record.
    metadata = {
        'doc_name': record['doc_name'],
        'doc_quarter': record['doc_quarter'],
        'doc_year': record['doc_year'],
        'page': record['page'],
        'text_type': record['text_type']
    }

    # Split the record text into chunks.
    record_texts = text_splitter.split_text(record['text'])

    # Queue the chunks together with one metadata dict per chunk
    # (chunk position and chunk text plus the shared record fields).
    texts.extend(record_texts)
    metadatas.extend(
        {"chunk": j, "text": text, **metadata}
        for j, text in enumerate(record_texts)
    )

    # Flush full batches. Slicing keeps every request at or below batch_limit
    # even when a single record pushes the queue past the limit.
    while len(texts) >= batch_limit:
        _upsert_batch(texts[:batch_limit], metadatas[:batch_limit])
        texts = texts[batch_limit:]
        metadatas = metadatas[batch_limit:]

# Flush whatever is left after the last record (no-op when nothing is queued).
_upsert_batch(texts, metadatas)