In [65]:
import os
import streamlit as st
import json
from pprint import pprint
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
from uuid import uuid4
from streamlit_chat import message
from streamlit_extras.colored_header import colored_header
from streamlit_extras.add_vertical_space import add_vertical_space
from langchain_streamlit_chatbot.generate_vectorstore import generate_vectorstore  
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.indexes import VectorstoreIndexCreator
import glob

In [None]:
def load_json_tree(root):
    """Load every JSON file found anywhere below ``root``.

    Args:
        root: Directory (str or Path) that is searched recursively.

    Returns:
        A list with one parsed JSON object per readable file, in sorted path
        order. Files that cannot be read or parsed are reported and skipped.
        An empty list is returned when ``root`` is not a directory.
    """
    root = Path(root)
    records = []
    if not root.is_dir():
        print(f"Data directory not found: {root}")
        return records

    # rglob only walks *below* root, so sibling folders whose names merely
    # start with the same prefix are no longer picked up.
    for path in sorted(root.rglob('*')):
        if not path.is_file():
            continue  # directories cannot be read as text
        try:
            # NOTE(review): read_text() decodes with the locale's default
            # encoding; pass encoding="utf-8" if the exports are UTF-8.
            records.append(json.loads(path.read_text()))
        except (OSError, ValueError) as err:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            print(f"Skipping {path}: {err}")

    print(f"Loaded {len(records)} records from {root}")
    return records


root_dir = r"""C:\Users\fangning.zheng\Documents\weekly summary\week07\data_ppt"""
data = load_json_tree(root_dir)
                

In [None]:
def load_quarterly_reports(base_dir, years=range(2021, 2024), quarters=range(1, 5)):
    """Load the JSON files of every quarterly report folder that exists.

    Folders are expected to be named
    ``4COffshore_MarketOverviewReport_<year>_Q<quarter>`` directly under
    ``base_dir``. Missing folders are skipped. The process working directory
    is left untouched.

    Args:
        base_dir: Directory (str or Path) containing the report folders.
        years: Iterable of report years to look for.
        quarters: Iterable of quarter numbers to look for.

    Returns:
        A list with one parsed JSON object per file, ordered by year, quarter
        and then file name.
    """
    base_dir = Path(base_dir)
    records = []
    for year in years:
        for quarter in quarters:
            report_dir = base_dir / f"4COffshore_MarketOverviewReport_{year}_Q{quarter}"
            if not report_dir.is_dir():
                continue
            print(report_dir)
            for path in sorted(report_dir.iterdir()):
                # Only regular files are loaded; nested folders are ignored.
                if path.is_file():
                    records.append(json.loads(path.read_text()))
    return records


# load json files and store into one list
data = load_quarterly_reports(
    r"""C:\Users\fangning.zheng\Documents\weekly summary\week07\data_ppt"""
)

In [91]:
# print the last loaded record (data[-1]) to check the schema of one entry
pprint(data[-1])

{'doc_name': '4COffshore_MarketOverviewReport_2023_Q2',
 'doc_quarter': '2',
 'doc_year': '2023',
 'page': '61',
 'text': 'Get in touch\n'
         'Contact us should you have any comments on the content of this '
         'report, or suggestions for future content.\n'
         'The team at 4C is always happy to help.\n'
         '\n'
         'Orbis Energy Centre, Wilde Street, Lowestoft, NR32 1XH\n'
         'Tel +44 (0)1502 307037    Email info@4coffshore.com    Web '
         'www.4coffshore.com\n'
         '\n',
 'text_type': 'text'}


# Text chunking, embedding, and indexing

In [60]:
# cl100k_base is the encoding tiktoken associates with gpt-3.5-turbo; it is
# requested by name here, so no separate model lookup is needed.
tokenizer = tiktoken.get_encoding('cl100k_base')

# token-based length function used to measure chunk sizes
def tiktoken_len(text):
    """Return the number of tokens the module-level ``tokenizer`` yields for ``text``.

    ``disallowed_special=()`` switches off the special-token check, so text that
    happens to contain a special-token string is encoded as ordinary text.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Text splitter configuration. Sizes are measured with tiktoken_len.
CHUNK_SIZE = 400
CHUNK_OVERLAP = 20
# Separators passed to the splitter: paragraph break, line break, space, empty string.
SEPARATORS = ["\n\n", "\n", " ", ""]

text_splitter = RecursiveCharacterTextSplitter(
    separators=SEPARATORS,
    length_function=tiktoken_len,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Embedding client: the API key is read from Streamlit's secrets store under
# the "openai_pass" entry, and the model is selected by name.
OPENAI_API_KEY = st.secrets["openai_pass"]
model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

In [61]:
# Record fields that are copied unchanged into the metadata of every chunk.
METADATA_FIELDS = ('doc_name', 'doc_quarter', 'doc_year', 'page', 'text_type')

docs = []
for entry in data:
    # Metadata shared by all chunks cut from this record.
    shared_meta = {field: entry[field] for field in METADATA_FIELDS}

    # Split the record text and wrap each piece in a Document whose metadata
    # carries the chunk's position within the record plus the shared fields.
    for chunk_no, chunk_text in enumerate(text_splitter.split_text(entry['text'])):
        docs.append(
            Document(
                page_content=chunk_text,
                metadata={"chunk": chunk_no, **shared_meta},
            )
        )


In [None]:
# Show how many chunks were produced and preview only the first few, rather
# than dumping every Document into the cell output.
print(f"{len(docs)} chunks")
pprint(docs[:3])

In [63]:
# Build the FAISS vector store from the chunked documents using the
# OpenAI embedding client configured above.
vector_db = FAISS.from_documents(documents=docs, embedding=embed)

In [64]:
# Switch into the chatbot's vector_store folder and write the index there under
# a relative name; the load_local call that follows uses the same relative
# name, so it depends on this working directory.
vector_store_dir = r"""C:\Users\fangning.zheng\Documents\weekly summary\week05\nlp-experiments-gists\fangzhen\langchain_streamlit_chatbot\src\langchain_streamlit_chatbot\vector_store"""
os.chdir(vector_store_dir)
vector_db.save_local("foreseeGlobalMarket_index")

In [57]:
# Sanity check: reload the saved index from the current working directory.
# The loaded store is only displayed, not assigned to a variable.
FAISS.load_local(folder_path="foreseeGlobalMarket_index", embeddings=embed)

<langchain.vectorstores.faiss.FAISS at 0x1d585b84cd0>

In [None]:
# Inspect the docstore behind the vector store by displaying its `_dict`
# attribute in full.
# NOTE(review): `_dict` is a private attribute (leading underscore) and may
# change between library versions; presumably it maps ids to Documents — confirm.
vector_db.docstore._dict