### Build Database 1 and Database 2

#### Define Functions

In [1]:
import os
from tqdm import tqdm
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders import UnstructuredMarkdownLoader

def get_files(dir_path: str) -> list:
    """Collect the paths of all .md and .txt files under ``dir_path``.

    The directory tree is traversed recursively with ``os.walk``. Files whose
    names end with any other suffix are skipped. The suffix match is
    case-sensitive.

    Args:
        dir_path: Root directory to search.

    Returns:
        A list of file paths, each built by joining the containing directory
        with the file name, in the order ``os.walk`` yields them.
    """
    wanted_suffixes = (".md", ".txt")
    return [
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(dir_path)
        for name in names
        if name.endswith(wanted_suffixes)
    ]

def get_text(target_dirs: list) -> list:
    """Load every .md and .txt file found under the given directories.

    Each directory is scanned with ``get_files``. Every returned path is then
    handed to a loader chosen by its extension, and whatever the loader's
    ``load()`` returns is appended to the result. A tqdm progress bar is
    shown per directory.

    Args:
        target_dirs: Directories to scan.

    Returns:
        A single flat list with the items loaded from all files, in the order
        the files were visited.
    """
    loaded = []  # accumulated output of every loader
    for root in target_dirs:
        for path in tqdm(get_files(root)):
            # The text after the last dot decides which loader is used.
            suffix = path.rsplit('.', 1)[-1]
            if suffix not in ('md', 'txt'):
                continue  # neither markdown nor plain text: nothing to load
            if suffix == 'md':
                reader = UnstructuredMarkdownLoader(path)
            else:
                reader = UnstructuredFileLoader(path)
            loaded += reader.load()
    return loaded

#### Load the .md and .txt files, split them into chunks, and build the two vector databases

In [3]:
# --- Source directories -----------------------------------------------------
# Directories whose .md/.txt files go into database 1
target_dirs = [
    "/root/data/InternLM",
    "/root/data/InternLM-XComposer",
    "/root/data/lagent",
    "/root/data/lmdeploy",
    "/root/data/opencompass",
    "/root/data/xtuner",
]

# Directories whose .md/.txt files go into database 2
target_dirs_2 = ["/root/data/mmdetection", "/root/data/mmengine"]

# --- Load -------------------------------------------------------------------
docs = get_text(target_dirs)
docs_2 = get_text(target_dirs_2)

# --- Shared tooling ---------------------------------------------------------
# The same splitter and the same embedding model are used for both databases.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)
embeddings = HuggingFaceEmbeddings(
    model_name="/root/data/model/sentence-transformer"  # model used to vectorize the chunks
)

# --- Split into chunks ------------------------------------------------------
split_docs = text_splitter.split_documents(docs)
split_docs_2 = text_splitter.split_documents(docs_2)

# --- Where each vector database is stored -----------------------------------
persist_directory = '/root/data_base/vector_db/chroma'
persist_directory_2 = '/root/data_base/vector_db/chroma_2'

# --- Build the vector databases ---------------------------------------------
vectordb = Chroma.from_documents(
    documents=split_docs, embedding=embeddings, persist_directory=persist_directory
)
vectordb_2 = Chroma.from_documents(
    documents=split_docs_2, embedding=embeddings, persist_directory=persist_directory_2
)

# --- Store both databases on the local disk ---------------------------------
vectordb.persist()
vectordb_2.persist()

 80%|████████  | 20/25 [00:00<00:00, 23.16it/s]

100%|██████████| 25/25 [00:00<00:00, 25.51it/s]
100%|██████████| 9/9 [00:00<00:00, 24.29it/s]
100%|██████████| 18/18 [00:00<00:00, 43.16it/s]
100%|██████████| 72/72 [00:02<00:00, 29.06it/s]
100%|██████████| 113/113 [00:04<00:00, 23.22it/s]
100%|██████████| 26/26 [00:01<00:00, 22.11it/s]
 49%|████▉     | 138/279 [00:07<00:07, 19.92it/s]No features in text.
100%|██████████| 279/279 [00:13<00:00, 21.46it/s]
100%|██████████| 128/128 [00:08<00:00, 14.80it/s]
