In [1]:
from mistralai import Mistral
import os
import faiss
from langchain.text_splitter import MarkdownTextSplitter
import time
import random
import numpy as np

directory = "/home/adrian/Documents/University Work/Part III Project/PaperQA2/LitQA2_Papers_Parsing"

mistral_client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])

def get_text_embedding(input):
    embeddings_batch_response = mistral_client.embeddings.create(
        model="mistral-embed",
        inputs=input
    )
    return embeddings_batch_response.data[0].embedding

full_paths = [os.path.abspath(os.path.join(directory, f)) for f in os.listdir(directory)]
chunks=[]
for doc in (full_paths):
    uploaded_pdf = mistral_client.files.upload(
        file={
            "file_name": "uploaded_file.pdf",
            "content": open(doc, "rb"),
        },
        purpose="ocr"
    )
    signed_url = mistral_client.files.get_signed_url(file_id=uploaded_pdf.id)
    ocr_response = mistral_client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": signed_url.url,
        }
    )
    for i in range(len(ocr_response.pages)):
        #default chunking strategy used by OpenAI file search
        splitter = MarkdownTextSplitter(chunk_size=800, chunk_overlap=400)
        chunks.extend(splitter.split_text(ocr_response.pages[i].markdown))
text_embeddings=[]
count=0
for chunk in chunks:
    print("Vector Store Progress: "+str(count/len(chunks)*100)+"%", end="\r", flush=True)
    count+=1
    text_embeddings.append(get_text_embedding(chunk))
    wait_time = random.random()
    time.sleep(wait_time)
text_embeddings = np.array(text_embeddings)

np.save('text_embeddings_LitQA2.npy', text_embeddings)

# Optionally, you might want to save the chunks as well for reference
with open('chunks_LitQA2.txt', 'w', encoding='utf-8') as f:
    for chunk in chunks:
        f.write(chunk + '\n---\n')

Vector Store Progress: 99.99570095868621%%%%