In [15]:
import mistralai
import os

MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
client = mistralai.Mistral(api_key=MISTRAL_API_KEY)
client

<mistralai.sdk.Mistral at 0x7f4cebdbbdd0>

In [20]:
import os

technical_reports_count = len([f for f in os.listdir("data/pdf/technical_reports") if os.path.isfile(os.path.join("data/pdf/technical_reports", f))])
market_reports_count = len([f for f in os.listdir("data/pdf/market_reports") if os.path.isfile(os.path.join("data/pdf/market_reports", f))])

print(f"Number of documents in data/pdf/technical_reports: {technical_reports_count}")
print(f"Number of documents in data/pdf/market_reports: {market_reports_count}")

Number of documents in data/pdf/technical_reports: 68
Number of documents in data/pdf/market_reports: 805


In [28]:
from PyPDF2 import PdfReader

def count_pages_in_folder(folder_path):
    total_pages = 0
    for file in os.listdir(folder_path):
        if file.endswith(".pdf"):
            try:
                with open(os.path.join(folder_path, file), "rb") as f:
                    reader = PdfReader(f)
                    total_pages += len(reader.pages)
            except Exception as e:
                pass
    return total_pages

technical_reports_pages = count_pages_in_folder("data/pdf/technical_reports")
market_reports_pages = count_pages_in_folder("data/pdf/market_reports")

print(f"Cumulative number of pages in technical reports: {technical_reports_pages}")
print(f"Cumulative number of pages in market reports: {market_reports_pages}")

Cumulative number of pages in technical reports: 328
Cumulative number of pages in market reports: 12154


1000 pages per dollar => approximately 12$ to process all pdfs

# Converting pdf into md with mistral OCR API

In [35]:
from tqdm import tqdm
from mistralai import DocumentURLChunk
from utils import get_combined_markdown

# Create directory structure if it doesn't exist
os.makedirs("data/md/technical_reports", exist_ok=True)
os.makedirs("data/md/market_reports", exist_ok=True)

# Define folders to process
folders = [
    {"src": "data/pdf/technical_reports", "dest": "data/md/technical_reports", "prefix": "technical_report-"},
    {"src": "data/pdf/market_reports", "dest": "data/md/market_reports", "prefix": "market_report-"}
]

# Process all folders in a single loop
for folder in folders:
    errors = []

    for file in tqdm(os.listdir(folder["src"]), desc=f"Processing {folder['src']}"):
        output_file_path = f"{folder['dest']}/{file}".replace(".pdf", ".md")

        if file.endswith(".pdf") and not os.path.exists(output_file_path):
            try:
                # Upload file for OCR processing
                ocr_data = client.files.upload(
                    file={
                        "file_name": f"{folder['prefix']}{file}",
                        "content": open(f"{folder['src']}/{file}", "rb")
                    },
                    purpose="ocr"
                )
                
                # Get signed URL
                signed_url = client.files.get_signed_url(file_id=ocr_data.id, expiry=1)
                
                # Process document with OCR
                ocr_result = client.ocr.process(
                    document=DocumentURLChunk(document_url=signed_url.url),
                    model="mistral-ocr-latest",
                    include_image_base64=True
                )
                
                # Get combined markdown
                markdown_content = get_combined_markdown(ocr_result)
                
                # Write to output file
                with open(output_file_path, "w", encoding="utf-8") as f:
                    f.write(markdown_content)
                    
            except Exception as e:
                errors.append(f"{folder['src']}/{file}: {str(e)}")
                continue

    print(f"Errors: {errors}")
    print(f"Number of errors: {len(errors)}")


Processing data/pdf/technical_reports: 100%|██████████| 68/68 [00:00<00:00, 145368.33it/s]


Errors: []
Number of errors: 0


Processing data/pdf/market_reports: 100%|██████████| 805/805 [00:07<00:00, 100.63it/s]

Errors: ['data/pdf/market_reports/2024-01-12_Déclaration_d\'intention_préalable_annexe_1.pdf: API error occurred: Status 422\n{"detail": "Invalid file format.", "message": "Received file with mimetype application/vnd.openxmlformats-officedocument.spreadsheetml.sheet, only application/pdf, image/.*, application/vnd.openxmlformats-officedocument.wordprocessingml.document, application/vnd.openxmlformats-officedocument.presentationml.presentation, application/epub+zip, application/docbook+xml, application/rtf, application/vnd.oasis.opendocument.text, application/x-biblatex, application/x-bibtex, application/x-endnote+xml, application/x-fictionbook+xml, application/x-ipynb+json, application/x-jats+xml, application/x-latex, application/x-opml+xml, text/troff, text/x-dokuwiki are currently supported"}', 'data/pdf/market_reports/2024-05-13_Divers_documents_PO_autres_secteurs_Divers_documents_PO_autres_secteurs,_notamment_les.pdf: API error occurred: Status 422\n{"detail": "Invalid file format.




In [None]:
concatenated_md = '\n'.join([page.markdown for page in ocr_result.pages])

def get_embeddings_by_chunks(data, chunk_size):
    chunks = [data[x : x + chunk_size] for x in range(0, len(data), chunk_size)]
    embeddings_response = [
        client.embeddings.create(model="mistral-embed", inputs=c) for c in chunks
    ]
    return [d.embedding for e in embeddings_response for d in e.data]

document_embeddings = get_embeddings_by_chunks(concatenated_md, 4000)
document_embeddings[:5]

[[-0.01129913330078125,
  0.0103759765625,
  0.0443115234375,
  -0.000988006591796875,
  0.01351165771484375,
  0.028778076171875,
  0.030548095703125,
  0.0056304931640625,
  -0.0023517608642578125,
  0.009490966796875,
  -0.043609619140625,
  0.0635986328125,
  -0.007724761962890625,
  0.00560760498046875,
  -0.05438232421875,
  0.01837158203125,
  0.003620147705078125,
  0.037261962890625,
  0.0254364013671875,
  0.017303466796875,
  -0.032135009765625,
  -0.0300140380859375,
  -0.03778076171875,
  0.01641845703125,
  -0.0083465576171875,
  -0.0007171630859375,
  0.007415771484375,
  -0.07061767578125,
  -0.0565185546875,
  0.0233154296875,
  0.0019092559814453125,
  -0.05792236328125,
  0.0051422119140625,
  -0.0021305084228515625,
  0.00860595703125,
  -0.00225067138671875,
  -0.0411376953125,
  -0.02252197265625,
  0.0212860107421875,
  0.0037975311279296875,
  -0.036376953125,
  -0.0204010009765625,
  0.00719451904296875,
  -0.01500701904296875,
  -0.0020751953125,
  -0.01209259

In [13]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

qdrant_client = QdrantClient(":memory:")

qdrant_client.create_collection(
    collection_name="vegetables_recommandations",
    vectors_config=VectorParams(size=1024, distance=Distance.COSINE)
)

True

In [14]:
from qdrant_client.models import PointStruct

points = [
    PointStruct(
        id=idx,
        vector=document_embeddings[idx],
        payload={"text": concatenated_md[idx*4000:(idx+1)*4000]}
    ) for idx in range(len(document_embeddings))
]

qdrant_client.upsert(
    collection_name="vegetables_recommandations",
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [17]:
query_embedding = client.embeddings.create(
    model="mistral-embed",
    inputs="How to grow tomatoes in a greenhouse?"
) 

chunks = qdrant_client.search(
    collection_name="vegetables_recommandations",
    query_vector=query_embedding.data[0].embedding,
    limit=5
)

result_text = "\n\n".join(chunk.payload["text"] for chunk in chunks)
print(result_text)


| 151.6 |
| 29-42 | 1.0 | 2.0 | 80.8 | 179.6 |
| 43-56 | 1.5 | 3.0 | 101.8 | 221.5 |
| 57-77 | 2.2 | 4.4 | 148.0 | 313.9 |
| 78-98 | 2.5 | 5.0 | 200.5 | 418.9 |

SUGGESTED FERTIGATION SCHEDULE FOR TOMATO* (high soil potassium)

| Days after planting | Daily nitrogen | Daily potash | Cumulative |  |
| :--: | :--: | :--: | :--: | :--: |
|  |  |  | Nitrogen | Potash |
|  | (Ib / A) |  |  |  |
| Preplant |  |  | 50.0 | 125.0 |
| $0-14$ | 0.5 | 0.5 | 57.0 | 132.0 |
| $15-28$ | 0.7 | 0.7 | 66.8 | 141.8 |
| 29-42 | 1.0 | 1.0 | 80.8 | 155.8 |
| 43-56 | 1.5 | 1.5 | 101.8 | 176.5 |
| 57-77 | 2.2 | 2.2 | 148.0 | 223.0 |
| 78-98 | 2.5 | 2.5 | 200.5 | 275.5 |

*Adjust based on tissue analysis.

Plasticulture. Yield, fruit size, and fruit quality of fresh market tomatoes can be increased and disease and weed issues reduced by the use of raised beds covered with plastic mulch in combination with drip irrigation. Early planted crops usually benefit from the soil warming provided by black plastic mulch

  chunks = qdrant_client.search(


In [20]:
client.chat.complete(
    model="mistral-large-latest",
    messages=[
        {"role": "system", "content": "You are a helpful assistant. You will be given a user question along with a context. Your task is to answer the question based on the context or ask the user more information if needed."},
        {"role": "user", "content": "How to grow tomatoes in a greenhouse? Here is the relevant context retrieved: " + result_text}
    ]
).choices[0].message.content

'To grow tomatoes in a greenhouse, follow these steps based on the provided context:\n\n1. **Variety Selection**: Choose a greenhouse-specific variety, as they are bred for greenhouse conditions and have better disease resistance. Most greenhouse varieties are indeterminate hybrids, which yield over a long harvest season.\n\n2. **Seedling Care**:\n   - Germinate seeds in a soilless media or plug-growing mix without fertilizer.\n   - Once cotyledons are fully expanded and true leaves begin to unfold, start liquid feeding with a water-soluble fertilizer at half the recommended rate.\n   - Harden transplants by gradually exposing them to more sunlight and reducing water, but do not allow them to wilt.\n\n3. **Greenhouse Preparation**:\n   - Ensure your greenhouse has proper heating, venting, and exhaust systems to avoid plant damage from improper environmental conditions or exhaust gases.\n   - Maintain optimal temperatures for germination and growth (60°-65°F day and 50°-60°F night).\n\n

In [None]:

# Export the combined markdown to a file
markdown_content = get_combined_markdown(ocr_result)

# Define the output file path
output_file_path = "vegetable_handbook_ocr.md"

# Write the markdown content to the file
with open(output_file_path, "w", encoding="utf-8") as f:
    f.write(markdown_content)

print(f"Markdown content has been exported to {output_file_path}")

Markdown content has been exported to vegetable_handbook_ocr.md
