# Step 1: Setting Up the Environment

In [25]:
pip install PyPDF2 python-docx requests faiss-cpu




# Step 2: Document Processing

In [26]:
from PyPDF2 import PdfReader
from docx import Document

def read_txt(file_path, encoding="utf-8"):
    """Read and return the full text of a .txt file.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file contents are not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

def read_pdf(file_path):
    """Read text from a .pdf file.

    Opens the file with ``PdfReader`` and concatenates the extracted text of
    every page, appending a newline after each page.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A single string with each page's text followed by ``"\\n"``. A page
        whose extraction yields nothing contributes only the newline.
    """
    reader = PdfReader(file_path)
    # `or ""` guards against extract_text() yielding None for a page (e.g. a
    # page with no extractable text); None + "\n" would raise TypeError.
    page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
    return "".join(page_texts)

def read_docx(file_path):
    """Return the text of a .docx file, one paragraph per line."""
    paragraphs = Document(file_path).paragraphs
    lines = []
    for paragraph in paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)


# Step 3: Using Mistral API for Embeddings

In [27]:
import requests

def generate_embeddings(text, api_key, timeout=30):
    """Request an embedding for ``text`` from the Mistral embeddings endpoint.

    Args:
        text: The input sent as the ``"input"`` field of the request body.
        api_key: Mistral API key, sent as a Bearer token.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        The first embedding in the response (``data[0]["embedding"]``), or
        ``None`` if the request failed or the response could not be parsed.
        Failures are reported with ``print`` rather than raised, so callers
        must check for ``None``.
    """
    url = "https://api.mistral.ai/v1/embeddings"  # Mistral embeddings endpoint
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "mistral-embed",
        "input": text
    }

    try:
        response = requests.post(url, headers=headers, json=data, timeout=timeout)
        response.raise_for_status()  # raise error if not 200
        return response.json()["data"][0]["embedding"]
    except requests.exceptions.HTTPError as http_err:
        # `response` is always bound here: HTTPError comes from raise_for_status().
        print(f"HTTP error: {http_err}, Response: {response.text}")
    except Exception as err:
        # Covers connection errors, timeouts, and unexpected response shapes.
        print(f"Other error: {err}")
    # Best-effort contract: signal failure explicitly instead of falling off the end.
    return None

# Example usage
import os

# Read the key from the environment rather than committing it in the notebook;
# later cells use this module-level `api_key` as well.
api_key = os.environ.get("MISTRAL_API_KEY", "")
text = "Marijuana, also known as cannabis, is a psychoactive drug from the Cannabis plant."

if api_key:
    embedding = generate_embeddings(text, api_key)
    if embedding:
        print("Embedding length:", len(embedding))
else:
    # Without a key the request cannot succeed, so skip the network call.
    embedding = None
    print("Set the MISTRAL_API_KEY environment variable to call the Mistral API.")


Embedding length: 1024


# Step 4: Storing Embeddings in a Vector Store

In [28]:
import faiss
import numpy as np

def create_vector_store(embeddings):
    """Create a FAISS index for the given embeddings.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dimension)``. Nested
            lists are accepted; values are converted to float32.

    Returns:
        A ``faiss.IndexFlatL2`` of the embeddings' dimension with every row of
        ``embeddings`` added to it.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or has a zero-width second
            axis (for example because it is empty or a single flat vector).
    """
    # FAISS operates on C-contiguous float32 data; normalise up front so that
    # float64 arrays or plain lists do not depend on implicit conversion.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[1] == 0:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n, dim); got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

def retrieve_relevant_chunks(query, index, sentences, k=3):
    """Retrieve relevant text chunks for a given query.

    Embeds ``query`` with ``generate_embeddings`` (using the module-level
    ``api_key``), searches ``index`` for the ``k`` nearest vectors, and maps
    the returned positions back to ``sentences``.

    Args:
        query: The query text to embed.
        index: An object with a FAISS-style ``search(vectors, k)`` method
            returning ``(distances, indices)``.
        sentences: Sequence of chunks, positionally aligned with the vectors
            stored in ``index``.
        k: Maximum number of chunks to return.

    Returns:
        A list of at most ``k`` chunks, in the order returned by the search.
        Empty if the query could not be embedded.
    """
    query_embedding = generate_embeddings(query, api_key)
    if query_embedding is None:
        # generate_embeddings prints the failure and returns None; there is
        # nothing to search with.
        return []

    # FAISS expects a float32 matrix with one row per query.
    query_vector = np.asarray([query_embedding], dtype=np.float32)
    _, indices = index.search(query_vector, k)

    # When the index holds fewer than k vectors FAISS pads the result with -1;
    # without this filter, -1 would silently select the last sentence.
    return [sentences[i] for i in indices[0] if 0 <= i < len(sentences)]


# Step 5: Process each document

In [29]:
# Input documents to index; edit if needed.
file_paths = [
    "marijuana_info.docx",
    "marijuana_info.pdf",
    "marijuana_info.txt",
]

# Parallel accumulators: sources[i] is the file that sentences[i] came from.
sentences = []
sources = []

def split_lines_keep_text(txt):
    """Split ``txt`` on newlines and return the non-blank lines, stripped."""
    kept = []
    for raw_line in txt.split("\n"):
        trimmed = raw_line.strip()
        if trimmed:
            # Whitespace-only lines are dropped; kept lines lose surrounding whitespace.
            kept.append(trimmed)
    return kept

for fp in file_paths:
    # Choose the reader from the file extension; unsupported files are skipped.
    if fp.endswith('.txt'):
        reader = read_txt
    elif fp.endswith('.pdf'):
        reader = read_pdf
    elif fp.endswith('.docx'):
        reader = read_docx
    else:
        continue

    lines = split_lines_keep_text(reader(fp))
    # Keep the two lists aligned: one source entry per extracted line.
    sentences += lines
    sources += [fp] * len(lines)

print(f"Loaded {len(sentences)} chunks from {len(file_paths)} files.")


Loaded 25 chunks from 3 files.


In [30]:
# Rebuild `sentences` from scratch: one entry per non-blank line of each file.
file_paths = ["marijuana_info.docx", "marijuana_info.pdf", "marijuana_info.txt"]
sentences = []

for file_path in file_paths:
    if file_path.endswith('.txt'):
        content = read_txt(file_path)
    elif file_path.endswith('.pdf'):
        content = read_pdf(file_path)
    elif file_path.endswith('.docx'):
        content = read_docx(file_path)
    else:
        # Unsupported extension: skip it. Without this branch `content` would
        # be unbound (NameError) on the first file, or would still hold the
        # previous file's text and be appended a second time.
        continue

    # Blank lines are dropped; kept lines are stored as-is (not stripped).
    sentences.extend([s for s in content.split('\n') if s.strip()])

# Step 6: Generate embeddings using Mistral API

In [31]:
# One API call per chunk; generate_embeddings returns None when a call fails.
raw_embeddings = [generate_embeddings(sentence, api_key) for sentence in sentences]

# Fail fast with a clear message: a None entry would otherwise produce a
# ragged/object array and an obscure error when the index is built.
failed_positions = [i for i, emb in enumerate(raw_embeddings) if emb is None]
if failed_positions:
    raise RuntimeError(
        f"Embedding failed for {len(failed_positions)} of {len(sentences)} chunks "
        f"(first failing index: {failed_positions[0]})."
    )

# float32 is the dtype FAISS works with; rows stay aligned with `sentences`.
embeddings = np.array(raw_embeddings, dtype=np.float32)


# Step 7: Create a vector store


In [32]:
# Build the FAISS index over all chunk embeddings.
index = create_vector_store(embeddings=embeddings)

# Step 8: Test with the example query

In [33]:
# Example query run against the index built above.
query = "What are the medicinal benefits of marijuana?"
relevant_chunks = retrieve_relevant_chunks(
    query=query,
    index=index,
    sentences=sentences,
)
print(f"Relevant chunks for '{query}': {relevant_chunks}")

Relevant chunks for 'What are the medicinal benefits of marijuana?': ['In recent years, many countries and states have moved toward the legalization or decriminalization of marijuana, particularly for medicinal use. Studies suggest that cannabis may help alleviate symptoms associated with chronic pain, epilepsy, anxiety, and multiple sclerosis.', 'In recent years, many countries and states have moved toward the legalization or decriminalization of marijuana, particularly for medicinal use. Studies suggest that cannabis may help alleviate symptoms associated with chronic pain, epilepsy, anxiety, and multiple sclerosis.', 'Despite its benefits, marijuana use also carries risks, especially when used heavily or at a young age. It may impair short-term memory, judgment, and motor coordination, and its use has been linked to mental health issues in some individuals.']
