In [None]:
# Install required packages.
# %pip (rather than !pip) installs into the environment of the running kernel,
# and a single invocation lets pip resolve all six packages together.
%pip install --quiet langchain pdf2image pdfminer.six singlestoredb requests tiktoken

In [None]:
# Upgrade langchain-community; %pip targets the running kernel's environment.
%pip install -U langchain-community


In [None]:
# Install unstructured into the running kernel's environment.
# NOTE(review): presumably needed by the PDF loader used later — confirm.
%pip install unstructured


In [None]:
# Install pdfminer.six into the running kernel's environment.
%pip install pdfminer.six


In [None]:
# Explicit %pip magic: a bare `pip install` only works while IPython automagic is on.
%pip install pi-heif


In [None]:
# Explicit %pip magic: a bare `pip install` only works while IPython automagic is on.
%pip install unstructured_inference

In [None]:
# Install the poppler-utils system package with apt; -y answers the prompts automatically.
!apt-get install -y poppler-utils


In [None]:
# Install the tesseract-ocr system package with apt (-y answers the prompts),
# then the pytesseract Python package into the running kernel's environment.
!apt-get install -y tesseract-ocr
%pip install pytesseract


In [None]:
# Import necessary libraries
import nltk
from langchain_community.document_loaders import OnlinePDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import SingleStoreDB
import requests
import os

# Download necessary NLTK resources
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Set xAI API key.
# setdefault keeps a real key that is already exported in the environment;
# the placeholder is only written when XAI_API_KEY is not set at all.
os.environ.setdefault("XAI_API_KEY", "your-xai-api-key-here")  # Replace with your actual xAI API key

# Function to fetch embeddings from xAI API
def get_xai_embeddings(text, api_key=None, timeout=30):
    """Request an embedding for ``text`` from the xAI API.

    Args:
        text: The string to embed; sent as the ``"text"`` field of the JSON body.
        api_key: Bearer token. When omitted, the ``XAI_API_KEY`` environment
            variable is read at call time, so a key set after this function
            was defined is still picked up.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The value stored under ``"embeddings"`` in the JSON response.

    Raises:
        ValueError: If no API key was passed and ``XAI_API_KEY`` is unset or empty.
        requests.HTTPError: If the API answers with an error status.
    """
    if api_key is None:
        api_key = os.getenv("XAI_API_KEY")
    if not api_key:
        # Fail before sending a request that would carry "Bearer None".
        raise ValueError("No xAI API key: pass api_key or set the XAI_API_KEY environment variable.")

    # NOTE(review): this is the API base URL and the payload/response field names
    # are placeholders from the original code — confirm them against the xAI docs.
    url = "https://api.x.ai/v1"  # Replace with the correct xAI embeddings endpoint
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {"text": text}

    # Without a timeout, requests can block forever on an unresponsive server.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()["embeddings"]

# Fetch a PDF over HTTP and return its loaded documents
def load_pdf_from_url(url):
    """Build an ``OnlinePDFLoader`` for ``url`` and return the result of its ``load()``."""
    return OnlinePDFLoader(url).load()

# Break the loaded documents into chunks
def split_text(data, chunk_size=2000, chunk_overlap=0):
    """Split ``data`` with a ``RecursiveCharacterTextSplitter``.

    Args:
        data: Documents handed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``split_documents`` returns for ``data``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(data)

# Define function for xAI chat completion
def chat_with_xai(prompt, api_key=None, timeout=60):
    """Send ``prompt`` to the xAI chat completions API and return the reply text.

    Args:
        prompt: Content of the user message; a fixed system message
            ("You are a helpful assistant.") is sent ahead of it.
        api_key: Bearer token. When omitted, the ``XAI_API_KEY`` environment
            variable is read at call time. Never hard-code a key here.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        ValueError: If no API key was passed and ``XAI_API_KEY`` is unset or empty.
        requests.HTTPError: If the API answers with an error status.
    """
    if api_key is None:
        api_key = os.getenv("XAI_API_KEY")
    if not api_key:
        # Fail before sending a request that would carry "Bearer None".
        raise ValueError("No xAI API key: pass api_key or set the XAI_API_KEY environment variable.")

    # The messages payload and choices[0].message.content response shape belong
    # to the chat completions route, not the bare API base URL.
    url = "https://api.x.ai/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {
        "model": "grok-beta",  # Replace with the desired xAI model name
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    }

    # Without a timeout, requests can block forever on an unresponsive server.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]

# Function to use xAI for direct text prediction
def xai_predict(prompt, api_key=None, timeout=60):
    """Send ``prompt`` to the xAI text completions API and return the generated text.

    Args:
        prompt: Text sent as the ``"prompt"`` field of the JSON body.
        api_key: Bearer token. When omitted, the ``XAI_API_KEY`` environment
            variable is read at call time (the same variable the other xAI
            helpers in this file use).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``text`` of the first choice in the JSON response.

    Raises:
        ValueError: If no API key was passed and ``XAI_API_KEY`` is unset or empty.
        requests.HTTPError: If the API answers with an error status.
    """
    if api_key is None:
        api_key = os.getenv("XAI_API_KEY")
    if not api_key:
        # Fail before sending a request that would carry "Bearer None".
        raise ValueError("No xAI API key: pass api_key or set the XAI_API_KEY environment variable.")

    # Same host as the other xAI calls in this file (api.x.ai).
    # NOTE(review): confirm the completions route and model name against the xAI docs.
    url = "https://api.x.ai/v1/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {
        "model": "grok-beta",  # Replace with the desired model name
        "prompt": prompt,
        "temperature": 0.8,
    }

    # Without a timeout, requests can block forever on an unresponsive server.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()["choices"][0]["text"]

# Main script: load a PDF, index its chunks in SingleStoreDB, then answer a
# question with the best-matching chunk as context.
if __name__ == "__main__":
    # Imported inside the guard so the adapter below is only built when the script runs.
    from langchain_core.embeddings import Embeddings

    class _XAIEmbeddings(Embeddings):
        """Adapter exposing get_xai_embeddings through LangChain's Embeddings interface."""

        def embed_documents(self, texts):
            """Return one embedding per input string, in input order."""
            # assumes get_xai_embeddings returns a single vector per text,
            # as the original per-chunk list comprehension did — TODO confirm
            return [get_xai_embeddings(text) for text in texts]

        def embed_query(self, text):
            """Return the embedding for a single query string."""
            return get_xai_embeddings(text)

    # Load PDF data
    pdf_url = "https://unctad.org/system/files/official-document/wesp2023_en.pdf"
    data = load_pdf_from_url(pdf_url)

    # Check PDF loading
    if not data:
        print("Failed to load the PDF file.")
    else:
        print(f"You have {len(data)} document(s) in your data")
        print(f"There are {len(data[0].page_content)} characters in your document")

        # Split text into chunks
        texts = split_text(data)
        print(f"You have {len(texts)} chunks after splitting.")

        # The vector store needs an Embeddings object, not precomputed vectors:
        # from_documents embeds the chunks itself and similarity_search embeds
        # the query through the same object.
        embeddings = _XAIEmbeddings()

        # Create a vector store in SingleStoreDB
        # NOTE(review): no connection settings are passed here; presumably they
        # come from the environment (e.g. SINGLESTOREDB_URL) — confirm.
        docsearch = SingleStoreDB.from_documents(
            texts,
            embedding=embeddings,
            table_name="pdf_wes",
        )

        # Query and retrieve documents
        query = "What is Tunisia's GDP growth projected to be?"
        docs = docsearch.similarity_search(query)
        if docs:
            print(f"Most relevant document: {docs[0].page_content}")

            # Ask the question using xAI, giving it the best-matching chunk as context
            prompt = f"The user asked: {query}. The most similar text from the document is: {docs[0].page_content}"
            response = chat_with_xai(prompt)
            print("Response from xAI Chat Model:")
            print(response)

            # Direct text prediction using xAI LLM (question only, no retrieved context)
            llm_prediction = xai_predict(query)
            print("Prediction from xAI LLM:")
            print(llm_prediction)
        else:
            print("No relevant documents found.")
