In [1]:
!pip install -U langchain langchain-core langchain_community langchain_openai openai tiktoken psutil chromadb unstructured jq lark



In [2]:
import os
import uuid
import requests
import tempfile
from typing import List
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain_core.output_parsers import JsonOutputParser
from langchain.vectorstores import Chroma
from langchain.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import UnstructuredExcelLoader, UnstructuredXMLLoader
from langchain_community.document_loaders.json_loader import JSONLoader

In [3]:
# Colab-only setup: read the OpenAI key from the notebook's stored user data
# and export it through the environment, so no key is hardcoded in the notebook.
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [4]:
# Request model consumed by run_chain: where the source file lives, which
# loader to use for it, and the language the extracted concepts are written in.
class InputData(BaseModel):
    # Remote location of the file; it is downloaded before being parsed.
    file_url: str = Field(..., description="URL of the file to be processed")
    # Selects the loader in load_documents (compared case-insensitively there).
    file_type: str = Field(..., description="Type of the file (e.g., csv, json, xls, xlsx, xml)")
    # Inserted into the prompt as the language the model must answer in.
    lang: str = Field(..., description="Language of the document")

# One extracted concept paired with its definition; this is the element type
# of OutputData.concepts.
class ConceptDefinitionRelation(BaseModel):
    concept: str = Field(..., description="The main concept of the flashcard")
    definition: str = Field(..., description="The detailed definition of the concept")

# Top-level shape the model is asked to return; it is handed to JsonOutputParser
# to derive the format instructions embedded in the prompt.
# NOTE(review): documented with comments rather than a class docstring on
# purpose -- pydantic copies a model docstring into the generated JSON schema,
# which would change the prompt text.
class OutputData(BaseModel):
    concepts: List[ConceptDefinitionRelation] = Field(..., description="List of concepts and their definitions")

In [5]:
from datetime import datetime

class FileHandler:
    """Download a remote file to a temporary path and parse it with a loader.

    Args:
        file_loader: Loader class. It is instantiated as
            ``file_loader(file_path=...)``; ``JSONLoader`` additionally gets
            ``jq_schema="."`` and ``text_content=False``.
        file_extension: Extension (without the dot) appended to the temporary
            file name and stored in every document's ``file_type`` metadata.
        timeout: Seconds passed to ``requests.get`` so that a stalled server
            cannot block the notebook forever. Defaults to 30.
    """

    def __init__(self, file_loader, file_extension, timeout=30):
        self.file_loader = file_loader
        self.file_extension = file_extension
        self.timeout = timeout

    def load(self, url):
        """Fetch ``url``, parse it, and return the loaded documents.

        Each returned document is tagged with ``file_type`` (the handler's
        extension) and ``processed_at`` (``datetime.now()`` in ISO format).
        The temporary file is removed whether or not loading succeeds.

        Raises:
            requests.RequestException: The download failed, timed out, or
                returned an error status.
            FileNotFoundError: The loader could not be constructed.
            ValueError: The loader raised while reading the file.
        """
        # Download first so nothing is written to disk when the request fails.
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()  # Ensure the request was successful

        # Unique suffix that still ends with the expected file extension.
        unique_filename = f"{uuid.uuid4()}.{self.file_extension}"

        # delete=False: the loader reopens the file by path after it is closed,
        # so removal is handled explicitly in the `finally` below.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=unique_filename)
        temp_file_path = temp_file.name

        try:
            with temp_file:
                temp_file.write(response.content)

            try:
                if self.file_loader is JSONLoader:
                    loader = self.file_loader(file_path=temp_file_path, jq_schema=".", text_content=False)
                else:
                    loader = self.file_loader(file_path=temp_file_path)
            except Exception as e:
                print(f"No such file found at {temp_file_path}")
                raise FileNotFoundError(f"No file found at {temp_file_path}") from e

            try:
                documents = loader.load()
                for doc in documents or ():
                    doc.metadata['file_type'] = self.file_extension
                    doc.metadata['processed_at'] = datetime.now().isoformat()
            except Exception as e:
                print(f"File content might be private or unavailable or the URL is incorrect.")
                raise ValueError(f"No file content available at {temp_file_path}") from e
        finally:
            # Always clean up, including on the error paths above.
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

        return documents

In [6]:
def _load_with_handler(loader_cls, extension: str, label: str, url: str, verbose: bool):
    """Shared implementation behind the ``load_*_documents`` helpers.

    Args:
        loader_cls: Loader class handed to ``FileHandler``.
        extension: File extension handed to ``FileHandler``.
        label: Upper-case format name used in the verbose message.
        url: Remote location of the file.
        verbose: When true and documents were loaded, print a short summary.

    Returns:
        Whatever ``FileHandler.load`` returned. An empty result is returned
        as-is instead of being replaced by ``None``, so callers always get
        the handler's list and truthiness checks keep working.
    """
    docs = FileHandler(loader_cls, extension).load(url)
    if docs and verbose:
        print(f"Found {label} file")
        print(f"Loaded {len(docs)} documents")
    return docs


def load_csv_documents(csv_url: str, verbose=False):
    """Download ``csv_url`` and load it with ``CSVLoader``."""
    return _load_with_handler(CSVLoader, "csv", "CSV", csv_url, verbose)


def load_xls_documents(xls_url: str, verbose=False):
    """Download ``xls_url`` and load it with ``UnstructuredExcelLoader``."""
    return _load_with_handler(UnstructuredExcelLoader, "xls", "XLS", xls_url, verbose)


def load_xlsx_documents(xlsx_url: str, verbose=False):
    """Download ``xlsx_url`` and load it with ``UnstructuredExcelLoader``."""
    return _load_with_handler(UnstructuredExcelLoader, "xlsx", "XLSX", xlsx_url, verbose)


def load_xml_documents(xml_url: str, verbose=False):
    """Download ``xml_url`` and load it with ``UnstructuredXMLLoader``."""
    return _load_with_handler(UnstructuredXMLLoader, "xml", "XML", xml_url, verbose)


def load_json_documents(json_url: str, verbose=False):
    """Download ``json_url`` and load it with ``JSONLoader``."""
    return _load_with_handler(JSONLoader, "json", "JSON", json_url, verbose)

In [7]:
def load_documents(file_url: str, file_type: str, verbose=False) -> List[Document]:
    """Load ``file_url`` with the loader registered for ``file_type``.

    Args:
        file_url: Remote location of the file.
        file_type: One of csv, xls, xlsx, xml or json; matched
            case-insensitively.
        verbose: Forwarded to the selected loader.

    Returns:
        The value returned by the selected ``load_*_documents`` function.

    Raises:
        ValueError: ``file_type`` is not one of the supported types.
    """
    loaders_by_type = {
        "csv": load_csv_documents,
        "xls": load_xls_documents,
        "xlsx": load_xlsx_documents,
        "xml": load_xml_documents,
        "json": load_json_documents,
    }

    selected_loader = loaders_by_type.get(file_type.lower())
    if selected_loader is None:
        raise ValueError(f"Unsupported file type: {file_type}")

    return selected_loader(file_url, verbose)

In [8]:
# Shared text splitter: run_chain uses it to cut the loaded documents into
# chunks (chunk_size=1000, chunk_overlap=100) before they are embedded.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

In [9]:
# Embedding model handed to build_vectorstore when the chunks are indexed.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [10]:
# Chat model shared by the self-query retriever and the extraction chain.
# temperature=0 is presumably chosen to keep the structured output stable.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

In [11]:
# Natural-language summary of the indexed content; passed to
# SelfQueryRetriever.from_llm as `document_contents`.
document_content_description = "Documents loaded from the file, containing data in the specified language for key concept retrieving."

In [12]:
# Metadata fields the self-query retriever is told about. Both keys are the
# ones FileHandler.load writes onto every loaded document.
metadata_field_info = [
    AttributeInfo(
        name="file_type",
        description="The type of the file (e.g., csv, json, xml, xls, xlsx).",
        type="string"
    ),
    AttributeInfo(
        name="processed_at",
        description="The timestamp when the document was processed, in ISO 8601 format.",
        type="string"
    ),
]

In [13]:
def build_vectorstore(documents: List[Document], embedding_model, collection_name=None):
    """Embed ``documents`` into their own Chroma collection and return the store.

    When no collection name is given, ``Chroma.from_documents`` falls back to
    one fixed default name, and an in-memory collection outlives the Python
    object that wraps it. Calling this function twice in the same kernel would
    therefore add the second batch to the first one. A unique name per call
    keeps every run isolated.

    Args:
        documents: Chunks to index.
        embedding_model: Embedding implementation used to vectorise them.
        collection_name: Optional explicit collection name. When omitted, a
            unique ``docs-<uuid hex>`` name is generated.

    Returns:
        The Chroma vector store holding only ``documents``.
    """
    if collection_name is None:
        collection_name = f"docs-{uuid.uuid4().hex}"
    return Chroma.from_documents(documents, embedding_model, collection_name=collection_name)

In [14]:
# Parser for the model reply, configured with OutputData as the expected shape.
# Its format instructions are injected into the prompt as {format_instructions}.
parser = JsonOutputParser(pydantic_object=OutputData)
format_instructions = parser.get_format_instructions()

In [15]:
# Prompt text for the extraction step. {context}, {format_instructions} and
# {lang} are placeholders filled in through the PromptTemplate built from it.
prompt_template = """
Extract the key concepts from the following documents:

{context}

Formatting:
-----------------------------
{format_instructions}

Respond only according to the format instructions. You must respond in this language: {lang}
"""

In [16]:
# format_instructions is bound once here; context and lang are supplied on
# every invocation of the chain.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "lang"],
    partial_variables={"format_instructions": format_instructions}
)

In [17]:
# Pipeline: render the prompt, send it to the chat model, parse the reply as JSON.
chain = prompt | llm | parser

In [18]:
def run_chain(input_data: InputData):
    """Run the full pipeline: load, split, index, retrieve, extract concepts.

    Args:
        input_data: File URL, file type and answer language for this run.

    Returns:
        The parsed chain output, or ``None`` when no documents were loaded.
        The output is also printed.
    """
    documents = load_documents(input_data.file_url, input_data.file_type, verbose=True)
    if not documents:
        print("No documents loaded.")
        return

    split_docs = splitter.split_documents(documents)

    vectorstore = build_vectorstore(split_docs, embedding_model)

    try:
        retriever = SelfQueryRetriever.from_llm(
            llm=llm,
            vectorstore=vectorstore,
            document_contents=document_content_description,
            metadata_field_info=metadata_field_info,
            verbose=True,
            enable_limit=True
        )

        user_query = "Extract key concepts from all the documents"
        relevant_docs = retriever.invoke(user_query)

        context = "\n".join(doc.page_content for doc in relevant_docs)

        output = chain.invoke({'context': context, "lang": input_data.lang})
    finally:
        # `del vectorstore` only unbinds the local name. The collection has to
        # be dropped explicitly, otherwise its chunks stay in the in-process
        # Chroma client and can leak into later runs. Doing it in `finally`
        # also covers failures in retrieval or in the model call.
        vectorstore.delete_collection()

    print(output)

    return output

In [19]:
# Demo run: extract concepts from the sample JSON file, answering in English.
if __name__ == "__main__":
    input_data = InputData(
        file_url="https://raw.githubusercontent.com/AaronSosaRamos/mission-flights/refs/heads/main/files-for-test/file.json",
        file_type="json",
        lang="en"
    )

    result = run_chain(input_data)

Found JSON file
Loaded 1 documents
{'concepts': [{'concept': 'Adaptability', 'definition': 'With fine-tuning, LLMs can be adapted to perform specific tasks, from customer support to legal document analysis.'}, {'concept': 'Contextual Understanding', 'definition': 'LLMs can capture and generate coherent text based on previous context, making their responses more accurate and relevant.'}, {'concept': 'Data Bias', 'definition': 'LLMs can reflect biases present in the training data, leading to potentially harmful or unethical outputs.'}, {'concept': 'Computational Resources', 'definition': 'Training and running LLMs require substantial computational power and resources, making them costly to develop and maintain.'}, {'concept': 'Lack of Causality', 'definition': "While LLMs can generate coherent text, they lack true understanding of causality, meaning they can't reason about real-world consequences or make informed decisions based on external factors."}, {'concept': 'Incoherence in Long Te

In [20]:
# Display the parsed output of the English run.
result

{'concepts': [{'concept': 'Adaptability',
   'definition': 'With fine-tuning, LLMs can be adapted to perform specific tasks, from customer support to legal document analysis.'},
  {'concept': 'Contextual Understanding',
   'definition': 'LLMs can capture and generate coherent text based on previous context, making their responses more accurate and relevant.'},
  {'concept': 'Data Bias',
   'definition': 'LLMs can reflect biases present in the training data, leading to potentially harmful or unethical outputs.'},
  {'concept': 'Computational Resources',
   'definition': 'Training and running LLMs require substantial computational power and resources, making them costly to develop and maintain.'},
  {'concept': 'Lack of Causality',
   'definition': "While LLMs can generate coherent text, they lack true understanding of causality, meaning they can't reason about real-world consequences or make informed decisions based on external factors."},
  {'concept': 'Incoherence in Long Texts',
   '

In [19]:
# Demo run: same sample JSON file, this time asking for answers in Spanish.
if __name__ == "__main__":
    input_data = InputData(
        file_url="https://raw.githubusercontent.com/AaronSosaRamos/mission-flights/refs/heads/main/files-for-test/file.json",
        file_type="json",
        lang="es"
    )

    result2 = run_chain(input_data)

Found JSON file
Loaded 1 documents
{'concepts': [{'concept': 'Modelo de Lenguaje Grande (LLM)', 'definition': 'Un tipo de modelo de inteligencia artificial que procesa y genera texto similar al humano basado en grandes cantidades de datos textuales. Los LLM se construyen utilizando técnicas de aprendizaje profundo y se entrenan en conjuntos de datos diversos, lo que les permite entender y generar texto en varios idiomas y contextos.'}, {'concept': 'Arquitectura', 'definition': 'Los LLMs utilizan típicamente arquitecturas de transformadores, que les permiten procesar texto en paralelo, haciendo que los modelos sean altamente eficientes en la captura de dependencias a largo plazo en el texto.'}, {'concept': 'Datos de Entrenamiento', 'definition': 'Los LLMs se entrenan en vastas cantidades de datos textuales de fuentes como libros, artículos, sitios web y otros textos disponibles públicamente para aprender patrones y contextos del lenguaje.'}, {'concept': 'Tokenización', 'definition': 'Lo

In [20]:
# Display the parsed output of the Spanish run.
result2

{'concepts': [{'concept': 'Modelo de Lenguaje Grande (LLM)',
   'definition': 'Un tipo de modelo de inteligencia artificial que procesa y genera texto similar al humano basado en grandes cantidades de datos textuales. Los LLM se construyen utilizando técnicas de aprendizaje profundo y se entrenan en conjuntos de datos diversos, lo que les permite entender y generar texto en varios idiomas y contextos.'},
  {'concept': 'Arquitectura',
   'definition': 'Los LLMs utilizan típicamente arquitecturas de transformadores, que les permiten procesar texto en paralelo, haciendo que los modelos sean altamente eficientes en la captura de dependencias a largo plazo en el texto.'},
  {'concept': 'Datos de Entrenamiento',
   'definition': 'Los LLMs se entrenan en vastas cantidades de datos textuales de fuentes como libros, artículos, sitios web y otros textos disponibles públicamente para aprender patrones y contextos del lenguaje.'},
  {'concept': 'Tokenización',
   'definition': 'Los LLMs utilizan l