In [1]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import os

# Pull settings from a local .env file into the process environment.
load_dotenv()

# The OpenAI credential is mandatory: stop immediately with a clear message
# when it is missing or empty.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("API key not found in the .env file. Please ensure it is set as OPENAI_API_KEY.")

# Chat model reused by the later cells (image description and question answering).
llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=api_key)

print("Model initialized successfully!")


Model initialized successfully!


In [6]:
from langchain.document_loaders import PyMuPDFLoader, UnstructuredURLLoader, YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
import base64
import httpx
import re


# Function to load data from PDFs
def load_pdfs(file_paths):
    """Load every PDF listed in ``file_paths`` into one flat list of documents.

    Each path is opened with its own ``PyMuPDFLoader``; the documents it
    returns are collected in the order the paths were given.
    """
    return [
        doc
        for pdf_path in file_paths
        for doc in PyMuPDFLoader(pdf_path).load()
    ]


# Function to load data from URLs
def load_urls(urls):
    """Load all ``urls`` through a single ``UnstructuredURLLoader``.

    Returns the loaded documents as a new list.
    """
    url_loader = UnstructuredURLLoader(urls=urls)
    return list(url_loader.load())


# Function to load data from YouTube videos
def load_youtube(video_urls):
    """Load documents for each YouTube link in ``video_urls``.

    The 11-character video ID is extracted from each URL (the first run of
    exactly 11 ID characters that follows ``v=`` or a ``/``) and passed to
    ``YoutubeLoader``. URLs without a recognisable ID are reported with a
    printed message and skipped.

    Args:
        video_urls: Iterable of YouTube URL strings.

    Returns:
        A flat list with the documents loaded for every accepted URL, in
        input order.
    """
    # The negative lookahead only accepts runs of *exactly* 11 ID characters.
    # Without it, a longer path segment (e.g. "/feed/subscriptions" or a
    # "/channel/UC..." identifier) was truncated to its first 11 characters
    # and passed to the loader as if it were a video ID.
    video_id_pattern = r"(?:v=|\/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])"

    documents = []
    for video_url in video_urls:
        match = re.search(video_id_pattern, video_url)
        if match:
            loader = YoutubeLoader(match.group(1))  # Loader takes the bare video ID
            documents.extend(loader.load())
        else:
            print(f"Invalid YouTube URL: {video_url}")
    return documents


# Function to process images
def process_images(image_urls):
    """Describe each image with the chat model and wrap the descriptions as documents.

    Every URL is downloaded, base64-encoded into a data URL and sent to the
    module-level ``llm`` together with a short "describe this image" prompt.

    Args:
        image_urls: Iterable of image URLs to download.

    Returns:
        A list with one ``Document`` per image whose ``page_content`` is the
        model's response content.

    Raises:
        httpx.HTTPStatusError: If an image download does not end in a
            success status.
    """
    documents = []
    for image_url in image_urls:
        # Follow redirects and fail loudly on HTTP errors; otherwise an error
        # page would be base64-encoded and sent to the model as an "image".
        image_response = httpx.get(image_url, follow_redirects=True)
        image_response.raise_for_status()
        image_data = base64.b64encode(image_response.content).decode("utf-8")

        # Prefer the media type reported by the server; fall back to JPEG
        # (the previously hard-coded value) when it is missing or not an image.
        mime_type = image_response.headers.get("content-type", "").split(";")[0].strip().lower()
        if not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        # Multimodal message: a text instruction plus the inline image data URL.
        message = HumanMessage(
            content=[
                {"type": "text", "text": "Describe the content of this image. "},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{image_data}"},
                },
            ],
        )

        response = llm.invoke([message])
        documents.append(Document(page_content=response.content))
    return documents


# Function to load all types of data
def load_all_data(pdf_paths, urls, youtube_links, image_urls):
    """Run all four loaders and return their documents as one combined list.

    The result is ordered by source: PDFs, then web pages, then YouTube
    videos, then image descriptions.
    """
    return (
        load_pdfs(pdf_paths)
        + load_urls(urls)
        + load_youtube(youtube_links)
        + process_images(image_urls)
    )


# Sample inputs, one list per source type.
pdf_files = [
    r"C:\Users\dhruv\Downloads\CC_04 (1).pdf",  # NOTE: machine-specific absolute path
]
web_urls = [
    "https://www.w3schools.com/python/",
]
youtube_videos = [
    "https://youtu.be/67_aMPDk2zw?si=mqjQoQTu40xfasM7",
]
image_urls = [
    "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRhmeXu7EIsaqIOuC3nxof9Rkj1xsK8EGJRR203k13OKG9zzRxT4eyDWpwBc9_Ydhq3yl0&usqp=CAU",
]

# Gather everything into a single document list and report how many were loaded.
documents = load_all_data(
    pdf_paths=pdf_files,
    urls=web_urls,
    youtube_links=youtube_videos,
    image_urls=image_urls,
)

print(len(documents))


46


In [8]:
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks before embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(documents)

# Embed the chunks with OpenAI and index them in a FAISS vector store.
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
vectorstore = FAISS.from_documents(
    splits,
    embeddings,
)

# Retriever view over the index, consumed by the retrieval chain.
retriever = vectorstore.as_retriever()

In [9]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering model: answer only from the supplied
# context, admit when the context is insufficient, and keep answers short.
# The trailing "{context}" placeholder is left for the chain to fill in
# (presumably with the retrieved document text — confirm against the
# create_stuff_documents_chain documentation).
system_prompt = (
    "You are an intelligent assistant designed for answering questions "
    "based on retrieved context. Carefully read the provided context and "
    "use it exclusively to formulate your response. If the context does not "
    "contain enough information to answer the question, explicitly state: "
    "'I don't know based on the provided context.' "
    "Your responses should be clear, relevant, and limited to three sentences "
    "or fewer. Do not include information beyond the given context."
    "\n\n{context}"
)


# Two-message chat template: the system instructions above, followed by the
# user's question, which is supplied under the "input" key.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


# Answering chain built from the LLM and the prompt, then wrapped in a
# retrieval chain that is given the retriever as its document source.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [11]:

# Send one sample question through the chain and show only the answer text.
results = rag_chain.invoke(
    {"input": "What is python?"},
)
print(results["answer"])


Python is a popular programming language that can be used on a server to create web applications. It is known for its simplicity and versatility, making it a favored choice for many developers.
