In [None]:
!pip install -q transformers accelerate langchain faiss-cpu pypdf sentence-transformers

In [None]:
!pip install -U langchain langchain-community

In [None]:
from langchain.document_loaders import PyPDFLoader

# === Load PDF ===
# Location of the uploaded report; update this path after uploading.
pdf_path = "/content/Stock_Market_Performance_2024.pdf"

# Build the loader for that file, then read it into `pages`, which the
# text-splitting step consumes later in the notebook.
loader = PyPDFLoader(file_path=pdf_path)
pages = loader.load()

In [None]:
import requests
# Connectivity check: print the HTTP status code returned by the Hugging Face
# Hub. The explicit timeout keeps the cell from hanging indefinitely when the
# network is unreachable; requests applies no timeout unless one is given.
print(requests.get("https://huggingface.co", timeout=10).status_code)


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import torch

# === Split text into chunks ===
# Break the loaded pages into overlapping chunks for retrieval.
# NOTE(review): CharacterTextSplitter cuts only at its `separator`
# (presumably a blank line by default); PDF text with few blank lines may
# produce chunks larger than `chunk_size` -- confirm against the library docs.
text_splitter = CharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
docs = text_splitter.split_documents(documents=pages)

# === Create embeddings and store in FAISS ===
# Sentence-transformers model used to embed the chunks.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Index the chunks in FAISS and expose the index as a retriever, which the
# `ask` helper queries.
db = FAISS.from_documents(documents=docs, embedding=embedding_model)
retriever = db.as_retriever()

# === Load smaller model (FLAN-T5) ===
model_name = "google/flan-t5-base"  # or "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the first GPU when one is present and fall back to CPU otherwise, so
# the notebook also runs on CPU-only runtimes instead of raising on "cuda".
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Text-to-text generation pipeline placed on the same device as the model.
qa_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    device=device,
)

# === Define simple RAG ===
def ask(question: str) -> None:
    """Answer *question* from the indexed PDF and print the result.

    Looks up chunks for the question through the module-level ``retriever``,
    joins the page content of the first three with blank lines, and sends a
    ``Context / Question / Answer`` prompt to the module-level
    ``qa_pipeline``.

    Args:
        question: Natural-language question about the loaded document.

    Returns:
        None. The generated text, stripped of surrounding whitespace, is
        printed to stdout.
    """
    # `invoke` is the supported retriever entry point; the older
    # `get_relevant_documents` is deprecated in current LangChain releases.
    context_docs = retriever.invoke(question)
    context = "\n\n".join(doc.page_content for doc in context_docs[:3])

    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
    result = qa_pipeline(prompt)[0]["generated_text"]
    print(result.strip())

# === Example Usage ===
# Run one retrieval-augmented query; `ask` prints the generated answer.
example_question = "Summarize the key insights from the stock market PDF."
ask(example_question)