In [1]:
import os
import tempfile

import fitz  # PyMuPDF
import nltk
import spacy
from dotenv import load_dotenv
from fastapi import FastAPI, UploadFile, File
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from pinecone import Pinecone
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline

# Load variables from a local .env file into the process environment.
load_dotenv()

# Fetch the NLTK resources used by word_tokenize / sent_tokenize / stopwords.
nltk.download("punkt")
nltk.download("stopwords")
nlp = spacy.load("en_core_web_sm")

app = FastAPI()

# The Pinecone key is read from the environment only: it must never be
# hardcoded in the notebook or echoed into cell output.
# "PINCECONE_API_KEYS" (sic) is the variable name this notebook has always
# read; the conventionally spelled "PINECONE_API_KEY" is accepted as a fallback.
# NOTE(review): the key that used to be embedded in this cell should be rotated.
YOUR_PINECONE_API_KEY = os.getenv("PINCECONE_API_KEYS") or os.getenv("PINECONE_API_KEY")
if not YOUR_PINECONE_API_KEY:
    raise RuntimeError(
        "Pinecone API key not found: set PINCECONE_API_KEYS (or PINECONE_API_KEY) "
        "in the environment or in the .env file."
    )

pc = Pinecone(api_key=YOUR_PINECONE_API_KEY)
index = pc.Index("applabqatar")

# Initialize OpenAI Embeddings
embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))

  from tqdm.autonotebook import tqdm
[nltk_data] Downloading package punkt to /home/ali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[REDACTED: this cell output contained the Pinecone API key — rotate the key and clear outputs before sharing]


  embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))


In [2]:
# Hugging Face summarization pipeline using the facebook/bart-large-cnn checkpoint.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

Error while downloading from https://cdn-lfs.hf.co/facebook/bart-large-cnn/40041830399afb5348525ef8354b007ecec4286fdf3524f7e6b54377e17096cb?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1739486621&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczOTQ4NjYyMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9mYWNlYm9vay9iYXJ0LWxhcmdlLWNubi80MDA0MTgzMDM5OWFmYjUzNDg1MjVlZjgzNTRiMDA3ZWNlYzQyODZmZGYzNTI0ZjdlNmI1NDM3N2UxNzA5NmNiP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=XotXGoqZLcIOWqP6pN8W0EbVETdlaLXmn9cNeqBi06VbSuYFnhDcS5DT88enK%7ETymKjCJ9XHAO8JaUmKOw%7E%7EeCopMS7GX3P%7EV7lW49cY6W44f4BT9VTe3d-M68-bmaDon3i3RvMi4PLTNOcuKLLiFOE-vDKGZTyOaoGBjvYzI1osMl8N6J4J%7EtHtRXxLH17a23dLlYv6p6nlBhcUgAJTPXXswvCijrkDPapnIitcBMCvxLqen6jfRQwZSaolo2Qo6aXaLBPVW3XgOlMat5TR7ZFQtsbRi2mMtKc1IWXTtJkFu-MBKrhQNJORvOrtIgtCPvdFixCTgHjxhAl%7E1djvlg__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSConn

In [3]:
def preprocess_text(text):
    """Normalize text by lowercasing, tokenizing and filtering tokens.

    Steps: lowercase the input, split it with NLTK's ``word_tokenize``, then
    keep only tokens that are alphanumeric and are not English stopwords.
    No lemmatization is performed.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Build the stopword set once per call: the original re-evaluated
    # stopwords.words("english") for every token and searched it as a list.
    english_stopwords = set(stopwords.words("english"))
    return " ".join(
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in english_stopwords
    )


def extract_named_entities(text):
    """Run the spaCy pipeline over ``text`` and map entity labels to entity text.

    The mapping is keyed by entity label, so when several entities share a
    label only the last one encountered is kept.
    """
    label_to_text = {}
    for entity in nlp(text).ents:
        label_to_text[entity.label_] = entity.text
    return label_to_text


def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, summarize it if long, and preprocess it.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The output of ``preprocess_text`` applied to either the full page text
        (when it is at most 1000 characters) or to a summary of it.

    Note:
        When the text exceeds 1000 characters, only its first 1024 characters
        are passed to the summarizer; the remainder does not contribute to the
        returned text.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises; the original left it open.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text() for page in doc)

    # Summarize long text
    if len(text) > 1000:
        text = summarizer(text[:1024], max_length=300, min_length=100, do_sample=False)[0]["summary_text"]

    return preprocess_text(text)

In [5]:
@app.post("/upload/")
async def upload_pdf(file: UploadFile = File(...)):
    """Ingest an uploaded PDF: extract text, embed it, and store it in Pinecone.

    Args:
        file: The uploaded PDF.

    Returns:
        A dict with a status ``message`` and the ``entities`` found in the
        extracted text.
    """
    # The client-supplied filename is untrusted, so it is used only as a label
    # for vector ids and never as a filesystem path.
    source_name = os.path.basename(file.filename or "") or "upload"

    # Write the upload to a private temporary file and always remove it, so
    # uploads neither collide with each other nor accumulate on disk.
    temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        with temp_file:
            temp_file.write(await file.read())
        text = extract_text_from_pdf(temp_file.name)
    finally:
        os.remove(temp_file.name)

    # Chunk and embed text
    sentences = sent_tokenize(text)
    if sentences:
        # One batched embedding request instead of one request per sentence.
        doc_embeddings = embeddings.embed_documents(sentences)
        vectors = [
            (f"{source_name}_{i}", vector, {"text": sentence})
            for i, (sentence, vector) in enumerate(zip(sentences, doc_embeddings))
        ]

        # Store in Pinecone in moderately sized batches rather than one
        # network round trip per vector.
        upsert_batch_size = 100
        for start in range(0, len(vectors), upsert_batch_size):
            index.upsert(vectors[start:start + upsert_batch_size])

    return {"message": "PDF uploaded and processed successfully!", "entities": extract_named_entities(text)}

In [6]:

def rank_text_with_tfidf(query, texts, top_k=3):
    """Rank text chunks by TF-IDF similarity to the query and join the best ones.

    Args:
        query: The user question.
        texts: Candidate text chunks.
        top_k: Number of highest-scoring chunks to keep (default 3).

    Returns:
        The ``top_k`` best chunks joined by single spaces, best first. Returns
        an empty string when ``texts`` is empty. If TF-IDF cannot be fitted
        (for example, no usable terms in any input), the first ``top_k`` chunks
        are returned in their original order.
    """
    texts = list(texts)
    if not texts:
        # Nothing to rank; avoid fitting a vectorizer on the query alone.
        return ""

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([query] + texts)
    except ValueError:
        # Raised by scikit-learn when the vocabulary is empty; fall back to
        # the incoming order instead of failing the request.
        return " ".join(texts[:top_k])

    # Row 0 is the query; its dot product with every row gives the similarity
    # scores. Element 0 (query vs. itself) is dropped.
    scores = tfidf_matrix[0].dot(tfidf_matrix.T).toarray()[0][1:]
    ranked_indices = sorted(range(len(texts)), key=lambda i: scores[i], reverse=True)
    return " ".join(texts[i] for i in ranked_indices[:top_k])


@app.post("/chat/")
async def chat(query: str):
    """Answer a question using chunks retrieved from Pinecone as context.

    Args:
        query: The user's question.

    Returns:
        A dict whose ``response`` key holds the chat model's answer.
    """
    # Pinecone searches by vector, so the question must be embedded first;
    # the original passed the raw string to index.query.
    query_vector = embeddings.embed_query(query)
    results = index.query(vector=query_vector, top_k=5, include_metadata=True)
    retrieved_texts = [match["metadata"]["text"] for match in results["matches"]]

    # Rank retrieved texts using TF-IDF; with no matches there is nothing to
    # rank, so the context is left empty.
    context = rank_text_with_tfidf(query, retrieved_texts) if retrieved_texts else ""

    chat_model = ChatOpenAI(model="gpt-4", temperature=0)
    response = chat_model.predict(f"Context: {context} \n\nAnswer the question: {query}")

    return {"response": response}


In [8]:
import streamlit as st
import requests

# Streamlit front end for the PDF chatbot API. It talks to the FastAPI server
# over HTTP and is meant to be launched with `streamlit run`.
API_BASE_URL = "http://127.0.0.1:8000"
REQUEST_TIMEOUT_SECONDS = 120  # uploads trigger summarization and embedding


def post_to_api(path, **request_kwargs):
    """POST to the backend and return the decoded JSON, or None on failure.

    Connection problems and non-2xx responses are shown to the user with
    ``st.error`` instead of surfacing later as a KeyError on the payload.
    """
    try:
        response = requests.post(
            f"{API_BASE_URL}{path}", timeout=REQUEST_TIMEOUT_SECONDS, **request_kwargs
        )
    except requests.exceptions.RequestException as exc:
        st.error(f"Could not reach the API: {exc}")
        return None
    if not response.ok:
        st.error(f"Request to {path} failed ({response.status_code}): {response.text}")
        return None
    return response.json()


st.title("PDF Chatbot with NLP")

uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
if uploaded_file:
    # Send the real filename and content type; with bare bytes, every upload
    # reaches the server under the same placeholder filename.
    files = {"file": (uploaded_file.name, uploaded_file.getvalue(), "application/pdf")}
    data = post_to_api("/upload/", files=files)
    if data is not None:
        st.success(data["message"])
        st.write("Extracted Named Entities:", data["entities"])

query = st.text_input("Ask a question:")
if query:
    # The /chat/ endpoint declares `query` as a plain str parameter, which
    # FastAPI reads from the URL query string rather than from a JSON body.
    data = post_to_api("/chat/", params={"query": query})
    if data is not None:
        st.write("Chatbot:", data["response"])


2025-02-14 03:20:11.592 
  command:

    streamlit run /home/ali/Desktop/My Projects/Chatbot-Development-with-Python-for-AI-ML/ApplabQatar/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
2025-02-14 03:20:11.596 Session state does not function when running a script without `streamlit run`
