In [35]:
file_path = r"C:\Users\acksh\OneDrive\Desktop\GAI\SciChat\Liu_Video_Swin_Transformer_CVPR_2022_paper.pdf"


In [2]:
from langchain_community.document_loaders import PyPDFLoader


In [3]:
import pypdf


In [37]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import spacy
import os
from dotenv import load_dotenv
import pdfplumber
import re 
import openai



load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Load Models
nlp = spacy.load('en_core_web_sm')
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
os.environ["OPENAI_API_KEY"] = api_key

# Function to Parse and Extract Key Information from PDFs
def parse_and_extract(file_path):
    """Parse and extract key information from a PDF."""
    # Load the PDF document
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Extract text from the document
    text = " ".join([doc.page_content for doc in documents])

    # Use spaCy for Named Entity Recognition (NER)
    doc = nlp(text)
    extracted_info = {
        "title": text.split("\n")[0],  # First line is typically the title
        "emails": [word for word in text.split() if "@" in word and "." in word],
    }
    return extracted_info, documents

def check_entities(names, entity_type):
    """
    Check if the names list contains only the specified entity type (PERSON/ORG).
    Returns two lists: one with valid entities and another with invalid entities.
    """
    valid_names = []
    invalid_names = []

    for name in names:
        doc = nlp(name)
        # Check the entity type
        if any(ent.label_ == entity_type for ent in doc.ents):
            valid_names.append(name)
        else:
            invalid_names.append(name)

    return valid_names, invalid_names

# Function to Generate Embeddings and Store in FAISS
def process_and_store_embeddings(documents):
    """Generate embeddings and store in FAISS."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)
    texts = [chunk.page_content for chunk in chunks]
    vector_store = FAISS.from_texts(texts, embedding_model)
    return vector_store

def extract_authors_and_organizations(file_path):
    """Extract authors and organizations using layout-based extraction with PDFPlumber."""
    authors = []
    organizations = []
    with pdfplumber.open(file_path) as pdf:
        # Process the first page only
        first_page = pdf.pages[0]
        text = first_page.extract_text()
        if not text:
            raise ValueError("No text found on the first page of the PDF.")
        
        # Extract lines from the text
        lines = text.split("\n")
        
        # Process lines to extract names and organizations
        for line in lines[1:6]:
            if "Abstract" in line:
                break
            # Fallback: Use regex to extract additional names
            name_pattern = (
                r'\b([A-Z][a-z]+[A-Z][a-z]+)(?:\d+|\*\d+|\d+\+)?\b'  # Matches patterns like "Firstname Lastname12"
            )
            regex_names = re.findall(name_pattern,line)
            authors.extend(regex_names)
            
            # Fallback: Identify organization-like phrases
            org_pattern = r'(?:\d+)?([A-Z][a-zA-Z]*(?:[A-Z][a-zA-Z]*)*)'
            regex_orgs = re.findall(org_pattern,line)
            organizations.extend(regex_orgs)
                  
    
    # Deduplicate results
    authors = list(set(authors))
    authors = [author for author in authors if not any(word.lower() in author.lower() for word in ["university", "institute"])]
    organizations = list(set(organizations)-set(authors))

    return authors, organizations

# Usage
authors, organizations = extract_authors_and_organizations(file_path)

# Function to Create a Q&A Chain
def create_qa_chain(vector_store):
    """Create a Q&A chain with GPT-4 and FAISS retriever."""
    llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.0)
    retriever = vector_store.as_retriever()
    qa_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
    return qa_chain

# Function to Answer Questions
def answer_question(qa_chain, question, chat_history):
    """Answer user questions based on the document."""
    result = qa_chain({"question": question, "chat_history": chat_history})
    return result["answer"]

# Step 1: Define Section Titles
section_titles = [
    "Abstract", "Introduction", "Literature Survey", "Dataset",
    "Methodology", "Results and Analysis", "Conclusion and Future Scope", "References"
]

# Step 2: Upload and Parse PDF  # Replace with actual file path
extracted_info, documents = parse_and_extract(file_path)

# Step 3: Display Extracted Information
print("Title of the Paper:", extracted_info["title"])
print("Authors of the Paper:", authors)
print("Organizations:", organizations)
print("Email IDs:", extracted_info["emails"])







Title of the Paper: Video Swin Transformer
Authors of the Paper: ['HanHu', 'ZeLiu', 'StephenLin', 'JiaNing', 'ZhengZhang', 'YixuanWei', 'YueCao']
Organizations: ['MicrosoftResearchAsia', 'TsinghuaUniversity', 'UniversityofScienceandTechnologyofChina', 'HuazhongUniversityofScienceandTechnology']
Email IDs: ['{t-liuze,v-jianing,yuecao,t-yixuanwei,zhez,stevelin,hanhu}@microsoft.com']


In [48]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

def summarize_sections(documents, section_titles):
    """
    Summarize each section of the paper using LangChain with GPT-3.5 Turbo.
    """
    for document in documents:
        lines = document.page_content.split("\n")
        # Skip the first 5 lines *only if* they contain metadata (title, authors, etc.)
        if "Abstract" not in lines[0:5]:  # Ensure you're not skipping the Abstract
            document.page_content = "\n".join(lines[5:])
        
        
    # Split the document into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=150)
    chunks = text_splitter.split_documents(documents)

    # Initialize a dictionary to store section-wise content
    summaries = {}

    # Initialize LangChain LLM
    llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.3, max_tokens =300)

    # Create a prompt template
    prompt_template = PromptTemplate(
        input_variables=["section_title", "content"],
        template=(
            "You are a helpful assistant summarizing sections of a research paper. "
            "Please summarize the content under the section titled '{section_title}' based on the following content:\n\n"
            "{content}\n\n"
            "Provide a detailed, in-depth summary of the section in 250 words, including key points, "
            "arguments, and supporting details."
        )
    )

    summarization_chain = LLMChain(llm=llm, prompt=prompt_template)

    for section_title, fallbacks in section_titles.items():
        # Extract all chunks for the primary or fallback titles
        section_text = None
        for title_option in [section_title] + fallbacks:
            section_text = " ".join([
                chunk.page_content for chunk in chunks
                if title_option.lower() in chunk.page_content.lower()
            ])
            if section_text.strip():  # If content is found, stop looking at fallbacks
                break
        

        # Summarize content for other sections
        if section_text and section_text.strip():
            summaries[section_title] = summarization_chain.run({
                "section_title": section_title,
                "content": section_text
            })

        # Handle missing content
        else:
            summaries[section_title] = f"No content found for {section_title}."

    return summaries

section_titles = {
    "Abstract": ["Summary"],
    "Introduction": ["Background"],
    "Related Works": ["Literature Review","Prior Work"],
    "Architecture": ["Methodology"],
    "Experiments": ["Results"],
    "Conclusion":[] ,

}

summaries = summarize_sections(documents, section_titles)
for section, summary in summaries.items():
    print(f"\n{section} Summary:\n{summary}")



Abstract Summary:


The abstract section of this research paper discusses the shift from using Convolutional Neural Networks (CNNs) to Transformers in the vision community. The authors argue that pure Transformer architectures have achieved top accuracy on major video recognition benchmarks, and that these video models are all built on Transformer layers that globally connect patches across the spatial and temporal dimensions. However, the authors propose an alternative approach that incorporates an inductive bias of locality in video Transformers, which they claim leads to a better speed-accuracy trade-off.

To achieve this locality, the authors adapt the Swin Transformer, originally designed for the image domain, and continue to leverage the power of pre-trained image models. This approach has been shown to achieve state-of-the-art accuracy on a broad range of video recognition benchmarks, including action recognition and temporal modeling.

The authors also discuss the current tren

In [None]:
import pinecone
import openai

pine_api_key = os.getenv("PINE_API_KEY")
pinecone.init(api_key= pine_api_key, environment="us-west1-gcp")

def get_embedding(text):
    response = openai.Embedding.create(model="text-embedding-ada-002", input=text)
    return np.array(response['data'][0]['embedding'])

def process_and_store_embeddings(documents, section_titles):
    index_name = "document-embeddings"  
    dimension = 1536  
    pinecone.create_index(index_name, dimension=dimension)  
    index = pinecone.Index(index_name)

    all_embeddings = []
    metadata = []

    for document in documents:
        document_id = document.get("id")  

        # Get embeddings for title, authors, organization, email, content, and full document
        title_embedding = get_embedding(document.get("title"))
        author_embedding = get_embedding(document.get("authors"))
        org_embedding = get_embedding(document.get("organizations"))
        email_embedding = get_embedding(document.get("emails"))
        content_embedding = get_embedding(document.get("content"))
        full_doc_embedding = get_embedding(document.get("full_content"))

        # Store embeddings and metadata in Pinecone
        all_embeddings.extend([title_embedding, author_embedding, org_embedding, email_embedding, content_embedding, full_doc_embedding])
        metadata.extend([
            {"type": "title", "document_id": document_id},
            {"type": "authors", "document_id": document_id},
            {"type": "organizations", "document_id": document_id},
            {"type": "emails", "document_id": document_id},
            {"type": "content", "document_id": document_id},
            {"type": "full_document", "document_id": document_id}
        ])

    # Upsert the embeddings into Pinecone
    index.upsert(vectors=zip(range(len(all_embeddings)), all_embeddings, metadata))

    return index

# Function to create a question-answering chain
def create_qa_chain(index):
    # Set up the retrieval-based QA chain
    def qa_chain(question, chat_history):
        # Get the embedding for the question
        question_embedding = get_embedding(question)
        
        # Query the Pinecone index to get the most relevant documents
        result = index.query(query_vector=question_embedding, top_k=3, include_metadata=True)
        
        # Retrieve the top document and answer the question
        relevant_content = "\n".join([item['metadata']['type'] + ": " + item['metadata']['document_id'] for item in result['matches']])
        
        # Generate an answer using the context (could also use LangChain's LLMChain here)
        llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.3)
        prompt_template = PromptTemplate(
            input_variables=["question", "context"],
            template="Answer the following question based on the context provided:\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer:"
        )
        summarization_chain = LLMChain(llm=llm, prompt=prompt_template)
        answer = summarization_chain.run({
            "question": question,
            "context": relevant_content
        })
        return answer
    
    return qa_chain

# Example loop for asking questions
chat_history = []
documents = [{"id": "doc1", "title": "AI Research", "authors": "John Doe, Jane Smith", "organizations": "Tech Institute", "emails": "john@example.com", "content": "AI is transforming the world...", "full_content": "AI is transforming the world in many ways..."}, ...]  # List of documents

# Step 1: Store embeddings in Pinecone
vector_store = process_and_store_embeddings(documents, section_titles)

# Step 2: Create QA Chain
qa_chain = create_qa_chain(vector_store)

# Step 3: Interactive Q&A Loop
while True:
    question = input("Ask a question (or type 'exit' to quit): ")
    if question.lower() == "exit":
        break
    answer = qa_chain(question, chat_history)
    print(f"Answer: {answer}")
    chat_history.append((question, answer))

In [None]:
# Step 5: Generate Embeddings and Store in FAISS
vector_store = process_and_store_embeddings(documents)

# Step 6: Set Up Q&A
qa_chain = create_qa_chain(vector_store)

# Step 7: Interactive Q&A Loop
chat_history = []
while True:
    question = input("Ask a question (or type 'exit' to quit): ")
    if question.lower() == "exit":
        break
    answer = answer_question(qa_chain, question, chat_history)
    print(f"Answer: {answer}")
    chat_history.append((question, answer))