In [11]:
file_path = r"C:\Users\acksh\OneDrive\Desktop\GAI\SciChat\Liu_Video_Swin_Transformer_CVPR_2022_paper.pdf"



In [5]:
from langchain_community.document_loaders import PyPDFLoader


In [6]:
import pypdf


In [51]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import spacy
import os
from dotenv import load_dotenv
import pdfplumber
import re 

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# Load Models
nlp = spacy.load('en_core_web_sm')
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
os.environ["OPENAI_API_KEY"] = api_key

# Function to Parse and Extract Key Information from PDFs
def parse_and_extract(file_path):
    """Parse and extract key information from a PDF."""
    # Load the PDF document
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Extract text from the document
    text = " ".join([doc.page_content for doc in documents])

    # Use spaCy for Named Entity Recognition (NER)
    doc = nlp(text)
    extracted_info = {
        "title": text.split("\n")[0],  # First line is typically the title
        "emails": [word for word in text.split() if "@" in word and "." in word],
    }
    return extracted_info, documents

# Function to Generate Summaries for Each Section
def summarize_sections(documents, section_titles):
    """Summarize each section of the paper based on the structure."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_documents(documents)
    summaries = {}
    
    for section_title in section_titles:
        section_text = " ".join([chunk.page_content for chunk in chunks if section_title in chunk.page_content])
        summaries[section_title] = section_text  # Replace with summarization logic if needed
    return summaries

# Function to Generate Embeddings and Store in FAISS
def process_and_store_embeddings(documents):
    """Generate embeddings and store in FAISS."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)
    texts = [chunk.page_content for chunk in chunks]
    vector_store = FAISS.from_texts(texts, embedding_model)
    return vector_store

def extract_authors_and_organizations(file_path):
    """Extract authors and organizations using layout-based extraction with PDFPlumber."""
    authors = []
    organizations = []
    with pdfplumber.open(file_path) as pdf:
        # Process the first page only
        first_page = pdf.pages[0]
        text = first_page.extract_text()
        if not text:
            raise ValueError("No text found on the first page of the PDF.")
        
        # Extract lines from the text
        lines = text.split("\n")
        
        # Process lines to extract names and organizations
        for line in lines[1:5]:
            
            # Fallback: Use regex to extract additional names
            name_pattern = (
                r'\b([A-Z][a-z]+[A-Z][a-z]+)(?:\d+|\*\d+|\d+\+)?\b'  # Matches patterns like "Firstname Lastname12"
            )
            regex_names = re.findall(name_pattern,line)
            authors.extend(regex_names)
            
            # Fallback: Identify organization-like phrases
            org_pattern = r'(?:\d+)?([A-Z][a-zA-Z]*(?:[A-Z][a-zA-Z]*)*)'
            regex_orgs = re.findall(org_pattern,line)
            organizations.extend(regex_orgs)
                  
    
    # Deduplicate results
    authors = list(set(authors))
    organizations = list(set(organizations)-set(authors))
    
    return authors, organizations

# Usage
authors, organizations = extract_authors_and_organizations(file_path)

# Function to Create a Q&A Chain
def create_qa_chain(vector_store):
    """Create a Q&A chain with GPT-4 and FAISS retriever."""
    llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.0)
    retriever = vector_store.as_retriever()
    qa_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
    return qa_chain

# Function to Answer Questions
def answer_question(qa_chain, question, chat_history):
    """Answer user questions based on the document."""
    result = qa_chain({"question": question, "chat_history": chat_history})
    return result["answer"]

# Step 1: Define Section Titles
section_titles = [
    "Abstract", "Introduction", "Literature Survey", "Dataset",
    "Methodology", "Results and Analysis", "Conclusion and Future Scope", "References"
]

# Step 2: Upload and Parse PDF  # Replace with actual file path
extracted_info, documents = parse_and_extract(file_path)

# Step 3: Display Extracted Information
print("Title of the Paper:", extracted_info["title"])
print("Authors of the Paper:", authors)
print("Organizations:", organizations)
print("Email IDs:", extracted_info["emails"])

# Step 4: Summarize Sections
summaries = summarize_sections(documents, section_titles)
for section, summary in summaries.items():
    print(f"\n{section} Summary:\n{summary}")






Processing line: ZeLiu*12 JiaNing∗13 YueCao1† YixuanWei14 ZhengZhang1 StephenLin1 HanHu1†
Processing line: 1MicrosoftResearchAsia 2UniversityofScienceandTechnologyofChina
Processing line: 3HuazhongUniversityofScienceandTechnology 4TsinghuaUniversity
Processing line: {t-liuze,v-jianing,yuecao,t-yixuanwei,zhez,stevelin,hanhu}@microsoft.com
Title of the Paper: Video Swin Transformer
Authors of the Paper: ['JiaNing', 'ZeLiu', 'YixuanWei', 'YueCao', 'HanHu', 'ZhengZhang', 'StephenLin']
Organizations: ['UniversityofScienceandTechnologyofChina', 'TsinghuaUniversity', 'MicrosoftResearchAsia', 'HuazhongUniversityofScienceandTechnology']
Email IDs: ['{t-liuze,v-jianing,yuecao,t-yixuanwei,zhez,stevelin,hanhu}@microsoft.com']

Abstract Summary:
Video Swin Transformer
Ze Liu*12 Jia Ning∗13 Yue Cao1† Yixuan Wei14 Zheng Zhang1 Stephen Lin1 Han Hu1†
1Microsoft Research Asia 2University of Science and Technology of China
3Huazhong University of Science and Technology 4Tsinghua University
{t-liuze,v-jia

In [None]:
# Step 5: Generate Embeddings and Store in FAISS
vector_store = process_and_store_embeddings(documents)

# Step 6: Set Up Q&A
qa_chain = create_qa_chain(vector_store)

# Step 7: Interactive Q&A Loop
chat_history = []
while True:
    question = input("Ask a question (or type 'exit' to quit): ")
    if question.lower() == "exit":
        break
    answer = answer_question(qa_chain, question, chat_history)
    print(f"Answer: {answer}")
    chat_history.append((question, answer))