In [1]:
# Installing all required dependencies for LangChain, LangSmith 
try:
    %pip install -U langchain langchain_community langsmith langgraph langchainhub langchain_experimental pandas neo4j pypdf2 spacy

except Exception as e:  
    print(f"An error occurred during installation: {e}")

Collecting spacy
  Using cached spacy-3.8.2-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Using cached thinc-8.3.2-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting blis<1.1.0,>=1.0.0 (from thinc<8.4.0,>=8.3.0->spacy)
  Using cached blis-1.0.1-cp311-cp311-win_amd64.whl.metadata (7.8 kB)
INFO: pip is looking at multiple versions of thinc to determine which version is compatible with other requirements. This could take a while.
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Using cached thinc-8.3.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
  Using cached thinc-8.3.0-cp311-cp311-win_amd64.whl.metadata (15 kB)
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os

# Set the NEO4J_URI environment variable
os.environ['NEO4J_URI'] = "bolt://localhost:7687"
os.environ['NEO4J_USERNAME'] = 'neo4j'
os.environ['NEO4J_PASSWORD'] = 'your_password'

from langchain_community.graphs import Neo4jGraph

graph = Neo4jGraph()

In [3]:
from neo4j import GraphDatabase
from langchain.tools import Tool
import requests
import xml.etree.ElementTree as ET
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# ArXiv Research Tool
from datetime import datetime

class ArxivResearchTool:
    def __init__(self, max_results=5):
        self.max_results = max_results

    def download_paper_pdf(self, pdf_link, save_path):
        response = requests.get(pdf_link)
        if response.status_code == 200:
            with open(save_path, 'wb') as f:
                f.write(response.content)
            print(f"Paper downloaded successfully to {save_path}")
            return f"Paper downloaded successfully to {save_path}"
        else:
            return f"Failed to download paper (status code: {response.status_code})"
        
    def fetch_papers(self, topic: str):
        url = f"http://export.arxiv.org/api/query?search_query=all:{topic}&start=0&max_results={self.max_results}"
        response = requests.get(url)

        if response.status_code != 200:
            return f"Error: Unable to fetch data from ArXiv API (status code: {response.status_code})"

        root = ET.fromstring(response.content)
        papers = []

        for entry in root.findall('{http://www.w3.org/2005/Atom}entry'):
            title_text = entry.find('{http://www.w3.org/2005/Atom}title').text
            title = title_text.replace('\n', '')
            summary = entry.find('{http://www.w3.org/2005/Atom}summary').text
            link = entry.find('{http://www.w3.org/2005/Atom}id').text
            pdf_link = link.replace('/abs/', '/pdf/') + ".pdf"  # PDF link for full paper
            local_pdf_path = os.path.join("D:\Coding\GraphRAG-with-Llama-3.1-main\papers", title + ".pdf") 
            self.download_paper_pdf(pdf_link, local_pdf_path)
            authors = [author.find('{http://www.w3.org/2005/Atom}name').text for author in entry.findall('{http://www.w3.org/2005/Atom}author')]
            published = entry.find('{http://www.w3.org/2005/Atom}published').text

            papers.append({
                "title": title,
                "summary": summary,
                "link": link,
                "pdf_link": local_pdf_path,
                "authors": authors,
                "published": published,
                "year": published[:4]  # Extract the year from the published date
            })

        return papers


# Neo4j Storage Tool

class Neo4jResearchStore:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self.driver.close()

    def store_paper_with_entities(self, paper):
        with self.driver.session() as session:
            session.write_transaction(self._create_paper_with_entities, paper)

    @staticmethod
    def _create_paper_with_entities(tx, paper):
        # Create the Paper node
        tx.run("""
            MERGE (p:Paper {title: $title})
            SET p.link = $link
            """,
            title=paper['title'],
            link=paper['link']
        )

        # Create Summary node and relationship
        tx.run("""
            MERGE (s:Summary {text: $summary})
            WITH s
            MATCH (p:Paper {title: $title})
            MERGE (p)-[:HAS_SUMMARY]->(s)
            """,
            summary=paper['summary'][:500],  # Optional: Limit summary length for storage
            title=paper['title']
        )

        # Create Year node and relationship
        tx.run("""
            MERGE (y:Year {year: $year})
            WITH y
            MATCH (p:Paper {title: $title})
            MERGE (p)-[:PUBLISHED_IN]->(y)
            """,
            year=paper['year'],
            title=paper['title']
        )

        # Create Author nodes and relationships
        for author_name in paper['authors']:
            tx.run("""
                MERGE (a:Author {name: $author_name})
                WITH a
                MATCH (p:Paper {title: $title})
                MERGE (a)-[:AUTHORED]->(p)
                """,
                author_name=author_name,
                title=paper['title']
            )

    def store_paper_chunks(self, title, chunks, embeddings):
        # Store each chunk as a node with a relationship to the paper
        with self.driver.session() as session:
            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
                session.write_transaction(self._create_chunk_node, title, chunk, embedding, i)

    @staticmethod
    def _create_chunk_node(tx, title, chunk, embedding, chunk_id):
        # Create Chunk node and connect it to the Paper node
        tx.run("""
            MATCH (p:Paper {title: $title})
            CREATE (c:Chunk {text: $chunk, embedding: $embedding, chunk_id: $chunk_id})
            MERGE (p)-[:HAS_CHUNK]->(c)
            """,
            title=title,
            chunk=chunk,
            embedding=embedding.tolist(),  # Convert embedding to a list for Neo4j
            chunk_id=chunk_id
        )

def extract_text_from_pdf(pdf_path):
    text = ""
    reader = PdfReader(pdf_path)
    for page_num in range(len(reader.pages)):
        page = reader.pages[page_num]
        text += page.extract_text() or ""  # Adds page text if present
    return text

def create_chunks_and_embeddings(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)  # Adjust size as needed
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    # Step 1: Split text into chunks
    chunks = text_splitter.split_text(text)

    # Step 2: Generate embeddings for each chunk
    embeddings = embedding_model.encode(chunks)
    return chunks, embeddings


# Main function to fetch papers and store in Neo4j
def fetch_and_store_arxiv_papers(topic, max_results, neo4j_uri, neo4j_user, neo4j_password):
    # Fetch papers
    arxiv_tool = ArxivResearchTool(max_results=max_results)
    papers = arxiv_tool.fetch_papers(topic)

    neo4j_store = Neo4jResearchStore(neo4j_uri, neo4j_user, neo4j_password)

    for paper in papers:
        try:
            title = paper["title"]
            text = extract_text_from_pdf(paper["pdf_link"])
            neo4j_store.store_paper_with_entities(paper)
            chunks, embeddings = create_chunks_and_embeddings(text)
            neo4j_store.store_paper_chunks(title, chunks, embeddings) 
        except:
            continue   
    
    neo4j_store.close()
    print(f"Stored {len(papers)} papers on the topic '{topic}' in the Neo4j database.")
    return f"Stored {len(papers)} papers on the topic '{topic}' in the Neo4j database."

# Define the LangChain Tool that combines fetching and storing
def arxiv_to_neo4j_tool_func(topic: str):
    # Neo4j connection details
    neo4j_uri = "bolt://localhost:7687"
    neo4j_user = "neo4j"
    neo4j_password = "your_password"

    return fetch_and_store_arxiv_papers(topic, max_results=5, neo4j_uri=neo4j_uri, neo4j_user=neo4j_user, neo4j_password=neo4j_password)

arxiv_neo4j_tool = Tool(
    name="ArXiv to Neo4j Tool",
    func=arxiv_to_neo4j_tool_func,
    description="Fetches research papers from ArXiv by topic and stores them in Neo4j."
)

  from tqdm.autonotebook import tqdm, trange


In [4]:
from langchain_community.vectorstores import Neo4jVector
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

embeddings=HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",      
    encode_kwargs={'normalize_embeddings':True}
)

vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    url="bolt://localhost:7687",
    username="neo4j",              
    password="your_password",
    search_type="hybrid",
    node_label="Title",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)
vector_retriever = vector_index.as_retriever()



In [None]:
import spacy
# from keybert import KeyBERT

nlp = spacy.load("en_core_web_sm")

def get_topics(text):
    doc = nlp(text)
    nouns = []
    
    for token in doc:
        if token.pos_ in ['NOUN', 'PROPN']:
            # If it's part of a compound noun, get the full phrase
            if token.dep_ == 'compound':
                compound = token.text + ' ' + token.head.text
                nouns.append(compound)
            # If it's not part of a compound already added
            elif token.dep_ != 'compound':
                nouns.append(token.text)
    
    return nouns

text = "Give the latest advancements in Gen AI by microsoft in 2021"

compound_nouns = get_compound_nouns(text)
print("Compound nouns:", compound_nouns)

Compound nouns: ['advancements', 'Gen AI', 'AI', 'microsoft']


In [3]:
import spacy
# from keybert import KeyBERT

nlp = spacy.load("en_core_web_sm")
# kw_model = KeyBERT()

def extract_topics(text):
    doc = nlp(text)
    
    # Identify potential topics by looking for:
    # 1. Subject noun phrases (using dependency parsing)
    # 2. Noun phrases that are objects of verbs like "discuss", "cover", "explain"
    # 3. Noun compounds and their modifiers
    
    topics = []
    
    for sent in doc.sents:
        # Find subject phrases
        subjects = [tok for tok in sent 
                   if tok.dep_ in ('nsubj', 'nsubjpass') 
                   and not tok.is_stop]
        
        for subject in subjects:
            # Get the full noun phrase containing the subject
            phrase = ' '.join([tok.text for tok in subject.subtree 
                             if not tok.dep_ in ('punct', 'det')])
            if phrase:
                topics.append(phrase.lower())
        
        # Find noun compounds
        for token in sent:
            if token.dep_ == 'compound' and token.head.pos_ == 'NOUN':
                phrase = ' '.join([token.text, token.head.text])
                topics.append(phrase.lower())
    
    # Remove duplicates
    topics = list(set(topics))

    authors = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    titles = [ent.text for ent in doc.ents if ent.label_ == "WORK_OF_ART"]
    years = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
    print(topics + authors + titles + years)
    return topics + authors + titles + years

# Test it
text = """
Give me all papers published in Gen AI by microsoft in 2021
"""

topics = extract_topics(text)
print("Identified topics:", topics)

# For more complex text:
complex_text = """
The integration of quantum computing with machine learning algorithms presents new opportunities 
for solving complex optimization problems. Researchers are exploring how neural networks can 
benefit from quantum mechanics principles. The field of computational biology is also seeing 
major breakthroughs in protein folding prediction.
"""

complex_topics = extract_topics(complex_text)
print("\nComplex text topics:", complex_topics)

['2021']
Identified topics: ['2021']
['optimization problems', 'neural networks', 'researchers', 'learning algorithms', '\n integration of quantum computing with machine learning algorithms', 'mechanics principles', 'field of computational biology', 'quantum principles', 'major breakthroughs in protein']

Complex text topics: ['optimization problems', 'neural networks', 'researchers', 'learning algorithms', '\n integration of quantum computing with machine learning algorithms', 'mechanics principles', 'field of computational biology', 'quantum principles', 'major breakthroughs in protein']


In [7]:
# import spacy
# from spacy.tokens import Span

# nlp = spacy.load("en_core_web_sm")
# ruler = nlp.add_pipe("entity_ruler", before="ner")
# patterns = [{"label": "TOPIC", "pattern": "machine learning"}, {"label": "TOPIC", "pattern": "artificial intelligence"}]
# ruler.add_patterns(patterns)

# # Test with a sentence
# doc = nlp("Find papers on machine learning from 2021.")
# for ent in doc.ents:
#     print(ent.text, ent.label_)


# def extract_entities_for_research_query(question):
#     doc = nlp(question)

#     authors = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
#     titles = [ent.text for ent in doc.ents if ent.label_ == "WORK_OF_ART"]
#     years = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
#     topics = [ent.text for ent in doc.ents if ent.label_ == "TOPIC"]

#     return {"authors": authors, "titles": titles, "years": years, "topics": topics}
#     # return authors + titles + years + topics

def extract_papers_from_topics(topics):
    print('hi1')
    list_of_papers = []
    for topic in topics:
        arxiv_to_neo4j_tool_func(topic)
        list_of_papers.append(ArxivResearchTool.fetch_papers(topic))
    
    return list_of_papers
    
def full_extractor(question):
    
    topics = extract_topics(question)
    print('hi2')
    return extract_papers_from_topics(topics)
        

# Example usage
question = "Give me all papers published in deep learning in 2021"
# print(extract_entities_for_research_query(question))


In [8]:
# # Define a test topic to fetch and store papers
# test_topics = 

# # Run the tool function
# output_message = arxiv_to_neo4j_tool_func(test_topic)

# print(output_message)

In [9]:
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars


def generate_full_text_query(input: str) -> str:
    words = [el for el in remove_lucene_chars(input).split() if el]
    if not words:
        return ""
    full_text_query = " AND ".join([f"{word}~2" for word in words])
    print(f"Generated Query: {full_text_query}")
    return full_text_query.strip()


# # Fulltext index query
def graph_retriever(question: str) -> str:
    """
    Collects the neighborhood of entities mentioned
    in the question
    """
    result = ""
    entities_dic = extract_entities_for_research_query(question)
    entities = entities_dic["authors"] + entities_dic["titles"] + entities_dic["years"] + entities_dic["topics"]
    for entity in entities:
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('fulltext_entity_id', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": entity},
        )
        result += "\n".join([el['output'] for el in response])
    return result

In [10]:
def full_retriever(question: str):
    graph_data = graph_retriever(question)
    vector_data = [el.page_content for el in vector_retriever.invoke(question)]
    final_data = f"""Graph data: {graph_data} \n
    vector data: {"#Title ". join(vector_data)}
    """
    return final_data

In [11]:
token="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"
from huggingface_hub import login
login(token = token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\vishn\.cache\huggingface\token
Login successful


In [12]:
from langchain_community.llms import HuggingFaceHub

hf=HuggingFaceHub(
    repo_id="meta-llama/Llama-3.2-3B-Instruct",
    model_kwargs={"temperature":0.1,"max_length":-1},
    huggingfacehub_api_token = token
)
text = ("Hello"
       )
prompt = "Help"
query= text + prompt
hf.invoke(query)

  hf=HuggingFaceHub(


"HelloHelp\nI'm a large language model, so I can understand and respond to a wide range of questions and topics. I can provide information, answer questions, and even engage in conversation. I'm here to help with any questions or topics you'd like to discuss. What's on your mind?\n\nWould you like to:\n1. Ask a question on a specific topic?\n2. Discuss a particular subject or issue?\n3. Play a game or have some fun?\n4. Get recommendations or suggestions?\n"

In [13]:
from langchain_core.runnables import  RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

question_answer_template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""
question_answer_prompt = ChatPromptTemplate.from_template(question_answer_template)

question_answer_chain = (
        {
            "context": full_retriever,
            "question": RunnablePassthrough(),
        }
    | question_answer_prompt
    | hf
    | StrOutputParser()
)

In [14]:
paper_retrieval_template = """Present papers on the topic asked by the user in a nice format with year, author and topic. The list of papers available with you are:
{context}

prompt: {prompt}
Use natural language and be concise.
Answer:"""

paper_retrieval_prompt = ChatPromptTemplate.from_template(paper_retrieval_template)

paper_retrieval_chain = (
    {
        "context" : full_extractor,
        "prompt" : RunnablePassthrough()
    }
    | paper_retrieval_prompt
    | hf
    | StrOutputParser()
)

In [None]:
result2 = paper_retrieval_chain.invoke(input="Give me all papers published in generative ai by microsoft in 2021")
#print(full_extractor("What is optimal control?"))
# res_start2 = "Use natural language and be concise.\nAnswer:"
# res_index2 = result2.find(res_start2)
# # second_index = result['result'].find(res_start, res_index + len(res_start))
# result_start2 = res_index2 + len(res_start2)
# print(result2[result_start2:])

['papers published in', '2021']
hi2
hi1
Paper downloaded successfully to D:\Coding\GraphRAG-with-Llama-3.1-main\papers\The Evolving Ecosystem of Predatory Journals: A Case Study in Indian  Perspective.pdf
Paper downloaded successfully to D:\Coding\GraphRAG-with-Llama-3.1-main\papers\Publication and collaboration anomalies in academic papers originating  from a paper mill: evidence from a Russia-based paper mill.pdf


  session.write_transaction(self._create_paper_with_entities, paper)


In [None]:
print(result2)

Human: Present papers on the topic asked by the user in a nice format with year, author and topic. The list of papers available with you are:
None

prompt: Give me all papers published in deep learning in 2021
Use natural language and be concise.
Answer: I'd be happy to help you with that! Unfortunately, I don't have a comprehensive list of papers on deep learning from 2021. However, I can suggest some popular and influential papers in the field that were published in 2021. Here are a few:

1. **"Deep Learning" by Ian Goodfellow, Yoshua Bengio, and Aaron Courville** (2021) - This book is a comprehensive introduction to deep learning, covering topics such as neural networks,
