In [1]:
# %pip install -q pandas python-dotenv langchain-community langchain-openai langchain-neo4j sentence_transformers

# 1. Imports

In [2]:
import os
import json
import random
import pandas as pd

from dotenv import load_dotenv

from langchain_neo4j import Neo4jGraph

from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI


from langchain_community.llms import HuggingFaceHub

# Load environment variables from a .env file
load_dotenv()


True

# 2. Import Dataset

##### We'll use a public dataset published by MANISH KUMAR on `Kaggle`, which contains data about professional profiles on LinkedIn: https://www.kaggle.com/datasets/manishkumar7432698/linkedinuserprofiles?select=LinkedIn+people+profiles+datasets.csv 

In [3]:
# Read the LinkedIn people profiles dataset
df = pd.read_csv('../data/raw/LinkedIn people profiles datasets.csv')
df.head()

Unnamed: 0,timestamp,id,name,city,country_code,region,current_company:company_id,current_company:name,position,following,...,people_also_viewed,educations_details,education,avatar,languages,certifications,recommendations,recommendations_count,volunteer_experience,сourses
0,2023-01-10,catherinemcilkenny,"Catherine Fitzpatrick (McIlkenny), B.A",Canada,CA,,,,Snr Business Analyst at Emploi et Développemen...,,...,"[{""profile_link"":""https://ca.linkedin.com/in/l...",Queen's University Belfast,"[{""degree"":""Bachelor of Arts (B.A.) Honours"",""...",https://media.licdn.com/dms/image/C4E03AQEcz_j...,,,,,,
1,2022-12-17,margot-bon-51a04624,Margot Bon,"The Randstad, Netherlands",NL,EU,gemeente-utrecht,Gemeente Utrecht,Communicatieadviseur Corporate & Strategie Gem...,,...,"[{""profile_link"":""https://nl.linkedin.com/in/j...",,"[{""degree"":""Scrum en Agile werken"",""end_year"":...",https://static.licdn.com/sc/h/244xhbkr7g40x6bs...,"[{""subtitle"":""Professional working proficiency...","[{""meta"":""Issued Jun 2013"",""subtitle"":""Van der...",Menno H. Poort “Ik werk al jaren prettig met M...,2.0,"[{""cause"":"""",""duration"":""Sep 2010 Jul 2020 9 y...",
2,2023-05-17,mike-dean-8509a193,Mike Dean,"England, United Kingdom",UK,,network-rail,Network Rail,Network Data Manager at Network Rail,,...,"[{""profile_link"":""https://uk.linkedin.com/in/g...",Brighton Polytechnic,"[{""degree"":""2:2"",""end_year"":""1991"",""field"":""El...",https://media.licdn.com/dms/image/C4D03AQHLj-Z...,,,,,,
3,2022-05-29,giovanna-panarella-99a0a4167,Giovanna Panarella,"Avellino, Campania, Italy",IT,EU,,Freelance,Architetto (Freelance),500.0,...,"[{""profile_link"":""https://it.linkedin.com/in/e...",Università di Camerino,"[{""degree"":""“Corso di aggiornamento profession...",https://media-exp1.licdn.com/dms/image/C4D03AQ...,,,,,"[{""cause"":""Arts and Culture"",""duration"":""Jan 2...",
4,2022-12-06,steve-latimer-3364327,Steve Latimer,"Ontario, Canada",CA,,mid-range-computer-group-inc.,Mid-Range Computer Group Inc.,Senior Account Executive at Mid-Range Computer...,,...,"[{""profile_link"":""https://ca.linkedin.com/in/d...",St. Michael's College School,"[{""degree"":"""",""end_year"":""1978"",""field"":"""",""me...",,,"[{""meta"":""Issued Jan 2022 See credential"",""sub...","Blake Reeves “If I was a customer, I would wan...",1.0,,


We will clean the dataset to keep only the data we need for our knowledge graph:

- **id**: unique identifier of the professional.
- **name**: name of the professional.
- **company**: name of the company where they work.
- **education**: educational institution where they studied.
- **languages**: languages they speak.
- **industry**: main industry in which they have experience.
- **country**: nationality of the worker.


In [4]:
# We will fill the empty data with this random values (just for demo purposes)
industries = ['Advertising Services', 'IT Services and IT Consulting', 'Hospitals and Health Care', 'Higher Education', 'Retail', 'Financial Services', 'Telco', 'Media & Entertainment']
countries= ['United States', 'Argentina', 'Spain', 'France', 'Mexico', 'United Kingdom', 'Sweden']

# Function to extract the industry from the company information
def extract_industry(json_str):
    try:
        data = json.loads(json_str)
        return data.get('industry', random.choice(industries))
    except json.JSONDecodeError:
        return None

# Function to extract the languages from the languages structure
def extract_languages(json_list):
    try:
        languages = [entry['title'] for entry in json.loads(json_list)]
        return '|'.join(languages)
    except: 
        return None

# Function to extract the country from the city structure
def extract_country(string):
    if isinstance(string, str):
        elements = string.split(',')
        return elements[-1].strip()  
    else:
        return random.choice(countries)

# Extract the industry, languages and country information
df['industry'] = df['current_company'].apply(lambda x: extract_industry(x))
df['languages'] = df['languages'].apply(lambda x: extract_languages(x))
df['country'] = df['city'].apply(lambda x: extract_country(x))

# Remove the rows with empty values in these key columns (just for demo purposes)
df = df [['id','name','current_company:name','educations_details','languages','industry','country']].dropna()

# Rename some columns for better readability
df = df.rename(columns={'current_company:name': 'company','educations_details':'education'})

# Preview the curated data
df.head(300)

Unnamed: 0,id,name,company,education,languages,industry,country
15,jessica-mccray-051b6bb6,Jessica McCray,Verizon Connect,Appalachian State University,English,Higher Education,United States
17,mohamed-ghiati-841871127,Mohamed Ghiati,Mercure Industrie,Ecole Marocaine des Sciences de l'ingénieur,Français|Arabe|Anglais,Hospitals and Health Care,Morocco
19,sophia-ngadi-54346631,Sophia NGADI,African Reinsurance Brokers,Université Paris 1 Panthéon-Sorbonne,Francais|Anglais|Arabe|Italien,Hospitals and Health Care,Morocco
20,adil-grini-303b74163,Adil Grini,Thenext.click,Ecole Nationale des Sciences Appliquées - Fès,Français|Anglais|Arabe,Hospitals and Health Care,Morocco
24,duncanperry,Duncan Perry,North Star Executive Advisors,Cornell University - S.C. Johnson Graduate Sch...,French,North Star Executive Advisors,United States
...,...,...,...,...,...,...,...
986,nickrramos,Nick Ramos,Oracle,"University of California, Los Angeles",English|Spanish,IT Services and IT Consulting,United States
989,mohamed-negm-796334a8,Mohamed Negm,ALDI Stores Australia,edX,English|Arabic|French,ALDI Stores Australia,Australia
990,nagylagouveia,Nágyla Gouveia,Emilio Ribas,Universidade 7 de Setembro - UNI7,Inglês|Português,Emilio Ribas,Greater Fortaleza
991,haris-nadeem,Haris Nadeem,JLL,University of South Florida,Urdu|English|Pashto|Punjabi,Real Estate,United States


In [5]:
# OPTIONAL: With this sentence you can save the curated csv in a new file called 'clean_data.csv'
df.to_csv('../data/processed/clean_linkedin_data.csv', index=False)

# 3 - Insert Data into Neo4J

The first step is to prepare the connector to Neo4j using the `Neo4jGraph` utility from LangChain.

In [6]:
from langchain_community.graphs import Neo4jGraph #DEPRACATED!!!!!!!!

# Retrieve connection information to Neo4j from environment variables
neo4j_url = os.getenv("NEO4J_URI")
neo4j_user = os.getenv("NEO4J_USER")
neo4j_password = os.getenv("NEO4J_PASSWORD")

# https://api.python.langchain.com/en/latest/graphs/langchain_community.graphs.neo4j_graph.Neo4jGraph.html
graph = Neo4jGraph(neo4j_url,neo4j_user,neo4j_password)

graph.refresh_schema()
print(graph.schema)

  graph = Neo4jGraph(neo4j_url,neo4j_user,neo4j_password)


Node properties:
Person {name: STRING}
Company {name: STRING}
School {name: STRING}
Industry {name: STRING}
Country {name: STRING}
Language {name: STRING}
Chunk {embedding: LIST, id: STRING, text: STRING, question: STRING, query: STRING}
ExampleNode {embedding: LIST, id: STRING, question: STRING, query: STRING}
Relationship properties:

The relationships:
(:Person)-[:SPEAKS]->(:Language)
(:Person)-[:WORKS_IN]->(:Company)
(:Person)-[:LIVES_IN]->(:Country)
(:Person)-[:EDUCATED_AT]->(:School)
(:Company)-[:IS_IN]->(:Industry)


We continue loading the previously prepared information into Neo4j using the LangChain utility.

In [7]:
# We set up the Cypher query to load the information from the csv that we have published on github
people_query = """
LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/jmunizwizeline/talent-land-2024/main/files/clean_data.csv'
AS row
MERGE (person:Person {name: row.name})
MERGE (company:Company {name: row.company})
MERGE (school:School {name: row.education})
MERGE (industry:Industry {name: row.industry})
MERGE (country:Country {name: row.country})

FOREACH (lang in split(row.languages, '|') | 
    MERGE (language:Language {name:trim(lang)})
    MERGE (person)-[:SPEAKS]->(language))

MERGE (person)-[:WORKS_IN]->(company)
MERGE (person)-[:LIVES_IN]->(country)
MERGE (person)-[:EDUCATED_AT]->(school)
MERGE (company)-[:IS_IN]->(industry)
"""

graph.query(people_query)

[]

Finally, we confirm that the schema in the database has been modified and explore the relationships it has created for us.

In [8]:
# We confirm that the schematic has been loaded
graph.refresh_schema()
print(graph.schema)

Node properties:
Person {name: STRING}
Company {name: STRING}
School {name: STRING}
Industry {name: STRING}
Country {name: STRING}
Language {name: STRING}
Chunk {embedding: LIST, id: STRING, text: STRING, question: STRING, query: STRING}
ExampleNode {embedding: LIST, id: STRING, question: STRING, query: STRING}
Relationship properties:

The relationships:
(:Person)-[:SPEAKS]->(:Language)
(:Person)-[:WORKS_IN]->(:Company)
(:Person)-[:LIVES_IN]->(:Country)
(:Person)-[:EDUCATED_AT]->(:School)
(:Company)-[:IS_IN]->(:Industry)


# 4 - Perform Our First Query on Our Knowledge Graph


In [9]:
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

# We create the chain with the graph and the LLM
chain = GraphCypherQAChain.from_llm(
    graph=graph,
    llm=llm,
    verbose=True,
    allow_dangerous_requests=True  # This is to allow the chain to execute any query
)

In [10]:
# List of questions that we want to run against the Knowledge Graph
questions = ["List all companies in Advertising Services industry!",
             "A worker who graduated from Simon Fraser University what is his name?",
             "Where is Paul Lukes working?",
             "A worker residing in Canada who is proficient in Vietnamese?",
             "How many workers from the United States speak Urdu?",
             "How many workers work for Capgemini?"]
for q in questions:
    print('====== START ======')
    print(chain.invoke(q)['result'])
    print('====== END ====== \n')



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:Company)-[:IS_IN]->(i:Industry {name: 'Advertising Services'})
RETURN c.name
[0m
Full Context:
[32;1m[1;3m[{'c.name': 'Toolbox Creative'}, {'c.name': 'Sirabi Joukakelian Zoboyan, PPCC'}, {'c.name': 'CURTI Costruzioni Meccaniche Spa - Aerospace and Meccatronic'}, {'c.name': 'SAP'}, {'c.name': 'Baked Advertising'}, {'c.name': 'Nevo Medical'}, {'c.name': 'SMB Capital'}, {'c.name': 'Zaldivar Child Consultant Agency'}, {'c.name': 'Studiovarustamo Oy'}, {'c.name': 'Search Engine People'}][0m

[1m> Finished chain.[0m
The companies in the Advertising Services industry are Baked Advertising and Search Engine People.



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (p:Person)-[:EDUCATED_AT]->(s:School {name: "Simon Fraser University"})
RETURN p.name
[0m
Full Context:
[32;1m[1;3m[{'p.name': 'Damoon Tahmasbi'}][0m

[1m> Finished chain.[0m


# 5 - Enhancing Prompting Strategy Using Role Prompting and Few-Shot Learning


In [11]:
# We define some examples to show the model more details of the domain's structure
examples= [
    {
        "question": "Which workers speak French?",
        "query": "MATCH (p:Person)-[:SPEAKS]->(l:Language {{name: 'French'}}) RETURN p.name",
    },
    {
        "question": "What industries are workers named Emily associated with?",
        "query": "MATCH (p:Person {{name: 'Emily'}})-[:WORKS_IN]->(c:Company)-[:IS_IN]->(i:Industry) RETURN i.name",
    },
    {
        "question": "Which workers live in Canada and speak German?",
        "query": "MATCH (p:Person)-[:LIVES_IN]->(:Country {{name: 'Canada'}}), (p)-[:SPEAKS]->(:Language {{name: 'German'}}) RETURN p.name",
    },
    {
        "question": "In which countries do workers who speak Spanish live?",
        "query": "MATCH (p:Person)-[:SPEAKS]->(:Language {{name: 'Spanish'}})<-[:SPEAKS]-(worker:Person)-[:LIVES_IN]->(c:Country) RETURN DISTINCT c.name AS Country",
    },
    {
        "question": "What companies do workers named John work in?",
        "query": "MATCH (p:Person {{name: 'John'}})-[:WORKS_IN]->(c:Company) RETURN c.name",
    },
    {
        "question":"How many workers in Hospital and Health Care industry able to speak Korea",
        "query": "MATCH (p:Person)-[:WORKS_IN]->(:Company)-[:IS_IN]->(:Industry {{name: 'Hospitals and Health Care'}}),(p)-[:SPEAKS]->(:Language {{name: 'Korean'}}) RETURN COUNT(DISTINCT p) AS NumberOfWorkers",
    },
    {
        "question": "What companies are located in the technology industry?",
        "query": "MATCH (c:Company)-[:IS_IN]->(:Industry {{name: 'Technology'}}) RETURN c.name",
    },
    {
        "question": "Where do workers named Alice live?",
        "query": "MATCH (p:Person {{name: 'Alice'}})-[:LIVES_IN]->(c:Country) RETURN c.name",
    },
]

In [None]:
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate

# We use another Langchain utility to implement the few-shot and prompting improvements
example_prompt = PromptTemplate.from_template(
    "User input: {question}\nCypher query: {query}"
)
prompt = FewShotPromptTemplate(
    examples=examples[:3],
    example_prompt=example_prompt,
    prefix="You are a Neo4j expert. Given an input question, create a syntactically correct Cypher query to run.\n\nHere is the schema information\n{schema}.\n\nBelow are a number of examples of questions and their corresponding Cypher queries.",
    suffix="User input: {question}\nCypher query: ",
    input_variables=["question", "schema"],
)

# We create a new connector with the new strategy that we have just created
chain2 = GraphCypherQAChain.from_llm(graph=graph, llm=llm, cypher_prompt=prompt, verbose=True, allow_dangerous_requests=True)

In [13]:
# This is an example of the prompt that we will run when we make a question
print(prompt.format(question="Where do Michael work?", schema="foo"))

You are a Neo4j expert. Given an input question, create a syntactically correct Cypher query to run.

Here is the schema information
foo.

Below are a number of examples of questions and their corresponding Cypher queries.

User input: Which workers speak French?
Cypher query: MATCH (p:Person)-[:SPEAKS]->(l:Language {name: 'French'}) RETURN p.name

User input: What industries are workers named Emily associated with?
Cypher query: MATCH (p:Person {name: 'Emily'})-[:WORKS_IN]->(c:Company)-[:IS_IN]->(i:Industry) RETURN i.name

User input: Which workers live in Canada and speak German?
Cypher query: MATCH (p:Person)-[:LIVES_IN]->(:Country {name: 'Canada'}), (p)-[:SPEAKS]->(:Language {name: 'German'}) RETURN p.name

User input: Where do Michael work?
Cypher query: 


In [14]:
# We run again the questions with this new improved strategy
questions = ["List all companies in Advertising Services industry!",
             "A worker who graduated from Simon Fraser University what is his name?",
             "Where is Paul Lukes working?",
             "A worker residing in Canada who is proficient in Vietnamese?",
             "How many workers from the United States speak Urdu?",
             "How many workers work for Capgemini?"]
for q in questions:
    print('====== START ======')
    chain2.invoke(q)
    print('====== END ====== \n')



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (c:Company)-[:IS_IN]->(i:Industry {name: 'Advertising Services'}) RETURN c.name[0m
Full Context:
[32;1m[1;3m[{'c.name': 'Toolbox Creative'}, {'c.name': 'Sirabi Joukakelian Zoboyan, PPCC'}, {'c.name': 'CURTI Costruzioni Meccaniche Spa - Aerospace and Meccatronic'}, {'c.name': 'SAP'}, {'c.name': 'Baked Advertising'}, {'c.name': 'Nevo Medical'}, {'c.name': 'SMB Capital'}, {'c.name': 'Zaldivar Child Consultant Agency'}, {'c.name': 'Studiovarustamo Oy'}, {'c.name': 'Search Engine People'}][0m

[1m> Finished chain.[0m



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person)-[:EDUCATED_AT]->(:School {name: 'Simon Fraser University'}) RETURN p.name[0m
Full Context:
[32;1m[1;3m[{'p.name': 'Damoon Tahmasbi'}][0m

[1m> Finished chain.[0m



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person {name: 'Paul Lu

# 6 - Enhancing Example Quality Using Similarity Search

In [40]:
from langchain_community.vectorstores import Neo4jVector
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_community.embeddings import HuggingFaceEmbeddings

# We use yet another Langchain utility to 
example_selector = SemanticSimilarityExampleSelector.from_examples(
    examples,
    HuggingFaceEmbeddings(),
    Neo4jVector,
    url = neo4j_url,
    username = neo4j_user,
    password = neo4j_password,
    k=4,
    input_keys=["question"],
)

# DEPRACATED!!!!!


  HuggingFaceEmbeddings(),


In [None]:
"""

from langchain_community.vectorstores.neo4j_vector import Neo4jVector
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document

docs = [
    Document(
        page_content=ex["question"],
        metadata={"question": ex["question"], "query": ex["query"]}
    )
    for ex in examples
]


# Initialize the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create the Neo4jVector instance
vectorstore = Neo4jVector.from_documents(
    docs,
    embedding_model,
    url=neo4j_url,
    username=neo4j_user,
    password=neo4j_password,
    index_name="example_index",
    node_label="ExampleNode",
    embedding_node_property="embedding",
    text_node_property="question",
)

# Create a SemanticSimilarityExampleSelector that selects examples based on the "question" key.
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
example_selector = SemanticSimilarityExampleSelector(
    vectorstore=vectorstore,
    k=4,
    input_keys=["question"],
)
"""


In [41]:
# Now we can see that the set of 3 examples that we have selected are better and also are sort
example_selector.select_examples({"question": "Where do Michael live?"})

[{'query': "MATCH (p:Person {{name: 'Alice'}})-[:LIVES_IN]->(c:Country) RETURN c.name",
  'question': 'Where do workers named Alice live?'},
 {'query': "MATCH (p:Person {{name: 'John'}})-[:WORKS_IN]->(c:Company) RETURN c.name",
  'question': 'What companies do workers named John work in?'},
 {'query': "MATCH (p:Person)-[:LIVES_IN]->(:Country {{name: 'Canada'}}), (p)-[:SPEAKS]->(:Language {{name: 'German'}}) RETURN p.name",
  'question': 'Which workers live in Canada and speak German?'},
 {'query': "MATCH (p:Person)-[:SPEAKS]->(:Language {{name: 'Spanish'}})<-[:SPEAKS]-(worker:Person)-[:LIVES_IN]->(c:Country) RETURN DISTINCT c.name AS Country",
  'question': 'In which countries do workers who speak Spanish live?'}]

In [42]:
# Use the example selector and get rid of the examples=examples[:3]
dynamic_prompt = FewShotPromptTemplate(
    example_selector=example_selector,
    example_prompt=example_prompt,
    prefix="You are a Neo4j expert. Given an input question, create a syntactically correct Cypher query to run.\n\nHere is the schema information\n{schema}.\n\nBelow are a number of examples of questions and their corresponding Cypher queries.",
    suffix="User input: {question}\nCypher query: ",
    input_variables=["question", "schema"],
)

# We create a new connector with the new strategy that we have just created
chain3 = GraphCypherQAChain.from_llm(graph=graph, cypher_prompt=dynamic_prompt, llm=llm, verbose=True, top_k=32, return_intermediate_steps=True, allow_dangerous_requests=True)

In [43]:
# We run again the questions with this new improved strategy
questions = questions = ["List all companies in Advertising Services industry!",
             "A worker who graduated from Simon Fraser University what is his name?",
             "Where is Paul Lukes working?",
             "A worker residing in Canada who is proficient in Vietnamese?",
             "How many workers from the United States speak Urdu?",
             "How many workers work for Capgemini?"]

for q in questions:
    print('====== START ======')
    chain3.invoke(q)
    print('====== END ====== \n')



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (c:Company)-[:IS_IN]->(:Industry {name: 'Advertising Services'}) RETURN c.name[0m
Full Context:
[32;1m[1;3m[{'c.name': 'Toolbox Creative'}, {'c.name': 'Sirabi Joukakelian Zoboyan, PPCC'}, {'c.name': 'CURTI Costruzioni Meccaniche Spa - Aerospace and Meccatronic'}, {'c.name': 'SAP'}, {'c.name': 'Baked Advertising'}, {'c.name': 'Nevo Medical'}, {'c.name': 'SMB Capital'}, {'c.name': 'Zaldivar Child Consultant Agency'}, {'c.name': 'Studiovarustamo Oy'}, {'c.name': 'Search Engine People'}][0m

[1m> Finished chain.[0m



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person)-[:EDUCATED_AT]->(:School {name: 'Simon Fraser University'}) RETURN p.name[0m
Full Context:
[32;1m[1;3m[{'p.name': 'Damoon Tahmasbi'}][0m

[1m> Finished chain.[0m



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person {name: 'Paul Luk