In [1]:
import os
import certifi

# SSL_CERT_FILE 환경 변수가 설정되어 있지 않거나,
# 설정된 경로에 파일이 존재하지 않는 경우, certifi의 CA 번들 경로로 설정
# 또는 현재 설정된 SSL_CERT_FILE이 certifi의 경로와 다른 경우에도 certifi로 강제
current_ssl_cert_file = os.environ.get('SSL_CERT_FILE')
certifi_path = certifi.where()

if not current_ssl_cert_file or current_ssl_cert_file != certifi_path:
    print(f"기존 SSL_CERT_FILE: {current_ssl_cert_file}")
    print(f"certifi CA 경로로 SSL_CERT_FILE 설정: {certifi_path}")
    os.environ['SSL_CERT_FILE'] = certifi_path
else:
    print(f"SSL_CERT_FILE이 이미 certifi 경로로 설정됨: {current_ssl_cert_file}")

기존 SSL_CERT_FILE: None
certifi CA 경로로 SSL_CERT_FILE 설정: c:\Users\jjw37\neo4j-rag\neo4j-graphrag\.venv\Lib\site-packages\certifi\cacert.pem


In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
from langchain_neo4j import Neo4jGraph

graph = Neo4jGraph( 
    url=os.getenv("NEO4J_URI"), 
    username=os.getenv("NEO4J_USERNAME"), 
    password=os.getenv("NEO4J_PASSWORD"),
    database=os.getenv("NEO4J_DATABASE"),
    enhanced_schema=True
)

### 데이터베이스 생성

In [4]:
def reset_database(graph):
    """
    데이터베이스 초기화하기
    """
    # 모든 노드와 관계 삭제
    graph.query("MATCH (n) DETACH DELETE n")
    
    # 모든 제약조건 삭제
    constraints = graph.query("SHOW CONSTRAINTS")
    for constraint in constraints:
        constraint_name = constraint.get("name")
        if constraint_name:
            graph.query(f"DROP CONSTRAINT {constraint_name}")
    
    # 모든 인덱스 삭제
    indexes = graph.query("SHOW INDEXES")
    for index in indexes:
        index_name = index.get("name")
        index_type = index.get("type")
        if index_name and index_type != "CONSTRAINT":
            graph.query(f"DROP INDEX {index_name}")
    
    print("데이터베이스가 초기화되었습니다.")

# 데이터베이스 초기화
reset_database(graph)

데이터베이스가 초기화되었습니다.


In [5]:
graph.query("MATCH (n) RETURN n LIMIT 5")

[]

In [6]:
# 제약조건 및 인덱스 생성
def create_constraints_and_indexes(graph):
    """
    필요한 제약조건과 인덱스를 생성합니다.
    """
    
    # 제약조건 생성 (중복 방지 및 데이터 무결성)
    constraints = [
        "CREATE CONSTRAINT category_name_unique IF NOT EXISTS FOR (c:Category) REQUIRE c.name IS UNIQUE",
        "CREATE CONSTRAINT subcategory_name_unique IF NOT EXISTS FOR (sc:SubCategory) REQUIRE sc.name IS UNIQUE", 
        "CREATE CONSTRAINT flavor_name_unique IF NOT EXISTS FOR (f:Flavor) REQUIRE f.name IS UNIQUE",
        "CREATE CONSTRAINT subflavor_name_unique IF NOT EXISTS FOR (sf:SubFlavor) REQUIRE sf.name IS UNIQUE"
    ]
    
    # 인덱스 생성 (쿼리 성능 향상)
    indexes = [
        "CREATE INDEX category_name_index IF NOT EXISTS FOR (c:Category) ON (c.name)",
        "CREATE INDEX subcategory_name_index IF NOT EXISTS FOR (sc:SubCategory) ON (sc.name)",
        "CREATE INDEX flavor_name_index IF NOT EXISTS FOR (f:Flavor) ON (f.name)",
        "CREATE INDEX subflavor_name_index IF NOT EXISTS FOR (sf:SubFlavor) ON (sf.name)"
    ]
    
    print("제약조건 생성 중...")
    for constraint in constraints:
        try:
            graph.query(constraint)
            print(f"✓ {constraint.split()[2]} 제약조건 생성 완료")
        except Exception as e:
            print(f"✗ 제약조건 생성 실패: {e}")
    
    print("\n인덱스 생성 중...")
    for index in indexes:
        try:
            graph.query(index)
            print(f"✓ {index.split()[2]} 인덱스 생성 완료")
        except Exception as e:
            print(f"✗ 인덱스 생성 실패: {e}")
    
    print("\n제약조건 및 인덱스 생성이 완료되었습니다.")

# 제약조건 및 인덱스 생성 실행
create_constraints_and_indexes(graph)


제약조건 생성 중...
✓ category_name_unique 제약조건 생성 완료
✓ subcategory_name_unique 제약조건 생성 완료
✓ flavor_name_unique 제약조건 생성 완료
✓ subflavor_name_unique 제약조건 생성 완료

인덱스 생성 중...
✓ category_name_index 인덱스 생성 완료
✓ subcategory_name_index 인덱스 생성 완료
✓ flavor_name_index 인덱스 생성 완료
✓ subflavor_name_index 인덱스 생성 완료

제약조건 및 인덱스 생성이 완료되었습니다.


In [7]:
allnode_query = """
MATCH (n)
WHERE n:Category OR n:SubCategory OR n:Flavor OR n:SubFlavor
SET n:AllNode
"""

graph.query(allnode_query)

[]

In [8]:
cypher_query = """
LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/Bushanhui/brewingrecipe/refs/heads/main/ccc_flavor_graph_updated_250629.csv' AS row

// 공통 노드 생성
MERGE (c:Category {name: row.Category})
MERGE (sc:SubCategory {name: row.`Sub-category`})
MERGE (f:Flavor {name: row.Flavor})

// 공통 관계 연결
MERGE (c)-[:INCLUDES]->(sc)
MERGE (sc)-[:INCLUDES]->(f)

// 조건: Sub-flavor가 존재할 경우에만 노드 및 관계 생성
WITH row, f
WHERE row.`Sub-flavor` IS NOT NULL AND trim(row.`Sub-flavor`) <> ''
MERGE (sf:SubFlavor {name: row.`Sub-flavor`})
MERGE (f)-[:INCLUDES]->(sf)
"""

graph.query(cypher_query)

[]

In [9]:
graph.query("SHOW DATABASES")

[{'name': 'neo4j',
  'type': 'standard',
  'aliases': [],
  'access': 'read-write',
  'address': 'localhost:7687',
  'role': 'primary',
  'writer': True,
  'requestedStatus': 'online',
  'currentStatus': 'online',
  'statusMessage': '',
  'default': True,
  'home': True,
  'constituents': []},
 {'name': 'system',
  'type': 'system',
  'aliases': [],
  'access': 'read-write',
  'address': 'localhost:7687',
  'role': 'primary',
  'writer': True,
  'requestedStatus': 'online',
  'currentStatus': 'online',
  'statusMessage': '',
  'default': False,
  'home': False,
  'constituents': []}]

In [10]:
# 생성된 제약조건과 인덱스 확인
print("=== 생성된 제약조건 ===")
constraints = graph.query("SHOW CONSTRAINTS")
for constraint in constraints:
    print(f"- {constraint['name']}: {constraint['labelsOrTypes']} ({constraint['properties']})")

print("\n=== 생성된 인덱스 ===")
indexes = graph.query("SHOW INDEXES")
for index in indexes:
    if index['type'] != 'CONSTRAINT':
        print(f"- {index['name']}: {index['labelsOrTypes']} ({index['properties']})")

print("\n=== 데이터 확인 ===")
result = graph.query("""
MATCH (c:Category)-[:INCLUDES]->(sc:SubCategory)-[:INCLUDES]->(f:Flavor)
RETURN c.name as category, sc.name as subcategory, f.name as flavor
LIMIT 5
""")
for row in result:
    print(f"카테고리: {row['category']} -> 서브카테고리: {row['subcategory']} -> 맛: {row['flavor']}")

=== 생성된 제약조건 ===
- category_name_unique: ['Category'] (['name'])
- flavor_name_unique: ['Flavor'] (['name'])
- subcategory_name_unique: ['SubCategory'] (['name'])
- subflavor_name_unique: ['SubFlavor'] (['name'])

=== 생성된 인덱스 ===
- category_name_unique: ['Category'] (['name'])
- flavor_name_unique: ['Flavor'] (['name'])
- subcategory_name_unique: ['SubCategory'] (['name'])
- subflavor_name_unique: ['SubFlavor'] (['name'])

=== 데이터 확인 ===
카테고리: fruit -> 서브카테고리: tropical fruit -> 맛: tropical juice
카테고리: fruit -> 서브카테고리: tropical fruit -> 맛: lychee juice
카테고리: fruit -> 서브카테고리: tropical fruit -> 맛: tropical candy
카테고리: fruit -> 서브카테고리: tropical fruit -> 맛: banana
카테고리: fruit -> 서브카테고리: tropical fruit -> 맛: green papaya


In [11]:
fulltext_index_query = """
CREATE FULLTEXT INDEX flavorFulltextIndex
IF NOT EXISTS
FOR (n:Flavor | SubFlavor | SubCategory | Category)
ON EACH [n.name]
"""

graph.query(fulltext_index_query)

[]

In [12]:
graph.query("""SHOW FULLTEXT INDEXES""")

[{'id': 9,
  'name': 'flavorFulltextIndex',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'FULLTEXT',
  'entityType': 'NODE',
  'labelsOrTypes': ['Flavor', 'SubFlavor', 'SubCategory', 'Category'],
  'properties': ['name'],
  'indexProvider': 'fulltext-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': None}]

### 검색

In [13]:
search_query = """
CALL db.index.fulltext.queryNodes("flavorFulltextIndex", $search_term)
YIELD node, score
RETURN node.name AS flavor, score AS SearchRelevance
ORDER BY SearchRelevance DESC
LIMIT 10
"""

graph.query(search_query, params={"search_term": "lemon"})

[{'flavor': 'lemon', 'SearchRelevance': 1.8783544301986694},
 {'flavor': 'lemon jest', 'SearchRelevance': 1.4283926486968994},
 {'flavor': 'yellow lemon', 'SearchRelevance': 1.4283926486968994},
 {'flavor': 'lemon rosso', 'SearchRelevance': 1.4283926486968994},
 {'flavor': 'lemon peel', 'SearchRelevance': 1.4283926486968994},
 {'flavor': 'lemon zest', 'SearchRelevance': 1.4283926486968994},
 {'flavor': 'lemon candy', 'SearchRelevance': 1.4283926486968994},
 {'flavor': 'lemon juice', 'SearchRelevance': 1.4283926486968994},
 {'flavor': 'lemon balm', 'SearchRelevance': 1.4283926486968994},
 {'flavor': 'candid lemon peel', 'SearchRelevance': 1.1523466110229492}]

In [17]:
fuzzy_search_query = """
CALL db.index.fulltext.queryNodes("flavorFulltextIndex", $search_term)
YIELD node, score
RETURN node.name AS flavor, score AS SearchRelevance
ORDER BY SearchRelevance DESC
LIMIT 10
"""

fuzzy_result = graph.query(fuzzy_search_query, params={"search_term": "bery~0.7"})

for result in fuzzy_result:
    print(f'{result["flavor"]} - {result["SearchRelevance"]}')

berry - 1.7959753274917603
red berry - 1.3631441593170166
mixed berry - 1.3631441593170166


In [18]:
wildcard_search_query = """
CALL db.index.fulltext.queryNodes("flavorFulltextIndex", $search_term)
YIELD node, score
RETURN node.name AS flavor, score AS SearchRelevance
ORDER BY SearchRelevance DESC
LIMIT 10
"""

wildcard_result = graph.query(wildcard_search_query, params={"search_term": "*berry*"})

for result in wildcard_result:
    print(f'{result["flavor"]} - {result["SearchRelevance"]}')

blueberry jam - 1.0
strawberry jam - 1.0
raspberry jam - 1.0
red berry - 1.0
mixed berry - 1.0
blackberry - 1.0
blueberry - 1.0
strawberry - 1.0
raspberry - 1.0
blueberry chocolate - 1.0


In [20]:
phrase_search_query = """
CALL db.index.fulltext.queryNodes("flavorFulltextIndex", $search_term)
YIELD node, score
RETURN node.name AS flavor, score AS SearchRelevance
ORDER BY SearchRelevance DESC
LIMIT 10
"""

phrase_result = graph.query(phrase_search_query, params={"search_term": "\"citrus fruit\""})

for result in phrase_result:
    print(f'{result["flavor"]} - {result["SearchRelevance"]}')

citrus fruit - 3.0276856422424316


In [7]:
from langchain_huggingface import HuggingFaceEmbeddings

In [8]:
embeddings = HuggingFaceEmbeddings(
    model_name = "BAAI/bge-m3",
    cache_folder = "data/hf_embedding_cache/bge-m3",
    model_kwargs = {"device": "cuda"},
    encode_kwargs = {"query_instruction": ""},
)

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
vector_index_query = """
CREATE VECTOR INDEX all_node_index IF NOT EXISTS
FOR (n:AllNode)
ON (n.content_embedding)
OPTIONS {
    indexConfig: {
      `vector.dimensions`: 1024,
      `vector.similarity_function`: 'cosine'
    }
}
"""
graph.query(vector_index_query)

[]

In [27]:
vector_index_query = """
CREATE VECTOR INDEX category_index IF NOT EXISTS
FOR (n:Category)
ON (n.content_embedding)
OPTIONS {
    indexConfig: {
      `vector.dimensions`: 1024,
      `vector.similarity_function`: 'cosine'
    }
}
"""

graph.query(vector_index_query)

vector_index_query = """
CREATE VECTOR INDEX subcategory_index IF NOT EXISTS
FOR (n:SubCategory)
ON (n.content_embedding)
OPTIONS {
    indexConfig: {
      `vector.dimensions`: 1024,
      `vector.similarity_function`: 'cosine'
    }
}
"""

graph.query(vector_index_query)

vector_index_query = """
CREATE VECTOR INDEX flavor_index IF NOT EXISTS
FOR (n:Flavor)
ON (n.content_embedding)
OPTIONS {
    indexConfig: {
      `vector.dimensions`: 1024,
      `vector.similarity_function`: 'cosine'
    }
}
"""

graph.query(vector_index_query)

vector_index_query = """
CREATE VECTOR INDEX subflavor_index IF NOT EXISTS
FOR (n:SubFlavor)
ON (n.content_embedding)
OPTIONS {
    indexConfig: {
      `vector.dimensions`: 1024,
      `vector.similarity_function`: 'cosine'
    }
}
"""

graph.query(vector_index_query)

[]

In [9]:
check_vector_index_query="""
SHOW VECTOR INDEXES
"""

vector_indexes = graph.query(check_vector_index_query)
for index in vector_indexes:
    print(f"Index Name: {index['name']}")
    print(f"Index Type: {index['type']}")
    print(f"Properties: {index['properties']}")
    print("-"*40)

Index Name: category_index
Index Type: VECTOR
Properties: ['content_embedding']
----------------------------------------
Index Name: flavor_index
Index Type: VECTOR
Properties: ['content_embedding']
----------------------------------------
Index Name: subcategory_index
Index Type: VECTOR
Properties: ['content_embedding']
----------------------------------------
Index Name: subflavor_index
Index Type: VECTOR
Properties: ['content_embedding']
----------------------------------------


In [11]:
labels = ["Flavor", "SubFlavor", "SubCategory", "Category"]

for label in labels:
    node_query = f"""
    MATCH (n:{label})
    WHERE n.name IS NOT NULL
    RETURN n.name AS name
    """

    nodes = graph.query(node_query)

    for node in nodes:
        name = node["name"]
        embedding = embeddings.embed_documents([name])[0]

        update_query = f"""
        MATCH (n:{label} {{name: $name}})
        SET n.content_embedding = $embedding
        RETURN count(n) as updated
        """

        graph.query(update_query, params={"name": name, "embedding": embedding})

In [10]:
# remove_query = """
# MATCH (n)
# REMOVE n.content_embedding
# """

# graph.query(remove_query)

[]

In [9]:
def semantic_search(search_text, k=5):
    query_embedding = embeddings.embed_query(search_text)

    vector_search_query = """
    CALL db.index.vector.queryNodes(
        'flavor_index',
        $top_k,
        $query_embedding
    ) YIELD node, score
    RETURN node.name AS name, score
    """

    results = graph.query(
        vector_search_query,
        params={
            "top_k": k,
            "query_embedding": query_embedding
        }
    )

    return results    

In [10]:
search_text = "peach"

results = semantic_search(search_text)

for result in results:
    print(f"Name: {result['name']}, Score: {result['score']}")

Name: peach, Score: 0.9992799758911133
Name: pear, Score: 0.8749494552612305
Name: pecan, Score: 0.8692240715026855
Name: papaya, Score: 0.8316278457641602
Name: pineapple, Score: 0.8314447402954102


In [17]:
search_text = "sweet and fruity"

results = semantic_search(search_text)

for result in results:
    print(f"Name: {result['name']}, Score: {result['score']}")

Name: fruit candy, Score: 0.8967852592468262
Name: fruit cocktail, Score: 0.8638811111450195
Name: fruit tea, Score: 0.8463926315307617
Name: pickled fruit, Score: 0.8454952239990234
Name: passion fruit, Score: 0.840181827545166


#### 각 노드의 속성에 이름 외에도 '보편성', '빈도' 같이 검색 결과의 랭킹의 기준이 될 수 있는 속성을 더하는 것도 가능할 것 같음

In [18]:
search_text = "달콤한 과일향"

results = semantic_search(search_text)

for result in results:
    print(f"Name: {result['name']}, Score: {result['score']}")

Name: fruit candy, Score: 0.8656268119812012
Name: tropical candy, Score: 0.8228607177734375
Name: fruit cocktail, Score: 0.8198614120483398
Name: orange candy, Score: 0.8150873184204102
Name: dried fruit, Score: 0.815086841583252


In [21]:
graph.refresh_schema()
print(graph.schema)

Node properties:
- **Category**
  - `name`: STRING Example: "beverage"
- **SubCategory**
  - `name`: STRING Example: "beverage"
- **Flavor**
  - `name`: STRING Example: "champagne sparkling"
- **SubFlavor**
  - `name`: STRING Example: "rosewater"
Relationship properties:

The relationships:
(:Category)-[:INCLUDES]->(:SubCategory)
(:SubCategory)-[:INCLUDES]->(:Flavor)
(:Flavor)-[:INCLUDES]->(:SubFlavor)


In [11]:
from langchain_openai import ChatOpenAI
from langchain_neo4j import GraphCypherQAChain

In [12]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [16]:
cypher_chain = GraphCypherQAChain.from_llm(
    llm = llm,
    graph = graph,
    allow_dangerous_requests=True,
    verbose=True
)

In [17]:
answer = cypher_chain.invoke({'query': 'What is the name of the flavor that is most similar to "berry"?'})
print(answer)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:Category)-[:INCLUDES]->(sc:SubCategory)-[:INCLUDES]->(f:Flavor)
WHERE f.name CONTAINS "berry"
RETURN f.name
[0m
Full Context:
[32;1m[1;3m[{'f.name': 'blackberry'}, {'f.name': 'blueberry'}, {'f.name': 'blueberry chocolate'}, {'f.name': 'cranberry'}, {'f.name': 'mixed berry'}, {'f.name': 'raspberry'}, {'f.name': 'red berry'}, {'f.name': 'strawberry'}][0m

[1m> Finished chain.[0m
{'query': 'What is the name of the flavor that is most similar to "berry"?', 'result': 'The flavor that is most similar to "berry" is mixed berry.'}


In [23]:
answer = cypher_chain.invoke({'query': "MATCH (start:Flavor {name: 'peach'})-[*..2]-(connected) RETURN connected.name 쿼리를 실행해줘"})
print(answer)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (start:Flavor {name: 'peach'})-[*..2]-(connected) RETURN connected.name[0m
Full Context:
[32;1m[1;3m[{'connected.name': 'nectarine'}, {'connected.name': 'peach juice'}, {'connected.name': 'peach iced tea'}, {'connected.name': 'white peach'}, {'connected.name': 'apricot'}, {'connected.name': 'apricot jam'}, {'connected.name': 'peach tea'}, {'connected.name': 'yellow peach'}, {'connected.name': 'peach jam'}, {'connected.name': 'stone fruit'}][0m

[1m> Finished chain.[0m
{'query': "MATCH (start:Flavor {name: 'peach'})-[*..2]-(connected) RETURN connected.name 쿼리를 실행해줘", 'result': 'The flavors connected to peach are nectarine, peach juice, peach iced tea, white peach, apricot, apricot jam, peach tea, yellow peach, peach jam, and stone fruit.'}


In [26]:
answer = cypher_chain.invoke({'query': "모든 종류의 노드 중에서 name 속성이 'peach'인 노드를 찾고, 어떤 관계든 상관없이 2단계 이내로 연결된 모든 노드들을 찾아줘"})
print(answer)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (n {name: 'peach'})-[:INCLUDES*1..2]->(m) RETURN m
[0m
Full Context:
[32;1m[1;3m[{'m': {'name': 'nectarine', 'content_embedding': [0.006853054743260145, 0.0031263341661542654, -0.01122612040489912, 0.003333860542625189, -0.013913421891629696, 0.016365746036171913, -0.01979779452085495, 0.022879457101225853, 0.0033096179831773043, 0.03535079210996628, 0.03102763369679451, -0.018273908644914627, -0.0321488194167614, -0.028299503028392792, 0.02405344322323799, -0.03561544045805931, -0.012940572574734688, -0.002464959630742669, -0.02256644144654274, -0.05747599899768829, 0.019302330911159515, -0.02987484820187092, 0.004578710068017244, 0.019193941727280617, 0.0074603501707315445, 0.06500763446092606, 0.003169317962601781, 0.012827758677303791, 0.0089299650862813, 0.005058636888861656, -0.009925137273967266, -0.012590286321938038, -0.040746189653873444, -0.06101135164499283, 0.008091248571872

In [28]:
search_query = "MATCH (n {name: 'peach'})-[:INCLUDES*1..2]->(m) RETURN m.name"

graph.query(search_query)

[{'m.name': 'nectarine'},
 {'m.name': 'peach juice'},
 {'m.name': 'peach iced tea'},
 {'m.name': 'white peach'},
 {'m.name': 'apricot'},
 {'m.name': 'apricot jam'},
 {'m.name': 'peach tea'},
 {'m.name': 'yellow peach'},
 {'m.name': 'peach jam'}]

In [29]:
from langchain_neo4j import Neo4jVector

In [32]:
graph_db = Neo4jVector.from_existing_index(
    embeddings,
    url=os.getenv("NEO4J_URI"),
    username=os.getenv("NEO4J_USER"),
    password=os.getenv("NEO4J_PASSWORD"),
    index_name="all_node_index",
    text_node_property="name"
)

In [34]:
query = "초콜릿"

similar_docs = graph_db.similarity_search_with_score(
    query,
    k=5,
    return_embeddings=False
)

for doc, score in similar_docs:
    print(doc, score)

page_content='cocoa' 0.8466353416442871
page_content='coconut' 0.8429388999938965
page_content='chocolate' 0.8336925506591797
page_content='chocolate' 0.8336925506591797
page_content='milk chocolate' 0.8275504112243652


In [35]:
doc.metadata

{}