In [None]:
!pip install neo4j



In [None]:
!pip install sentence-transformers



In [None]:
!pip install graphdatascience

Collecting graphdatascience
  Downloading graphdatascience-1.11-py3-none-any.whl.metadata (7.4 kB)
Collecting multimethod<2.0,>=1.0 (from graphdatascience)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting textdistance<5.0,>=4.0 (from graphdatascience)
  Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)
Downloading graphdatascience-1.11-py3-none-any.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ------------------------- -------------- 1.0/1.6 MB 10.1 MB/s eta 0:00:01
   ---------------------------------------- 1.6/1.6 MB 5.9 MB/s eta 0:00:00
Downloading multimethod-1.12-py3-none-any.whl (10 kB)
Downloading textdistance-4.6.3-py3-none-any.whl (31 kB)
Installing collected packages: textdistance, multimethod, graphdatascience
Successfully installed graphdatascience-1.11 multimethod-1.12 textdistance-4.6.3


In [None]:
import os
import uuid
import random
import neo4j
from sentence_transformers import SentenceTransformer
from neo4j import GraphDatabase

  from tqdm.autonotebook import tqdm, trange





In [None]:
# Load the multilingual MiniLM sentence-embedding model once; later cells call
# `model.encode(...)` on file text and on the search query.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
# Read every .txt file in `folder_path` into `file_data_dict`, keyed by a random
# numeric ID of at most nine digits. Each value holds the file name and its text.
folder_path = 'text_files'
file_data_dict = {}

for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(folder_path, filename)
        # Decode explicitly as UTF-8: the platform default (e.g. cp1252 on
        # Windows) would mangle or reject non-ASCII text.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Truncating a UUID to nine digits can collide; redraw until the ID is
        # unused so one file never silently overwrites another.
        unique_id = int(uuid.uuid4()) % 1000000000
        while unique_id in file_data_dict:
            unique_id = int(uuid.uuid4()) % 1000000000
        file_data_dict[unique_id] = {
            'filename': filename,
            'content': content
        }

In [None]:
# Collect the file IDs and shuffle them in place; this order is what the
# round-robin assignment below walks through. No seed is set in the cells
# shown, so the order differs from run to run.
file_ids = list(file_data_dict.keys())
random.shuffle(file_ids)

In [None]:
# Dump the whole mapping for inspection (this prints every file's full text).
print(file_data_dict)

{68303731: {'filename': 'data.txt', 'content': 'Artificial intelligence (AI) has evolved rapidly since its inception. Early AI focused on simple tasks and rule-based systems, but advancements in machine learning and neural networks have transformed it. Today, AI can recognize speech, drive cars, and even create art. These advancements are driven by increased computational power, vast data availability, and sophisticated algorithms. As AI continues to evolve, ethical considerations and the potential for widespread societal impact are at the forefront of discussions. The future promises even more integration of AI into daily life, potentially revolutionizing industries and the way we interact with technology.\n'}, 913707013: {'filename': 'data1.txt', 'content': 'Climate change is one of the most pressing global issues today. Rising temperatures, melting ice caps, and more frequent extreme weather events are just a few manifestations. These changes threaten biodiversity, food security, an

In [None]:
def assign_files_to_names(file_data_dict, file_ids, names_list):
    """
    Deal files out to names one at a time, cycling through the names in order.

    Args:
        file_data_dict (dict): Maps each file ID to a dict holding at least
            'filename' and 'content'.
        file_ids (list): File IDs in the order they should be dealt out.
        names_list (list): Names that receive the files.

    Returns:
        dict: Maps each name to a list of {'id', 'filename', 'content'} dicts,
        in the order the files were dealt. Names that receive nothing map to
        an empty list.
    """
    # Every name gets its own (initially empty) bucket.
    names_dict = {}
    for name in names_list:
        names_dict[name] = []

    name_count = len(names_list)
    for position, file_id in enumerate(file_ids):
        # The position modulo the number of names picks the next owner in turn.
        owner = names_list[position % name_count]
        source = file_data_dict[file_id]
        # Key order is fixed: id, then filename, then content.
        entry = {'id': file_id}
        entry['filename'] = source['filename']
        entry['content'] = source['content']
        names_dict[owner].append(entry)

    return names_dict


In [None]:
# The people the shuffled files are dealt out to, in round-robin order.
names_list=["Amaan", "Huzan", "Umair", "Waleed"]
names_dict = assign_files_to_names(file_data_dict, file_ids, names_list)
# Prints every assigned file including its full content.
print(names_dict)

{'Amaan': [{'id': 478708217, 'filename': 'data2.txt', 'content': 'The COVID-19 pandemic accelerated the adoption of remote work, transforming the traditional workplace. Companies and employees have discovered the benefits of flexibility, reduced commute times, and the ability to work from anywhere. However, remote work also presents challenges, such as maintaining productivity, ensuring cybersecurity, and managing work-life balance. Organizations are now exploring hybrid models that combine remote and in-office work. As technology continues to improve, remote work is likely to remain a significant component of the modern work environment, offering both opportunities and challenges for businesses and employees alike.'}, {'id': 664475849, 'filename': 'data15.txt', 'content': 'The future of transportation is being shaped by technological advancements and the need for sustainable solutions. Electric vehicles (EVs) are gaining popularity as they reduce emissions and dependence on fossil fue

In [None]:
def extract_metadata(names_dict):
    """Turn each person's list of file dicts into parallel per-field lists.

    Args:
        names_dict (dict): Maps a name to a list of dicts that each carry
            'id', 'filename' and 'content'.

    Returns:
        dict: Maps each name to a dict with 'name' and the parallel lists
        'file_id', 'filename' and 'content', one entry per file in the
        original order.
    """
    def column(paragraphs, field):
        # Pull one field out of every file dict, preserving order.
        return [para[field] for para in paragraphs]

    return {
        name: {
            'name': name,
            'file_id': column(paragraphs, 'id'),
            'filename': column(paragraphs, 'filename'),
            'content': column(paragraphs, 'content'),
        }
        for name, paragraphs in names_dict.items()
    }

In [None]:
# Build the per-person records (name plus parallel file_id/filename/content lists)
# and print one record per person, each followed by a blank line.
metadata = extract_metadata(names_dict)
for key, value in metadata.items():
    print(f"{key}: {value}\n")

Amaan: {'name': 'Amaan', 'file_id': [478708217, 664475849, 913707013, 799420906], 'filename': ['data2.txt', 'data15.txt', 'data1.txt', 'data12.txt'], 'content': ['The COVID-19 pandemic accelerated the adoption of remote work, transforming the traditional workplace. Companies and employees have discovered the benefits of flexibility, reduced commute times, and the ability to work from anywhere. However, remote work also presents challenges, such as maintaining productivity, ensuring cybersecurity, and managing work-life balance. Organizations are now exploring hybrid models that combine remote and in-office work. As technology continues to improve, remote work is likely to remain a significant component of the modern work environment, offering both opportunities and challenges for businesses and employees alike.', 'The future of transportation is being shaped by technological advancements and the need for sustainable solutions. Electric vehicles (EVs) are gaining popularity as they redu

In [None]:
# Connection settings for the Neo4j instance. Each value can be overridden with
# an environment variable so real credentials never have to be written into the
# notebook; the fallbacks are the local development values used so far.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "12345678")

# One driver is shared by every later cell that opens a session.
driver = GraphDatabase.driver(uri, auth=(username, password))
print(driver)

<neo4j._sync.driver.BoltDriver object at 0x0000025BD8D51C50>


In [None]:
def delete_all_nodes(tx):
    """Delete every node, together with its relationships, via the given transaction."""
    wipe_everything = "MATCH (n) DETACH DELETE n"
    tx.run(wipe_everything)

# Run the wipe as a single managed write transaction; this removes every node
# in the database the driver is connected to.
with driver.session() as session:
    session.execute_write(delete_all_nodes)


In [None]:
# `content` is whatever the file-loading loop assigned last, i.e. the text of
# the last .txt file it read. This cell only checks what type `encode` returns.
vector = model.encode(content)
print(type(vector))

<class 'numpy.ndarray'>


In [None]:
def create_person_and_files(tx, person_data):
    """Create one Person node, its File nodes and the HAS_FILE relationships.

    Each File node also receives a ``vector`` property holding the embedding of
    its content. The cosine-similarity query in this notebook reads
    ``c.vector``; if that property is never written, every score is null.

    Args:
        tx: Neo4j transaction; only ``tx.run(query, **params)`` is used.
        person_data (dict): Must contain ``name`` and the parallel lists
            ``file_id``, ``filename`` and ``content``. An optional ``vector``
            list (one embedding per file) is used as given; when it is absent
            the contents are encoded with the notebook-level ``model``.
    """
    name = person_data['name']
    contents = list(person_data['content'])

    vectors = person_data.get('vector')
    if vectors is None:
        # Encode all of this person's files in one batch; skip the model call
        # entirely when there is nothing to encode.
        vectors = model.encode(contents) if contents else []

    files = [
        {
            "file_id": file_id,
            "filename": filename,
            "content": content,
            # Store plain Python floats so the property is an ordinary list of
            # floats regardless of the array type the encoder returned.
            "vector": [float(component) for component in vector],
        }
        for file_id, filename, content, vector in zip(
            person_data['file_id'], person_data['filename'], contents, vectors
        )
    ]

    # Create person node
    tx.run("""
        MERGE (p:Person {name: $name})
    """, name=name)

    # Create file nodes and relationships, then attach content and embedding
    tx.run("""
        UNWIND $files AS file
        MERGE (f:File {file_id: file.file_id, filename: file.filename, name: $name})
        WITH f,file
        MATCH (p:Person {name: $name})
        MERGE (p)-[:HAS_FILE]->(f)
        SET f.content = file.content, f.vector = file.vector
    """, name=name, files=files)


# Usage example (assuming `driver` and `metadata` are already defined):
# load_metadata_to_db(metadata)

In [None]:
def load_metadata_to_db(file_data_dict):
    """Write every person record in the given mapping to Neo4j.

    Uses the notebook-level ``driver`` to open one session and runs
    ``create_person_and_files`` as a separate managed write transaction for
    each record.

    Args:
        file_data_dict (dict): Mapping whose values are per-person records as
            accepted by ``create_person_and_files`` (the keys are not used).
    """
    with driver.session() as session:
        # Iterate the argument itself rather than the notebook-level `metadata`,
        # so the function loads exactly what the caller passed in.
        for person_data in file_data_dict.values():
            session.execute_write(create_person_and_files, person_data)


In [None]:
# Write the per-person records built earlier into Neo4j.
load_metadata_to_db(metadata)

Failed to write data to connection IPv4Address(('localhost', 7687)) (ResolvedIPv4Address(('127.0.0.1', 7687)))
Transaction failed and will be retried in 1.1688901351012955s (Failed to write data to connection IPv4Address(('localhost', 7687)) (ResolvedIPv4Address(('127.0.0.1', 7687))))


In [None]:
# Embed the search term with the same model used for the file text;
# `query_vector` is what the similarity search further down receives.
query = "art"
query_vector = model.encode(query)

In [None]:
def run_query(tx):
    """Return name, filename and file_id for every File node as a list of records."""
    cypher = """
        MATCH (n:File)
        RETURN n.name AS name, n.filename AS filename, n.file_id AS file_id
    """
    # Materialise the records before the transaction ends.
    return [record for record in tx.run(cypher)]

def execute_query(driver):
    """Run ``run_query`` in a read transaction, print each record, and return the records.

    Args:
        driver: Object whose ``session()`` yields a context-managed session
            offering ``execute_read``.

    Returns:
        Whatever ``run_query`` returned.
    """
    with driver.session() as session:
        records = session.execute_read(run_query)
        # One three-line summary per record.
        for row in records:
            print(f"Name: {row['name']}")
            print(f"Filename: {row['filename']}")
            print(f"File ID: {row['file_id']}")
        return records

# List every File node; the returned records are kept in `result`.
result = execute_query(driver)


Name: Amaan
Filename: data2.txt
File ID: 478708217
Name: Amaan
Filename: data15.txt
File ID: 664475849
Name: Amaan
Filename: data1.txt
File ID: 913707013
Name: Amaan
Filename: data12.txt
File ID: 799420906
Name: Huzan
Filename: data11.txt
File ID: 624085277
Name: Huzan
Filename: data13.txt
File ID: 955084061
Name: Huzan
Filename: data14.txt
File ID: 923760653
Name: Huzan
Filename: data3.txt
File ID: 589436211
Name: Umair
Filename: data10.txt
File ID: 329172251
Name: Umair
Filename: data8.txt
File ID: 223363074
Name: Umair
Filename: data.txt
File ID: 68303731
Name: Umair
Filename: data4.txt
File ID: 904676878
Name: Waleed
Filename: data6.txt
File ID: 579357924
Name: Waleed
Filename: data7.txt
File ID: 69624098
Name: Waleed
Filename: data9.txt
File ID: 306053457
Name: Waleed
Filename: data5.txt
File ID: 234740196


In [None]:

def run_query(tx, name='Amaan'):
    """Return name, filename and file_id for the File nodes that belong to one person.

    Args:
        tx: Neo4j transaction; only ``tx.run`` is used.
        name (str): Value the File node's ``name`` property must equal.
            Defaults to 'Amaan' so ``session.execute_read(run_query)`` keeps
            its previous behaviour.

    Returns:
        list: The matching records.
    """
    # The owner is passed as a query parameter instead of being written into
    # the Cypher text, so the same function serves any person.
    result = tx.run("""
        MATCH (n:File {name: $name})
        RETURN n.name AS name, n.filename AS filename, n.file_id AS file_id
    """, name=name)
    return list(result)

def execute_query(driver):
    """Execute ``run_query`` inside a read transaction and echo each record.

    Args:
        driver: Object whose ``session()`` yields a context-managed session
            offering ``execute_read``.

    Returns:
        The records produced by ``run_query``.
    """
    with driver.session() as session:
        found = session.execute_read(run_query)
        for entry in found:
            # Three lines per record: owner, file name, file ID.
            print(f"Name: {entry['name']}")
            print(f"Filename: {entry['filename']}")
            print(f"File ID: {entry['file_id']}")
        return found

# As the last expression in the cell, the returned record list is also shown as the cell's output.
execute_query(driver)


Name: Amaan
Filename: data2.txt
File ID: 478708217
Name: Amaan
Filename: data15.txt
File ID: 664475849
Name: Amaan
Filename: data1.txt
File ID: 913707013
Name: Amaan
Filename: data12.txt
File ID: 799420906


[<Record name='Amaan' filename='data2.txt' file_id=478708217>,
 <Record name='Amaan' filename='data15.txt' file_id=664475849>,
 <Record name='Amaan' filename='data1.txt' file_id=913707013>,
 <Record name='Amaan' filename='data12.txt' file_id=799420906>]

In [None]:
def run_query(tx, term="artificial intelligence"):
    """Return the files whose content mentions ``term``, with their owners.

    Fixes over the previous version:
      * the records are returned (nothing was returned before, so the caller
        failed with "'NoneType' object is not iterable");
      * the owner is matched as ``Person`` - HAS_FILE relationships are created
        from Person to File, so ``(p:File)-[:HAS_FILE]->`` could never match;
      * the text comparison ignores case (Cypher ``CONTAINS`` is case-sensitive);
      * ``name``, ``filename`` and ``file_id`` columns are returned, which is
        what ``execute_query`` prints.

    Args:
        tx: Neo4j transaction; only ``tx.run`` is used.
        term (str): Text to look for inside ``File.content``.

    Returns:
        list: Records with ``p``, ``r``, ``f`` plus the ``name``, ``filename``
        and ``file_id`` of each matching file.
    """
    result = tx.run("""
        MATCH (p:Person)-[r:HAS_FILE]->(f:File)
        WHERE toLower(f.content) CONTAINS toLower($term)
        RETURN p, r, f, f.name AS name, f.filename AS filename, f.file_id AS file_id
    """, term=term)
    # Materialise the records before the transaction ends.
    return list(result)

def execute_query(driver):
    """Run ``run_query`` in a read transaction, print every record, and return them.

    Args:
        driver: Object whose ``session()`` yields a context-managed session
            offering ``execute_read``.

    Returns:
        The value ``run_query`` returned.
    """
    with driver.session() as session:
        hits = session.execute_read(run_query)
        for hit in hits:
            # Each record must expose 'name', 'filename' and 'file_id'.
            print(f"Name: {hit['name']}")
            print(f"Filename: {hit['filename']}")
            print(f"File ID: {hit['file_id']}")
        return hits

# Run the content search defined in this cell; `result` is rebound to its return value.
result = execute_query(driver)

TypeError: 'NoneType' object is not iterable

In [None]:
def vector_similarity_search(tx, query_vector, limit=20):
    """Rank File nodes by cosine similarity between their stored vector and the query.

    Only nodes that have both ``content`` and ``vector`` are considered. A node
    without a vector would get a null score, and null sorts first in a
    descending ORDER BY, so such nodes would otherwise crowd out real matches.

    Args:
        tx: Neo4j transaction; only ``tx.run`` is used.
        query_vector: One-dimensional sequence of numbers (list, tuple or array).
        limit (int): Maximum number of rows to return. Defaults to 20, the
            value that used to be fixed in the query.

    Returns:
        list: Records with ``props`` (content, file_id, name and filename of
        the node - the embedding itself is deliberately left out) and ``score``.
    """
    # Send a plain list of Python floats whatever sequence type was passed in.
    plain_vector = [float(component) for component in query_vector]
    result = tx.run("""
        MATCH (c:File)
        WHERE c.content IS NOT NULL AND c.vector IS NOT NULL
        WITH c {.content, .file_id, .name, .filename} AS props,
             vector.similarity.cosine(c.vector, $query_vector) AS score
        RETURN props, score
        ORDER BY score DESCENDING
        LIMIT $limit
    """, query_vector=plain_vector, limit=int(limit))
    return list(result)

def perform_similarity_search(driver, query_vector):
    """Run the similarity query in a read transaction, print each hit, and return the hits.

    Args:
        driver: Object whose ``session()`` yields a context-managed session
            offering ``execute_read``.
        query_vector: Passed through unchanged to ``vector_similarity_search``.

    Returns:
        The records produced by ``vector_similarity_search``.
    """
    with driver.session() as session:
        matches = session.execute_read(vector_similarity_search, query_vector)
        for match in matches:
            print(f"Item properties: {match['props']}, Cosine Similarity: {match['score']}")
    # Returned after the session has been closed.
    return matches

# Search with the embedding of `query` computed in an earlier cell.
perform_similarity_search(driver, query_vector)


Item properties: {'content': 'The COVID-19 pandemic accelerated the adoption of remote work, transforming the traditional workplace. Companies and employees have discovered the benefits of flexibility, reduced commute times, and the ability to work from anywhere. However, remote work also presents challenges, such as maintaining productivity, ensuring cybersecurity, and managing work-life balance. Organizations are now exploring hybrid models that combine remote and in-office work. As technology continues to improve, remote work is likely to remain a significant component of the modern work environment, offering both opportunities and challenges for businesses and employees alike.', 'file_id': 478708217, 'name': 'Amaan', 'filename': 'data2.txt'}, Cosine Similarity: None
Item properties: {'content': 'The future of transportation is being shaped by technological advancements and the need for sustainable solutions. Electric vehicles (EVs) are gaining popularity as they reduce emissions an