In [98]:
!pip install sentence-transformers



In [99]:
!pip install psycopg2-binary



In [100]:
import os
import numpy as np
import psycopg2
import uuid
import random
from sentence_transformers import SentenceTransformer
from pgvector.psycopg2 import register_vector
from psycopg2.extras import execute_values


In [101]:
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [102]:
folder_path = 'C:/Users/Amaan/OneDrive/Desktop/pgvector/text_files'
file_data_dict = {}

for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as file:
            content = file.read()
            unique_id = int(uuid.uuid4())  # Generate a unique ID
            unique_id = unique_id % 1000000000
            file_data_dict[unique_id] = {
                'filename': filename,
                'content': content
            }


In [103]:
file_ids = list(file_data_dict.keys())
random.shuffle(file_ids)

In [104]:
def assign_files_to_names(file_data_dict, file_ids, names_list):
    """
    Assigns files to names in a round-robin fashion.

    Args:
        file_data_dict (dict): Dictionary where keys are file IDs and values are file data.
        file_ids (list): List of file IDs.
        names_list (list): List of names to assign files to.

    Returns:
        dict: A dictionary where keys are names and values are lists of file data with IDs.
    """
    # Initialize the result dictionary with names as keys and empty lists as values
    names_dict = {name: [] for name in names_list}

    # Assign files to names
    for i, file_id in enumerate(file_ids):
        name = names_list[i % len(names_list)]  # Use modulo to cycle through names
        file_data = file_data_dict[file_id]
        # Format the file data with ID first, then filename, and content last
        formatted_file_data = {
            'id': file_id,
            'filename': file_data['filename'],
            'content': file_data['content']
        }
        names_dict[name].append(formatted_file_data)

    return names_dict


In [105]:
names_list=["Amaan", "Huzan", "Umair", "Waleed"]
names_dict = assign_files_to_names(file_data_dict, file_ids, names_list)
print(names_dict)

{'Amaan': [{'id': 51088553, 'filename': 'data12.txt', 'content': "In today's digital age, cybersecurity is crucial for protecting sensitive information and maintaining privacy. Cyber threats, such as hacking, phishing, and malware, pose significant risks to individuals and organizations. Effective cybersecurity measures include using strong passwords, encrypting data, and implementing multi-factor authentication. Awareness and education about cyber risks are essential for preventing attacks. Governments and businesses must invest in robust cybersecurity infrastructure to safeguard against breaches. As technology advances, so do the tactics of cybercriminals. Continuous vigilance and proactive strategies are necessary to protect digital assets and ensure the security of information systems."}, {'id': 523481290, 'filename': 'data4.txt', 'content': 'Social media has profoundly impacted society, changing how people communicate, access information, and perceive the world. Platforms like Fac

In [106]:
def extract_metadata(names_dict):
    """Extract metadata from the names_dict."""
    metadata = {}
    for name, paragraphs in names_dict.items():
        metadata[name] = {
            'name': name,
            'file_id': [para['id'] for para in paragraphs], # Access the 'ID' value for each paragraph in the list
            'filename': [para['filename'] for para in paragraphs], # Access the 'text' value for each paragraph in the list
            'content': [para['content'] for para in paragraphs], # Access the 'text' value for each paragraph in the list# Access the 'text' value for each paragraph in the list           'content': [para['content'] for para in paragraphs], # Access the 'text' value for each paragraph in the list
        }
    return metadata

In [107]:
metadata = extract_metadata(names_dict)
for key, value in metadata.items():
    print(f"{key}: {value}\n")

Amaan: {'name': 'Amaan', 'file_id': [51088553, 523481290, 289299187, 369124287], 'filename': ['data12.txt', 'data4.txt', 'data2.txt', 'data.txt'], 'content': ["In today's digital age, cybersecurity is crucial for protecting sensitive information and maintaining privacy. Cyber threats, such as hacking, phishing, and malware, pose significant risks to individuals and organizations. Effective cybersecurity measures include using strong passwords, encrypting data, and implementing multi-factor authentication. Awareness and education about cyber risks are essential for preventing attacks. Governments and businesses must invest in robust cybersecurity infrastructure to safeguard against breaches. As technology advances, so do the tactics of cybercriminals. Continuous vigilance and proactive strategies are necessary to protect digital assets and ensure the security of information systems.", 'Social media has profoundly impacted society, changing how people communicate, access information, and

In [128]:
# Provide your Neon connection string
connection_string = "postgresql://postgresdb_owner:uLa1XRwpC0Qh@ep-misty-art-a2qi2l6c.eu-central-1.aws.neon.tech/postgresdb?sslmode=require"

# Connect using the connection string
connection = psycopg2.connect(connection_string)

# Create a new cursor object
cursor = connection.cursor()

In [129]:
# Execute this query to test the database connection
cursor.execute("SELECT 1;")
result = cursor.fetchone()

# Check the query result
if result == (1,):
    print("Your database connection was successful!")
else:
    print("Your connection failed.")

Your database connection was successful!


In [130]:
# Execute this query to install the pgvector extension
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")

In [132]:
cursor.execute('DROP TABLE documents')

In [133]:
cursor.execute("""
CREATE TABLE IF NOT EXISTS documents (
    id SERIAL PRIMARY KEY,
    name TEXT,
    file_id INTEGER[],
    filename TEXT[],
    content TEXT[],
    embedding vector(384)
);
""")

# Prepare data for insertion
data_to_insert = []
for person, data in metadata.items():
    # Combine all content into a single string for embedding
    combined_content = " ".join(data['content'])
    
    # Generate embedding
    embedding = model.encode(combined_content)
    
    # Prepare data tuple
    data_tuple = (
        data['name'],
        data['file_id'],
        data['filename'],
        data['content'],
        embedding.tolist()
    )
    data_to_insert.append(data_tuple)

# Insert data into the table
execute_values(cursor, """
    INSERT INTO documents (name, file_id, filename, content, embedding)
    VALUES %s
""", data_to_insert)

# Commit the transaction and close the connection
connection.commit()

print("Data inserted successfully!")

Data inserted successfully!


In [134]:
def get_all_documents():
    cursor.execute("SELECT id, name, file_id, filename FROM documents;")
    return cursor.fetchall()

# Test the functions
print("All documents:")
print(get_all_documents())

All documents:
[(1, 'Amaan', [51088553, 523481290, 289299187, 369124287], ['data12.txt', 'data4.txt', 'data2.txt', 'data.txt']), (2, 'Huzan', [418711620, 611911868, 875983723, 597257227], ['data15.txt', 'data14.txt', 'data3.txt', 'data5.txt']), (3, 'Umair', [623380281, 825528165, 985949571, 391852447], ['data1.txt', 'data10.txt', 'data11.txt', 'data8.txt']), (4, 'Waleed', [104236840, 744920665, 271366396, 78483970], ['data13.txt', 'data7.txt', 'data9.txt', 'data6.txt'])]


In [135]:
def get_document_by_name(name):
    cursor.execute("SELECT id, name, file_id, filename, content FROM documents WHERE name = %s;", (name,))
    return cursor.fetchone()

print("\nDocument for Amaan:")
print(get_document_by_name("Amaan"))



Document for Amaan:
(1, 'Amaan', [51088553, 523481290, 289299187, 369124287], ['data12.txt', 'data4.txt', 'data2.txt', 'data.txt'], ["In today's digital age, cybersecurity is crucial for protecting sensitive information and maintaining privacy. Cyber threats, such as hacking, phishing, and malware, pose significant risks to individuals and organizations. Effective cybersecurity measures include using strong passwords, encrypting data, and implementing multi-factor authentication. Awareness and education about cyber risks are essential for preventing attacks. Governments and businesses must invest in robust cybersecurity infrastructure to safeguard against breaches. As technology advances, so do the tactics of cybercriminals. Continuous vigilance and proactive strategies are necessary to protect digital assets and ensure the security of information systems.", 'Social media has profoundly impacted society, changing how people communicate, access information, and perceive the world. Plat

In [141]:
def similarity_search_cosine(query, num_results=3):
    # Generate embedding for the query
    query_embedding = model.encode(query)
    
    # Perform the similarity search using cosine similarity
    cursor.execute("""
    SELECT id, name, file_id, filename,
        (embedding <=> %s::vector) AS cosine_similarity
    FROM documents
    ORDER BY cosine_similarity DESC
    LIMIT %s;
    """, (query_embedding.tolist(), num_results))
    
    return cursor.fetchall()

print("\nSimilarity search using cosine similarity:")
print(similarity_search_cosine("artificial intelligence"))



Similarity search using cosine similarity:
[(3, 'Umair', [623380281, 825528165, 985949571, 391852447], ['data1.txt', 'data10.txt', 'data11.txt', 'data8.txt'], 0.96514157240462), (4, 'Waleed', [104236840, 744920665, 271366396, 78483970], ['data13.txt', 'data7.txt', 'data9.txt', 'data6.txt'], 0.8709616356963267), (2, 'Huzan', [418711620, 611911868, 875983723, 597257227], ['data15.txt', 'data14.txt', 'data3.txt', 'data5.txt'], 0.7804007018464986)]


In [144]:
def similarity_search_euclidean(query, num_results=3):
    query_embedding=model.encode(query)
    cursor.execute("""
    SELECT id, name, file_id, filename,
           (embedding <-> %s::vector) AS euclidean_distance
    FROM documents
    ORDER BY euclidean_distance ASC
    LIMIT %s;
    """, (query_embedding.tolist(), num_results))
    
    return cursor.fetchall()

print("\nSimilarity search using euclidean distance:")
print(similarity_search_euclidean("cyber security"))


Similarity search using euclidean distance:
[(1, 'Amaan', [51088553, 523481290, 289299187, 369124287], ['data12.txt', 'data4.txt', 'data2.txt', 'data.txt'], 4.470328525699823), (3, 'Umair', [623380281, 825528165, 985949571, 391852447], ['data1.txt', 'data10.txt', 'data11.txt', 'data8.txt'], 6.684463524644643), (2, 'Huzan', [418711620, 611911868, 875983723, 597257227], ['data15.txt', 'data14.txt', 'data3.txt', 'data5.txt'], 6.785968316851653)]


In [145]:
def similarity_search_innerProd(query, num_results=3):
    query_embedding=model.encode(query)
    cursor.execute("""
    SELECT id, name, file_id, filename,
           (embedding <#> %s::vector) AS inner_product
    FROM documents
    ORDER BY inner_product DESC
    LIMIT %s;
    """, (query_embedding.tolist(), num_results))
    
    return cursor.fetchall()

print("\nSimilarity search using inner product:")
print(similarity_search_innerProd("social media"))


Similarity search using inner product:
[(2, 'Huzan', [418711620, 611911868, 875983723, 597257227], ['data15.txt', 'data14.txt', 'data3.txt', 'data5.txt'], -1.952012300491333), (3, 'Umair', [623380281, 825528165, 985949571, 391852447], ['data1.txt', 'data10.txt', 'data11.txt', 'data8.txt'], -2.395846366882324), (4, 'Waleed', [104236840, 744920665, 271366396, 78483970], ['data13.txt', 'data7.txt', 'data9.txt', 'data6.txt'], -4.670923709869385)]


In [147]:
def similarity_search_manhattan(query, num_results=3):
    query_embedding=model.encode(query)
    cursor.execute("""
    SELECT id, name, file_id, filename,
           (embedding <+> %s::vector) AS manhattan_distance
    FROM documents
    ORDER BY manhattan_distance ASC
    LIMIT %s;
    """, (query_embedding.tolist(), num_results))
    
    return cursor.fetchall()

print("\nSimilarity search using manhattan distance:")
print(similarity_search_manhattan("art"))


Similarity search using manhattan distance:
[(4, 'Waleed', [104236840, 744920665, 271366396, 78483970], ['data13.txt', 'data7.txt', 'data9.txt', 'data6.txt'], 61.8812141418457), (1, 'Amaan', [51088553, 523481290, 289299187, 369124287], ['data12.txt', 'data4.txt', 'data2.txt', 'data.txt'], 94.71932983398438), (3, 'Umair', [623380281, 825528165, 985949571, 391852447], ['data1.txt', 'data10.txt', 'data11.txt', 'data8.txt'], 98.23468780517578)]
