In [1]:
!pip install chromadb


Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.4-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.28.2-py3

In [2]:
import random
import chromadb
import os

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [145]:
def delete_ids_that_from_file(db_path, collection_name, input_file="deleted_ids.txt"):
    """
    Reads IDs from a file and deletes them from a specified collection.

    Args:
        db_path (str): Path to the Chroma DB database.
        collection_name (str): Name of the collection to modify.
        input_file (str): Path to the file containing IDs to delete.
    """
    # Step 1: Ensure the file exists
    if not os.path.exists(input_file):
        print(f"File {input_file} does not exist.")
        return

    # Step 2: Load the IDs from the file
    with open(input_file, "r") as file:
        ids_to_delete = file.read().splitlines()

    if not ids_to_delete:
        print("No IDs found in the file to delete.")
        return

    print(f"Loaded {len(ids_to_delete)} IDs from {input_file}.")

    # Step 3: Load the Chroma DB client using PersistentClient with the correct path
    client = chromadb.PersistentClient(path=db_path)

    # Step 4: Retrieve the collection
    collection = client.get_collection(collection_name)
    # Step 3: Get all document IDs from the collection by fetching all documents
    all_indices = collection.get(ids=None)  # This retrieves all documents, including their IDs
    document_ids = all_indices['ids']  # Extract document IDs

    # Count the number of documents before deletion
    initial_count = len(document_ids)
    print(f"Initial number of documents: {initial_count}")


    # Step 5: Delete the IDs from the collection
    for doc_id in ids_to_delete:
        collection.delete(ids=[doc_id])

    print(f"Deleted {len(ids_to_delete)} IDs from the collection '{collection_name}'.")

     # Step 7: Count the number of documents after deletion
    all_indices_after_deletion = collection.get(ids=None)  # Fetch remaining documents
    remaining_document_ids = all_indices_after_deletion['ids']
    remaining_count = len(remaining_document_ids)

    print(f"Remaining number of documents: {remaining_count}")


In [173]:
db_path = "/content/drive/MyDrive/All DB/Code/grapg/Python_VDB_Graph_Code_BERT"

In [174]:
client_py = chromadb.PersistentClient(path=db_path)

In [175]:
collections = client_py.list_collections()


In [176]:
collections

[Collection(name=codePY)]

# comparison

In [148]:
 # Step 4: Retrieve the collection

client = chromadb.PersistentClient(path="/content/drive/MyDrive/All DB/Code/grapg/Java_Script_VDB_Graph_Code_Bert")
collection = client.get_collection("CODEJSCRIPT")
# Step 3: Get all document IDs from the collection by fetching all documents
all_indices = collection.get(ids=None)  # This retrieves all documents, including their IDs
document_ids = all_indices['ids']  # Extract document IDs


In [157]:
 # Step 4: Retrieve the collection

client2 = chromadb.PersistentClient(path="/content/drive/MyDrive/All DB/Code/javascript_codet5_code_Vdb")
collection2 = client2.get_collection("javascript_codet5_code_collection")
# Step 3: Get all document IDs from the collection by fetching all documents
all_indices2 = collection2.get(ids=None)  # This retrieves all documents, including their IDs
document_ids2 = all_indices2['ids']  # Extract document IDs


In [167]:
unique_to_document_ids = set(document_ids) - set(document_ids2)
print(len(unique_to_document_ids))
print(unique_to_document_ids)

1
{'0'}


In [165]:
common_ids = set(document_ids) & set(document_ids2)
print(len(common_ids))

1038


# graph code bert

In [134]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Code/grapg/C++_VDB_Graph_Code_Bert",
    collection_name="codeC"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138




Deleted 100 IDs from the collection 'codeC'.
Remaining number of documents: 1039


In [146]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Code/grapg/Java_Script_VDB_Graph_Code_Bert",
    collection_name="CODEJSCRIPT"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
id 511
id 309
id 915
id 388
id 645
id 996
id 213
id 1071
id 126
id 639
id 284
id 410
id 1017
id 1007
id 189
id 737
id 1125
id 609
id 451
id 82
id 889
id 1115
id 10
id 374
id 479
id 566
id 396
id 682




id 712
id 490
id 1138
id 903
id 600
id 227
id 604
id 783
id 218
id 282
id 821
id 659
id 197
id 857
id 711
id 221
id 998
id 426
id 71
id 773
id 941
id 957
id 602
id 411
id 901
id 466
id 1123
id 1044
id 542
id 486
id 129
id 571
id 994
id 977
id 928
id 59
id 569
id 753
id 709
id 1045
id 150
id 806
id 596
id 235
id 813
id 51
id 649
id 53
id 81
id 687
id 918
id 384
id 1113
id 546
id 671
id 968
id 96
id 89
id 204
id 674
id 641
id 73
id 814
id 307
id 612
id 7
id 893
id 171
id 93
id 1022
id 525
id 449
Deleted 100 IDs from the collection 'CODEJSCRIPT'.
Remaining number of documents: 1039


In [172]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Code/grapg/Java_VDB_Graph_Code_Bert",
    collection_name="CODEJAVA"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
id 511
id 309
id 915
id 388
id 645
id 996
id 213
id 1071
id 126
id 639
id 284
id 410
id 1017
id 1007
id 189
id 737
id 1125
id 609
id 451
id 82
id 889
id 1115
id 10
id 374
id 479
id 566
id 396
id



 682
id 712
id 490
id 1138
id 903
id 600
id 227
id 604
id 783
id 218
id 282
id 821
id 659
id 197
id 857
id 711
id 221
id 998
id 426
id 71
id 773
id 941
id 957
id 602
id 411
id 901
id 466
id 1123
id 1044
id 542
id 486
id 129
id 571
id 994
id 977
id 928
id 59
id 569
id 753
id 709
id 1045
id 150
id 806
id 596
id 235
id 813
id 51
id 649
id 53
id 81
id 687
id 918
id 384
id 1113
id 546
id 671
id 968
id 96
id 89
id 204
id 674
id 641
id 73
id 814
id 307
id 612
id 7
id 893
id 171
id 93
id 1022
id 525
id 449
Deleted 100 IDs from the collection 'CODEJAVA'.
Remaining number of documents: 1039


In [177]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Code/grapg/Python_VDB_Graph_Code_BERT",
    collection_name="codePY"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
id 511
id 309
id 915
id 388
id 645
id 996
id 213
id 1071
id 126
id 639
id 284
id 410
id 1017
id 1007
id 189
id 737
id 1125
id 609
id 451
id 82
id 889
id 1115
id 10
id 374
id 479
id 566
id 396
id 682
id 712
id 490




id 1138
id 903
id 600
id 227
id 604
id 783
id 218
id 282
id 821
id 659
id 197
id 857
id 711
id 221
id 998
id 426
id 71
id 773
id 941
id 957
id 602
id 411
id 901
id 466
id 1123
id 1044
id 542
id 486
id 129
id 571
id 994
id 977
id 928
id 59
id 569
id 753
id 709
id 1045
id 150
id 806
id 596
id 235
id 813
id 51
id 649
id 53
id 81
id 687
id 918
id 384
id 1113
id 546
id 671
id 968
id 96
id 89
id 204
id 674
id 641
id 73
id 814
id 307
id 612
id 7
id 893
id 171
id 93
id 1022
id 525
id 449
Deleted 100 IDs from the collection 'codePY'.
Remaining number of documents: 1039


In [181]:
import chromadb

def delete_id0_Graph_Code_Bert(db_path, collection_name):
    # Initialize the PersistentClient and connect to the database
    client = chromadb.PersistentClient(path=db_path)

    # Retrieve the collection
    collection = client.get_collection(collection_name)

    # Delete the document with the specific ID '0'
    collection.delete(ids=['0'])

    # Step 3: Get all document IDs from the collection by fetching all documents
    all_indices = collection.get(ids=None)  # This retrieves all documents, including their IDs
    document_ids = all_indices['ids']  # Extract document IDs
    print("remain",len(document_ids))

    print(f"Deleted document with ID '0' from collection '{collection_name}' in database '{db_path}'")


In [182]:
delete_id0_Graph_Code_Bert(
    db_path="/content/drive/MyDrive/All DB/Code/grapg/C++_VDB_Graph_Code_Bert",
    collection_name="codeC"
)

delete_id0_Graph_Code_Bert(
    db_path="/content/drive/MyDrive/All DB/Code/grapg/Java_Script_VDB_Graph_Code_Bert",
    collection_name="CODEJSCRIPT"
)

delete_id0_Graph_Code_Bert(
    db_path="/content/drive/MyDrive/All DB/Code/grapg/Java_VDB_Graph_Code_Bert",
    collection_name="CODEJAVA"
)

delete_id0_Graph_Code_Bert(
    db_path="/content/drive/MyDrive/All DB/Code/grapg/Python_VDB_Graph_Code_BERT",
    collection_name="codePY"
)




remain 1038
Deleted document with ID '0' from collection 'codeC' in database '/content/drive/MyDrive/All DB/Code/grapg/C++_VDB_Graph_Code_Bert'
remain 1038
Deleted document with ID '0' from collection 'CODEJSCRIPT' in database '/content/drive/MyDrive/All DB/Code/grapg/Java_Script_VDB_Graph_Code_Bert'
remain 1038
Deleted document with ID '0' from collection 'CODEJAVA' in database '/content/drive/MyDrive/All DB/Code/grapg/Java_VDB_Graph_Code_Bert'
remain 1038
Deleted document with ID '0' from collection 'codePY' in database '/content/drive/MyDrive/All DB/Code/grapg/Python_VDB_Graph_Code_BERT'


# code

In [123]:
# delete_ids_that_from_file(
#     db_path="/content/drive/MyDrive/All DB/Code/C++_VDB_Graph_Code_Bert",
#     collection_name="codeC"
# )

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1039


OperationalError: attempt to write a readonly database

In [21]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Code/C_codet5_code_Vdb",
    collection_name="C_codet5_code_collection"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Successfully deleted 1 IDs from the collection 'C_codet5_code_collection'.
Successfully deleted 2 IDs from the collection 'C_codet5_code_collection'.
Successfully deleted 3 IDs from the collection 'C_codet5_code_collection'.
Successfully deleted 4 IDs from the collection 'C_codet5_code_collection'.
Successfully deleted 5 IDs from the collection 'C_codet5_code_collection'.
Successfully deleted 6 IDs from the collection 'C_codet5_code_collection'.
Successfully deleted 7 IDs from the collection 'C_codet5_code_collection'.
Successfully deleted 8 IDs from the collection 'C_codet5_code_collection'.
Successfully deleted 9 IDs from the collection 'C_codet5_code_collection'.
Successfully deleted 10 IDs from the collection 'C_codet5_code_collection'.
Successfully deleted 11 IDs from the collection 'C_codet5_code_collection'.
Successfully deleted 12 IDs from the collection 'C_codet5_code_collection'.
Successfully deleted 13 ID

In [128]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Code/JAVA-qween-VectorDB",
    collection_name="Java-VectorDataBase-Qwen2.5-Coder-7B"
)



Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1038




Deleted 100 IDs from the collection 'Java-VectorDataBase-Qwen2.5-Coder-7B'.
Remaining number of documents: 1038


In [32]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Code/JavaScript-qween-VectorDB",
    collection_name="JavaScript-VectorDataBase-Qwen2.5-Coder-7B"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'JavaScript-VectorDataBase-Qwen2.5-Coder-7B'.
Remaining number of documents: 1038


In [37]:
# delete_ids_that_from_file(
#     db_path="/content/drive/MyDrive/All DB/Code/Java_Script_VDB_Graph_Code_Bert",
#     collection_name="CODEJSCRIPT"
# )

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138




Deleted 100 IDs from the collection 'CODEJSCRIPT'.
Remaining number of documents: 1039


In [42]:
# delete_ids_that_from_file(
#     db_path="/content/drive/MyDrive/All DB/Code/Java_VDB_Graph_Code_Bert",
#     collection_name="CODEJAVA"
# )

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138




Deleted 100 IDs from the collection 'CODEJAVA'.
Remaining number of documents: 1039


In [47]:
# delete_ids_that_from_file(
#     db_path="/content/drive/MyDrive/All DB/Code/Python_VDB_Graph_Code_BERT",
#     collection_name="codePY"
# )

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138




Deleted 100 IDs from the collection 'codePY'.
Remaining number of documents: 1039


In [52]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Code/c++-qwen-VectorDB",
    collection_name="CVectorDataBase-Qwen2.5-Coder-7B"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'CVectorDataBase-Qwen2.5-Coder-7B'.
Remaining number of documents: 1038


In [57]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Code/java_codet5_code_Vdb",
    collection_name="java_codet5_code_collection"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'java_codet5_code_collection'.
Remaining number of documents: 1038


In [62]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Code/javascript_codet5_code_Vdb",
    collection_name="javascript_codet5_code_collection"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'javascript_codet5_code_collection'.
Remaining number of documents: 1038


In [67]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Code/python-qwen-VectorDB",
    collection_name="pythonVectorDataBase-Qwen2.5-Coder-7B"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'pythonVectorDataBase-Qwen2.5-Coder-7B'.
Remaining number of documents: 1038


In [72]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Code/python_codet5_code_Vdb",
    collection_name="python_codet5_code_collection"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'python_codet5_code_collection'.
Remaining number of documents: 1038


# desc

In [77]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Desc/C_codet5_disc_Vdb",
    collection_name="C_codet5_disc_collection"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'C_codet5_disc_collection'.
Remaining number of documents: 1038


In [82]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Desc/Chroma_DB_JavaScript_Desc_falcon",
    collection_name="JavaScript-VectorDB-Desc-Falcon"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'JavaScript-VectorDB-Desc-Falcon'.
Remaining number of documents: 1038


In [87]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Desc/Chroma_DB_Java_Desc_falcon",
    collection_name="Java-VectorDB-Desc-Falcon"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'Java-VectorDB-Desc-Falcon'.
Remaining number of documents: 1038


In [93]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Desc/Chroma_DB_Py_Desc_falcon",
    collection_name="Python-VectorDB-Desc-Falcon"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'Python-VectorDB-Desc-Falcon'.
Remaining number of documents: 1038


In [98]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Desc/chroma_db_c_desc_falcon",
    collection_name="C-VectorDB-Desc-Falcon"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'C-VectorDB-Desc-Falcon'.
Remaining number of documents: 1038


In [103]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Desc/java_codet5_disc_Vdb",
    collection_name="java_codet5_disc_collection"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'java_codet5_disc_collection'.
Remaining number of documents: 1038


In [113]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Desc/javascript_codet5_disc_Vdb",
    collection_name="javascript_codet5_disc_collection"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'javascript_codet5_disc_collection'.
Remaining number of documents: 1038


In [108]:
delete_ids_that_from_file(
    db_path="/content/drive/MyDrive/All DB/Desc/python_codet5_disc_Vdb",
    collection_name="python_codet5_disc_collection"
)

Loaded 100 IDs from deleted_ids.txt.
Initial number of documents: 1138
Deleted 100 IDs from the collection 'python_codet5_disc_collection'.
Remaining number of documents: 1038
