In [3]:
from qdrant_client import QdrantClient
from collections import defaultdict
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Connect to Qdrant Cloud
client = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY")
)

# Get list of all collections
collections = client.get_collections().collections

for collection in collections:
    collection_name = collection.name
    print(f"\nProcessing collection: {collection_name}")
    
    # Scroll through all points in collection
    points = client.scroll(
        collection_name=collection_name,
        with_payload=True,
        with_vectors=False
    )[0]
    
    # Track points by their filename
    seen_points = defaultdict(list)
    
    # Group points by filename
    for point in points:
        if 'filename' in point.payload:  # Check if filename exists in payload
            filename = point.payload['filename']
            seen_points[filename].append(point.id)
            if len(seen_points[filename]) > 1:  # Print when we find a duplicate
                print(f"Found duplicate for filename: {filename}")
    
    # Find and delete duplicates
    points_to_delete = []
    for filename_group in seen_points.values():
        if len(filename_group) > 1:
            # Keep the first occurrence, delete the rest
            points_to_delete.extend(filename_group[1:])
    
    # Delete duplicate points
    if points_to_delete:
        client.delete(
            collection_name=collection_name,
            points_selector=points_to_delete
        )
        print(f"Deleted {len(points_to_delete)} duplicates from {collection_name}")
        print(f"These had identical filenames.")
    else:
        print(f"No duplicates found in {collection_name}")


Processing collection: personality
No duplicates found in personality

Processing collection: storyteller
No duplicates found in storyteller


In [6]:
from qdrant_client import QdrantClient
import os
from dotenv import load_dotenv
from collections import defaultdict

# Load environment variables
load_dotenv()

# Connect to Qdrant Cloud
client = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY")
)

# Get list of all collections
collections = client.get_collections().collections

for collection in collections:
    collection_name = collection.name
    print(f"\nAnalyzing collection: {collection_name}")
    
    # Scroll through all points in collection
    points = client.scroll(
        collection_name=collection_name,
        with_payload=True,
        with_vectors=False
    )[0]
    
    # Track unique filenames
    unique_filenames = set()
    total_points = 0
    
    # Count points and unique filenames
    for point in points:
        total_points += 1
        if 'filename' in point.payload:
            unique_filenames.add(point.payload['filename'])
    
    print(f"Total points in collection: {total_points}")
    print(f"Unique filenames: {len(unique_filenames)}")
    print(f"Potential duplicates: {total_points - len(unique_filenames)}")


Analyzing collection: personality
Total points in collection: 10
Unique filenames: 10
Potential duplicates: 0

Analyzing collection: storyteller
Total points in collection: 10
Unique filenames: 10
Potential duplicates: 0


In [7]:
from qdrant_client import QdrantClient
import os
from dotenv import load_dotenv
from collections import defaultdict

# Load environment variables
load_dotenv()

# Connect to Qdrant Cloud
client = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY")
)

# Get list of all collections
collections = client.get_collections().collections

for collection in collections:
    collection_name = collection.name
    print(f"\nProcessing collection: {collection_name}")
    
    # Track points by their filename
    seen_points = defaultdict(list)
    
    # Scroll through all points with pagination
    offset = None
    while True:
        batch, offset = client.scroll(
            collection_name=collection_name,
            with_payload=True,
            with_vectors=False,
            offset=offset
        )
        
        # Group points by filename in this batch
        for point in batch:
            if 'filename' in point.payload:
                filename = point.payload['filename']
                seen_points[filename].append(point.id)
                if len(seen_points[filename]) > 1:
                    print(f"Found duplicate for filename: {filename}")
        
        # If no more offset, we've reached the end
        if offset is None:
            break
    
    # Find and delete duplicates
    points_to_delete = []
    for filename_group in seen_points.values():
        if len(filename_group) > 1:
            # Keep the first occurrence, delete the rest
            points_to_delete.extend(filename_group[1:])
    
    # Delete duplicate points
    if points_to_delete:
        print(f"Found {len(points_to_delete)} duplicates to delete")
        client.delete(
            collection_name=collection_name,
            points_selector=points_to_delete
        )
        print(f"Successfully deleted {len(points_to_delete)} duplicates from {collection_name}")
    else:
        print(f"No duplicates found in {collection_name}")


Processing collection: personality
Found duplicate for filename: resume_1103.txt
Found duplicate for filename: resume_1197.txt
Found duplicate for filename: resume_1323.txt
Found duplicate for filename: resume_1353.txt
Found duplicate for filename: resume_105.txt
Found duplicate for filename: resume_1086.txt
Found duplicate for filename: resume_1344.txt
Found duplicate for filename: resume_1163.txt
Found duplicate for filename: resume_1071.txt
Found duplicate for filename: resume_1007.txt
Found duplicate for filename: resume_1393.txt
Found duplicate for filename: resume_1116.txt
Found duplicate for filename: resume_1216.txt
Found duplicate for filename: resume_1280.txt
Found duplicate for filename: resume_1324.txt
Found duplicate for filename: resume_1282.txt
Found duplicate for filename: resume_1235.txt
Found duplicate for filename: resume_1074.txt
Found duplicate for filename: resume_1285.txt
Found duplicate for filename: resume_1015.txt
Found duplicate for filename: resume_1178.txt