In [None]:
import os
import glob
import requests
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue


# Create Text Chunks

In [None]:

def get_text_chunks(text, chunk_size=100):
    """
    Splits a text into chunks of at most ``chunk_size`` whitespace-separated words.

    Args:
        text: The text to split. Words are found with ``str.split()`` and
            re-joined with single spaces, so original whitespace (including
            newlines) is not preserved inside a chunk.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in document order. The last chunk may contain
        fewer than ``chunk_size`` words. Empty or whitespace-only text gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        # A negative step would make range() empty and silently drop the whole
        # text; a zero step already raises ValueError, so use the same type.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]



# Connect to Qdrant (Vector DB) and ensure the collection exists

In [None]:
def initialize_qdrant(collection_name, vector_size=384, host="localhost", port=6333):
    """
    Connects to a Qdrant server and creates the collection if it doesn't exist.

    Args:
        collection_name: Name of the collection to look up or create.
        vector_size: Dimensionality of the vectors stored in a newly created
            collection. It must equal the output size of the embedding model
            used for inserts. Ignored when the collection already exists.
        host: Host name of the Qdrant server.
        port: Port of the Qdrant server.

    Returns:
        The connected ``QdrantClient``.
    """
    client = QdrantClient(host, port=port)  # Connect to Qdrant server

    existing_names = {collection.name for collection in client.get_collections().collections}
    if collection_name not in existing_names:
        client.create_collection(
            collection_name=collection_name,
            # Cosine distance is kept fixed; only the dimensionality is configurable
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )
    return client



# Check for duplicates

In [None]:
def is_duplicate_chunk(client, collection_name, file_name, chunk_number):
    """
    Checks if a chunk with the same file name and chunk number already exists in Qdrant.

    Args:
        client: A connected ``QdrantClient``.
        collection_name: Collection to search.
        file_name: Value matched against the ``file_name`` payload field.
        chunk_number: Value matched against the ``chunk_number`` payload field.

    Returns:
        True if at least one stored point matches both fields, otherwise False.
    """
    # QdrantClient.scroll returns a (points, next_page_offset) tuple, not an
    # object with a .points attribute, so the result has to be unpacked.
    points, _next_page_offset = client.scroll(
        collection_name=collection_name,
        scroll_filter=Filter(
            must=[
                FieldCondition(key="file_name", match=MatchValue(value=file_name)),
                FieldCondition(key="chunk_number", match=MatchValue(value=chunk_number)),
            ]
        ),
        limit=1,  # Only need to check if any match exists
        with_payload=False,  # Existence check only: no need to transfer the chunk text
        with_vectors=False,
    )
    return len(points) > 0



# Get keywords from Ollama

In [None]:
def get_keywords_from_ollama(chunk_text, model="llama3.2:1b"):
    """
    Uses the Ollama Python package to generate keywords for a given text chunk.

    This is best-effort: any failure (package not installed, server not
    reachable, unexpected response) is printed and an empty string is returned,
    so the caller can continue without keywords.

    Args:
        chunk_text: The text to extract keywords from.
        model: Name of the Ollama model to query.

    Returns:
        The model's response text with surrounding whitespace removed, or ""
        if keywords could not be obtained.
    """
    try:
        # Imported here because nothing else in this notebook brings `ollama`
        # into scope. Without it the call below raises NameError, which the
        # handler swallows, so every chunk would silently get empty keywords.
        import ollama

        # Generate a response from Ollama
        response = ollama.generate(
            model=model,
            prompt=f"Extract the main keywords from the following text:\n\n{chunk_text}\n\nKeywords:"
        )
        # Return the keywords (clean up response)
        return response["response"].strip()
    except Exception as e:
        print(f"Failed to get keywords from Ollama: {e}")
        return ""


# Save to Qdrant

In [None]:
def process_files_and_save_to_qdrant(folder_path, collection_name, model_name='all-MiniLM-L6-v2'):
    """
    Chunks every ``*.txt`` file in a folder, embeds each chunk and stores it in Qdrant.

    For each chunk that is not already stored (same file name and chunk number),
    the chunk is embedded, keywords are requested from Ollama, and one point is
    upserted with payload keys ``file_name``, ``chunk_number`` (1-based),
    ``content`` and ``keywords``.

    Point IDs are UUIDv5 strings derived from the file name and chunk number.
    A counter that restarts at 0 on every run, and does not advance for skipped
    duplicates, would reuse IDs of stored points and ``upsert`` would overwrite them.

    Errors are handled per file: an exception while reading or processing one
    file is printed and the loop moves on to the next file.

    Args:
        folder_path: Directory scanned (non-recursively) for ``*.txt`` files.
        collection_name: Qdrant collection to write to; created if missing.
        model_name: SentenceTransformer model used for embeddings. Its output
            size must match the vector size of the collection.

    Returns:
        None.
    """
    import uuid  # Local import: only needed for the deterministic point IDs below

    model = SentenceTransformer(model_name)  # Embedding model
    client = initialize_qdrant(collection_name)

    # Sorted so the processing order (and the log output) is the same on every run
    for file_path in sorted(glob.glob(os.path.join(folder_path, '*.txt'))):
        if not os.path.isfile(file_path):
            continue

        file_name = os.path.basename(file_path)
        try:
            # Read everything first so the file is closed before the slow
            # embedding / keyword / upsert work starts
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            for idx, chunk in enumerate(get_text_chunks(content), start=1):
                # Skip duplicates
                if is_duplicate_chunk(client, collection_name, file_name, idx):
                    print(f"Skipping duplicate: {file_name} - Chunk {idx}")
                    continue

                # Generate embedding
                embedding = model.encode(chunk)

                # Get keywords from Ollama
                keywords = get_keywords_from_ollama(chunk)

                # Metadata
                metadata = {
                    "file_name": file_name,
                    "chunk_number": idx,
                    "content": chunk,
                    "keywords": keywords
                }

                # Same (file, chunk) always maps to the same ID, so re-runs
                # can never collide with a different chunk's point
                point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{file_name}#{idx}"))

                # Insert into Qdrant
                client.upsert(
                    collection_name=collection_name,
                    points=[
                        PointStruct(id=point_id, vector=embedding.tolist(), payload=metadata)
                    ]
                )
                print(f"Inserted: {metadata}")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")


# Driver Code

In [None]:

# Directory scanned for *.txt files and the Qdrant collection that receives the
# chunks. Both are left empty here; set them before running this cell.
folder_path = ""
collection_name = ""

process_files_and_save_to_qdrant(
    folder_path=folder_path,
    collection_name=collection_name,
)

