In [2]:
# Import required libraries
import os
from typing import List
from pathlib import Path
import fitz  # PyMuPDF for PDF processing
from pptx import Presentation
from qdrant_client import QdrantClient
from qdrant_client.http import models
import numpy as np
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Create FileProcessor class
class FileProcessor:
    def __init__(self, collection_name: str = "file_collection"):
        # Initialize Qdrant client
        self.client = QdrantClient("localhost", port=6333)
        self.collection_name = collection_name
        
        # Initialize the embedding model
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        
        # Create collection if it doesn't exist
        self.client.recreate_collection(
            collection_name=self.collection_name,
            vectors_config=models.VectorParams(
                size=384,  # Vector size for all-MiniLM-L6-v2
                distance=models.Distance.COSINE
            )
        )

    def process_pdf(self, file_path: str) -> str:
        """Extract text from PDF files"""
        text = ""
        with fitz.open(file_path) as doc:
            for page in doc:
                text += page.get_text()
        return text

    def process_ppt(self, file_path: str) -> str:
        """Extract text from PowerPoint files"""
        text = ""
        prs = Presentation(file_path)
        for slide in prs.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    text += shape.text + "\n"
        return text

    # def process_audio(self, file_path: str) -> str:
    #     """Convert audio to text using speech recognition"""
    #     recognizer = sr.Recognizer()
    #     with sr.AudioFile(file_path) as source:
    #         audio = recognizer.record(source)
    #         try:
    #             text = recognizer.recognize_google(audio)
    #         except sr.UnknownValueError:
    #             text = "[Unrecognized Audio]"
    #         except sr.RequestError:
    #             text = "[Error: Could not request results]"
    #     return text

    # def process_video(self, file_path: str) -> str:
    #     """Extract audio from video and convert to text"""
    #     # Extract audio from video
    #     audio_path = "temp_audio.wav"
    #     video = AudioFileClip(file_path)
    #     video.write_audiofile(audio_path, codec="pcm_s16le")
        
    #     # Convert audio to text
    #     text = self.process_audio(audio_path)
        
    #     # Clean up temporary audio file
    #     os.remove(audio_path)
    #     return text

    def store_file(self, file_path: str, metadata: dict = None) -> bool:
        """Store file content in Qdrant"""
        file_path = Path(file_path)
        
        # Process different file types
        if file_path.suffix.lower() == '.pdf':
            text = self.process_pdf(str(file_path))
        elif file_path.suffix.lower() in ['.ppt', '.pptx']:
            text = self.process_ppt(str(file_path))
        elif file_path.suffix.lower() in ['.mp3', '.wav']:
            text = self.process_audio(str(file_path))
        elif file_path.suffix.lower() in ['.mp4', '.avi']:
            text = self.process_video(str(file_path))
        else:
            raise ValueError(f"Unsupported file type: {file_path.suffix}")

        # Generate embedding
        embedding = self.model.encode(text).tolist()

        # Prepare metadata
        if metadata is None:
            metadata = {}
        metadata.update({
            "filename": file_path.name,
            "file_type": file_path.suffix,
            "file_path": str(file_path)
        })

        # Store in Qdrant
        response = self.client.upsert(
            collection_name=self.collection_name,
            points=[
                models.PointStruct(
                    id=hash(str(file_path)),  # Use hash of file path as ID
                    vector=embedding,
                    payload=metadata
                )
            ]
        )

        return True

    def search_similar_files(self, query: str, limit: int = 5) -> List[dict]:
        """Search for similar files based on text query"""
        # Generate embedding for the query
        query_vector = self.model.encode(query).tolist()
        
        # Search in Qdrant
        search_result = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=limit
        )
        
        return [{"score": hit.score, "metadata": hit.payload} for hit in search_result]


In [8]:
# Initialize processor
processor = FileProcessor()

# Test PDF processing
pdf_text = processor.process_pdf(r"E:\Wappnet internship\ElevateEdOrg\Data\6\cnn.pdf")
print("Extracted PDF Text:", pdf_text[:500])  # Print first 500 chars

# # Test PowerPoint processing
# ppt_text = processor.process_ppt("sample.pptx")
# print("Extracted PPT Text:", ppt_text[:500])

# # Test Audio processing
# audio_text = processor.process_audio("sample.wav")
# print("Extracted Audio Text:", audio_text)

# # Test Video processing
# video_text = processor.process_video("sample.mp4")
# print("Extracted Video Text:", video_text)

# Store file in Qdrant
success = processor.store_file(r"E:\Wappnet internship\ElevateEdOrg\Data\6\cnn.pdf")
print("File Stored Successfully:", success)

# Search for similar files
results = processor.search_similar_files("machine learning concepts")
print("Search Results:", results)


  self.client.recreate_collection(


Extracted PDF Text: Convolutional 
Networks
Lecture slides for Chapter 9 of Deep Learning 
Ian Goodfellow 
2016-09-12
(Goodfellow 2016)
Convolutional Networks
• Scale up neural networks to process very large images / 
video sequences 
• Sparse connections 
• Parameter sharing 
• Automatically generalize across spatial translations of inputs 
• Applicable to any input that is laid out on a grid (1-D, 2-D, 
3-D, …)
(Goodfellow 2016)
Key Idea
• Replace matrix multiplication in neural nets with 
convolution 
• Everythi
File Stored Successfully: True
Search Results: [{'score': 0.24976523, 'metadata': {'filename': 'cnn.pdf', 'file_type': '.pdf', 'file_path': 'E:\\Wappnet internship\\ElevateEdOrg\\Data\\6\\cnn.pdf'}}]


  search_result = self.client.search(
