In [29]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# notebook can read the documents stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [112]:
# Shell listing of the data folder as a quick check of which documents exist.
!ls "drive/MyDrive/Colab Notebooks/OrganisedSearch/data"

'bch(1).pdf'  'bch(5).pdf'  'testing 1 - Copy.txt'  'testing 1.txt'   textingt2.txt


In [113]:
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer

In [114]:
from docx import Document
import csv
import openpyxl
import PyPDF2

In [115]:
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
# Load the sentence-embedding model once; the helper functions defined in
# later cells read this notebook-level `model` global.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
import os

In [116]:
# Folder containing the documents to index. Despite the plural name this is a
# single directory path, written relative to the current working directory.
file_paths="drive/MyDrive/Colab Notebooks/OrganisedSearch/data"


In [117]:
# Raw directory listing (entry names only, no paths) of the data folder.
files=os.listdir(file_paths)

In [118]:
# Show the directory listing to confirm which entries were found.
print("files:",files)

files: ['bch(1).pdf', 'bch(5).pdf', 'testing 1.txt', 'textingt2.txt', 'testing 1 - Copy.txt']


In [71]:
def get_embedding(text):
    """Encode ``text`` with the notebook-level sentence-transformer ``model``.

    Args:
        text: Input passed unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode`` produces for ``text``.
    """
    embedding = model.encode(text)
    return embedding

In [88]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF as one string.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The non-empty page texts joined by newlines, with leading and trailing
        whitespace removed. Returns "" (after printing an error line) when
        ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found - {file_path}")
        return ""

    with open(file_path, "rb") as pdf_handle:
        pdf_reader = PyPDF2.PdfReader(pdf_handle)
        extracted_pages = [page.extract_text() for page in pdf_reader.pages]

    # Pages that yield no text (None or "") contribute nothing to the result.
    non_empty_pages = [page_text for page_text in extracted_pages if page_text]
    return "\n".join(non_empty_pages).strip()

def extract_text_from_txt(file_path):
    """Return the contents of a plain-text file with outer whitespace removed.

    The file is decoded as UTF-8. A leading byte-order mark is dropped, and
    bytes that are not valid UTF-8 are replaced with U+FFFD instead of raising,
    so one badly encoded file cannot abort a whole directory load.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The stripped file contents, or "" (after printing an error line) when
        ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found - {file_path}")
        return ""

    # 'utf-8-sig' reads ordinary UTF-8 too; it only differs by swallowing a
    # BOM, which str.strip() would otherwise leave at the start of the text.
    with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as file:
        return file.read().strip()

def extract_text_from_docx(file_path):
    """Return the paragraph text of a .docx document as one string.

    Args:
        file_path: Path of the Word document to read.

    Returns:
        The text of each entry in the document's ``paragraphs`` joined by
        newlines, with outer whitespace removed. Returns "" (after printing an
        error line) when ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found - {file_path}")
        return ""

    paragraphs = Document(file_path).paragraphs
    joined_text = '\n'.join(paragraph.text for paragraph in paragraphs)
    return joined_text.strip()

def extract_text_from_csv(file_path):
    """Flatten a CSV file into text: cells joined by spaces, rows by newlines.

    The file is decoded as UTF-8. A leading byte-order mark is dropped so it
    does not end up inside the first cell, and undecodable bytes become U+FFFD
    instead of raising.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The flattened text with outer whitespace removed, or "" (after printing
        an error line) when ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found - {file_path}")
        return ""

    # newline='' is what the csv module asks for: it lets the reader see the
    # original line endings, so "\r\n" inside a quoted cell is kept intact
    # rather than being rewritten by universal-newline translation.
    with open(file_path, 'r', encoding='utf-8-sig', errors='replace', newline='') as file:
        reader = csv.reader(file)
        return '\n'.join(' '.join(row) for row in reader).strip()

def extract_text_from_xlsx(file_path):
    """Flatten the active sheet of an .xlsx workbook into text.

    Each row becomes one line made of the ``str()`` of its non-None cells
    separated by single spaces.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        The row lines joined by newlines with outer whitespace removed, or ""
        (after printing an error line) when ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found - {file_path}")
        return ""

    workbook = openpyxl.load_workbook(file_path)
    active_sheet = workbook.active

    row_lines = []
    for cells in active_sheet.iter_rows(values_only=True):
        # Empty cells come back as None and are left out of the line.
        present_values = [str(value) for value in cells if value is not None]
        row_lines.append(' '.join(present_values))
    return '\n'.join(row_lines).strip()

In [91]:
def load_files(directory):
    """Extract text from every supported document directly inside ``directory``.

    Supported extensions are .pdf, .txt, .docx, .csv and .xlsx, matched
    case-insensitively so that e.g. ``REPORT.PDF`` is not skipped. Entries that
    are not regular files (such as sub-directories) are skipped even if their
    name ends in a supported extension. Entries are visited in sorted name
    order because ``os.listdir`` gives no ordering guarantee.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        A ``(file_texts, file_names)`` tuple of two parallel lists: the
        extracted text of each document and its base file name. Documents
        whose extracted text is empty are left out of both lists.
    """
    file_texts = []
    file_names = []

    for entry_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, entry_name)

        # A folder called e.g. "archive.txt" must not reach open().
        if not os.path.isfile(file_path):
            print(f"Skipping non-file entry: {file_path}")
            continue

        lowered_name = entry_name.lower()
        if lowered_name.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        elif lowered_name.endswith(".txt"):
            text = extract_text_from_txt(file_path)
        elif lowered_name.endswith(".docx"):
            text = extract_text_from_docx(file_path)
        elif lowered_name.endswith(".csv"):
            text = extract_text_from_csv(file_path)
        elif lowered_name.endswith(".xlsx"):
            text = extract_text_from_xlsx(file_path)
        else:
            print(f"Skipping unsupported file type: {file_path}")
            continue

        if text:  # Keep only documents that produced some text
            file_texts.append(text)
            file_names.append(entry_name)

    return file_texts, file_names

In [102]:
def compute_all_similarities(file_texts, file_names, threshold=0.8):
    """Find pairs of documents whose embeddings are at least ``threshold`` similar.

    Every text is embedded with the notebook-level ``model`` and each unordered
    pair is compared by cosine similarity.

    Args:
        file_texts: List of document texts.
        file_names: List of names parallel to ``file_texts``.
        threshold: Minimum cosine similarity for a pair to be reported
            (compared against the raw similarity, not the percentage).

    Returns:
        A list of dicts with keys ``"file1"``, ``"file2"`` and
        ``"similarity_score"``; the score is the similarity as a percentage
        rounded to two decimals, as a built-in ``float``. The list is also
        printed. Fewer than two names yields an empty list.
    """
    similar_files = []

    # With fewer than two names there is no pair to report. Returning early
    # also avoids passing an empty batch to cosine_similarity, which rejects it.
    if len(file_names) >= 2:
        file_embeddings = model.encode(file_texts)  # One embedding per document
        similarity_matrix = cosine_similarity(file_embeddings)

        for i in range(len(file_names)):
            for j in range(i + 1, len(file_names)):  # Upper triangle only: each pair once
                similarity_score = similarity_matrix[i][j]
                if similarity_score >= threshold:
                    similar_files.append({
                        "file1": file_names[i],
                        "file2": file_names[j],
                        # float() turns the NumPy scalar into a plain Python
                        # number so the result prints cleanly and is JSON-safe.
                        "similarity_score": round(float(similarity_score) * 100, 2)
                    })
    print("similar_files:",similar_files)

    return similar_files

In [119]:
 # Extract the text of every supported document in the data folder.
 # NOTE(review): this cell is indented by one stray space. It appears to have
 # run in the notebook, but the same line would be an IndentationError in a
 # plain .py file; consider removing the indent.
 file_texts, file_names = load_files(file_paths)

In [120]:
# Flag near-duplicate documents using the function's default threshold (0.8).
similar_files = compute_all_similarities(file_texts, file_names)

similar_files: [{'file1': 'testing 1.txt', 'file2': 'testing 1 - Copy.txt', 'similarity_score': 100.0}]


In [121]:
# Show the pairs again; compute_all_similarities already prints this same list.
print("similar_files:",similar_files)

similar_files: [{'file1': 'testing 1.txt', 'file2': 'testing 1 - Copy.txt', 'similarity_score': 100.0}]


In [122]:
def search_files(query, file_texts, file_names, top_n=5, file_embeddings=None):
    """Rank documents by semantic similarity to ``query``.

    Args:
        query: Free-text search string.
        file_texts: List of document texts.
        file_names: List of names parallel to ``file_texts``.
        top_n: Maximum number of results to return.
        file_embeddings: Optional precomputed embeddings, one row per entry of
            ``file_names`` in the same order. When given, ``file_texts`` is not
            re-encoded, which makes repeated queries over one corpus cheaper.

    Returns:
        Up to ``top_n`` dicts with keys ``"file"`` and ``"score"``, best match
        first. ``score`` is the cosine similarity as a percentage rounded to
        two decimals (a built-in ``float``; it can be negative). An empty
        corpus yields an empty list.
    """
    # Nothing to rank. Returning early also avoids passing an empty batch to
    # cosine_similarity, which rejects it.
    if not file_names:
        return []

    query_embedding = model.encode([query])  # Shape the query as a batch of one
    if file_embeddings is None:
        file_embeddings = model.encode(file_texts)
    similarities = cosine_similarity(query_embedding, file_embeddings)[0]

    ranked_files = sorted(
        zip(file_names, similarities), key=lambda pair: pair[1], reverse=True
    )

    return [
        # float() turns the NumPy scalar into a plain Python number.
        {"file": name, "score": round(float(score) * 100, 2)}
        for name, score in ranked_files[:top_n]
    ]

In [123]:
# Example query: rank the loaded documents against a free-text phrase using
# search_files' default top_n.
search_query = "the appropriate use of capitals"
search_results = search_files(search_query, file_texts, file_names)

In [124]:
# Print the ranked hits in the order search_files returned them (best first).
# The header marker was mojibake: the UTF-8 bytes of the blue-diamond emoji
# decoded with a single-byte code page. It is restored to the actual character.
print("\n🔹 Search results:")
for result in search_results:
    print(f"{result['file']} | Relevance: {result['score']}%")


🔹 Search results:
testing 1.txt | Relevance: 28.01%
testing 1 - Copy.txt | Relevance: 28.01%
bch(1).pdf | Relevance: 21.11%
bch(5).pdf | Relevance: 7.62%
textingt2.txt | Relevance: -0.24%
