<a href="https://colab.research.google.com/github/EldarDadon/embeddings-project/blob/master/Eldar_Dadon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.14


In [None]:
!pip install python-docx


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [None]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [None]:
import fitz
import docx
import re
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

def read_file(file_path):
    """Extract the text of a PDF or DOCX document.

    The reader is chosen from the file extension. The comparison is
    case-insensitive, so ``report.PDF`` is handled like ``report.pdf``.

    Args:
        file_path: Path to the document. It is converted with ``str()`` only
            for the extension check and is passed to the reader unchanged.

    Returns:
        The value returned by ``read_pdf`` or ``read_docx`` for the file.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    # Normalise case so upper- or mixed-case extensions are recognised.
    normalized = str(file_path).lower()
    if normalized.endswith('.pdf'):
        return read_pdf(file_path)
    if normalized.endswith('.docx'):
        return read_docx(file_path)
    raise ValueError("Unsupported file format. Please use PDF or DOCX.")

def read_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` result, with no
        separator inserted between pages.
    """
    # The context manager closes the document once all pages have been read.
    with fitz.open(file_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

def read_docx(file_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        file_path: Path of the document, passed to ``docx.Document``.

    Returns:
        The ``text`` of every entry in ``doc.paragraphs``, joined with
        newline characters.
    """
    document = docx.Document(file_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)

def split_text_fixed_size(text, chunk_size=200, overlap=50):
    """Split text into fixed-size character windows that overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    The final window may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_size``. A negative value is accepted and
            leaves gaps between chunks.

    Returns:
        A list of chunks in order of appearance; empty if ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is
            not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    # A step of zero or less would never advance through the text.
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size.")

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

def split_text_sentences(text):
    """Split text into sentences with NLTK's ``sent_tokenize``.

    ``sent_tokenize`` needs the Punkt tokenizer data, which is not bundled
    with NLTK. If the data is missing, it is downloaded once and the
    tokenization is retried.

    Args:
        text: The string to split.

    Returns:
        The list of sentences produced by ``sent_tokenize``.

    Raises:
        LookupError: If the tokenizer data is still unavailable after the
            download attempt (for example, with no network access).
    """
    try:
        return sent_tokenize(text)
    except LookupError:
        # Imported locally so the file-level imports stay unchanged.
        import nltk

        # Newer NLTK releases look for "punkt_tab", older ones for "punkt".
        nltk.download("punkt", quiet=True)
        nltk.download("punkt_tab", quiet=True)
        return sent_tokenize(text)

def split_text_paragraphs(text):
    """Split text into paragraphs separated by blank lines.

    A paragraph break is a newline, optional whitespace, and another
    newline. Each paragraph is stripped of surrounding whitespace, and
    paragraphs that are empty after stripping are dropped.

    Args:
        text: The string to split.

    Returns:
        A list of non-empty, stripped paragraphs in their original order.
    """
    paragraphs = []
    for piece in re.split(r'\n\s*\n', text):
        cleaned = piece.strip()
        if cleaned:
            paragraphs.append(cleaned)
    return paragraphs

def create_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
    """Encode text chunks with a SentenceTransformer model.

    Args:
        chunks: The text chunks to encode, passed to ``model.encode``.
        model_name: Name of the model handed to ``SentenceTransformer``.

    Returns:
        The result of ``encode`` for ``chunks``.
    """
    # A new model instance is constructed on every call.
    encoder = SentenceTransformer(model_name)
    return encoder.encode(chunks)

def save_embeddings(embeddings, ids, index_path="vector_index.faiss"):
    """Build a flat L2 FAISS index from the embeddings and write it to disk.

    Args:
        embeddings: Embedding vectors; converted to a float32 array whose
            second dimension is used as the index dimensionality.
        ids: Accepted for interface compatibility but not used. Vectors are
            added without explicit ids.
        index_path: Destination file for the serialized index.

    Returns:
        None. The index is written to ``index_path`` and a confirmation
        message is printed.
    """
    vectors = np.array(embeddings, dtype='float32')
    dimension = vectors.shape[1]

    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(vectors)

    faiss.write_index(flat_index, index_path)
    print(f"Saved index to {index_path}")

def load_embeddings(index_path="vector_index.faiss"):
    """Load a FAISS index from disk.

    Args:
        index_path: File to read the index from.

    Returns:
        The index object returned by ``faiss.read_index``.
    """
    loaded_index = faiss.read_index(index_path)
    return loaded_index

def search_similar(embedding, index, top_k=5):
    """Query an index for the entries nearest to one embedding.

    Args:
        embedding: A single query vector.
        index: Object exposing ``search(queries, k)``, such as the index
            returned by ``load_embeddings``.
        top_k: Number of results to request.

    Returns:
        A ``(distances, indices)`` tuple as produced by ``index.search``.
    """
    # The vector is wrapped in a list so the query is a batch of one row.
    query = np.array([embedding], dtype='float32')
    found_distances, found_indices = index.search(query, top_k)
    return found_distances, found_indices

def process_file(
    file_path,
    split_method="fixed",
    chunk_size=200,
    overlap=50,
    model_name="all-MiniLM-L6-v2",
    index_path="vector_index.faiss",
):
    """Read a document, split it into chunks, embed them, and save an index.

    Args:
        file_path: Document to process, passed to ``read_file``.
        split_method: ``"fixed"``, ``"sentences"`` or ``"paragraphs"``.
        chunk_size: Chunk length, used only by the ``"fixed"`` method.
        overlap: Chunk overlap, used only by the ``"fixed"`` method.
        model_name: Embedding model name, passed to ``create_embeddings``.
        index_path: Destination of the index, passed to ``save_embeddings``.

    Returns:
        None. The index is written to ``index_path`` and progress messages
        are printed.

    Raises:
        ValueError: If ``split_method`` is not one of the supported names.
            The file is read before this check, so errors from
            ``read_file`` surface first.
    """
    text = read_file(file_path)

    if split_method not in ("fixed", "sentences", "paragraphs"):
        raise ValueError("Invalid split method. Choose 'fixed', 'sentences', or 'paragraphs'.")

    if split_method == "fixed":
        chunks = split_text_fixed_size(text, chunk_size, overlap)
    elif split_method == "sentences":
        chunks = split_text_sentences(text)
    else:
        chunks = split_text_paragraphs(text)

    embeddings = create_embeddings(chunks, model_name)
    # Positional ids: one per embedding, in chunk order.
    ids = list(range(len(embeddings)))
    save_embeddings(embeddings, ids, index_path)
    print("Processing complete.")

# Enter the name of the file to process here;
# update this path to point at your own file.
file_path = "/content/CVELDARDADON.pdf"
process_file(file_path, split_method="fixed", chunk_size=200, overlap=50)

Saved index to vector_index.faiss
Processing complete.
