## Retriever

#### Text pre-processing

Pre-processing improves retrieval accuracy in RAG systems. It's well known that PDFs often have irregular spaces and newline characters.

In [None]:
# Removing extra whitespace and line breaks

import re

def clean_whitespace(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The string with each whitespace run (spaces, tabs, newlines, ...)
        replaced by a single space and no leading/trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

PDFs might also contain characters like bullets, dashes or unwanted punctuation that can be removed.

In [None]:
# Removing unwanted characters while keeping word characters, whitespace and the punctuation . , ! ? ; :

def remove_unwanted_chars(text):
    """Delete every character that is not on the allow-list.

    The allow-list is: word characters (``\\w``), whitespace (``\\s``) and the
    punctuation marks ``. , ! ? ; :``. Everything else (bullets, dashes,
    ``@``, brackets, ...) is removed without leaving a replacement.

    Args:
        text: The string to filter.

    Returns:
        The filtered string.
    """
    return re.sub(r'[^\w\s.,!?;:]', '', text)

In [None]:
# Converting all letters to lowercase so that the whole text has the same format

def to_lowercase(text):
    """Return a lowercased copy of ``text``."""
    lowered = text.lower()
    return lowered

University PDFs often have repeated headers/footers that don’t contribute meaningful information, so they need to be removed.

In [None]:
# Removing useless phrases that don't offer important information

def remove_boilerplate(text):
    """Strip common header/footer words from ``text``.

    Matching is case-insensitive so the function works both on raw text and
    on text that has already been lowercased. Only whole words are removed,
    so longer words that merely contain a phrase (e.g. "homepage", "Pages")
    are left intact.

    Args:
        text: The string to clean.

    Returns:
        The string with every standalone occurrence of the boilerplate words
        removed. Surrounding whitespace is not touched.
    """
    unwanted_phrases = ['Page', 'Copyright', 'Confidential']  # Common phrases
    # One alternation so the text is scanned once; re.escape keeps the list
    # safe to extend with phrases containing regex metacharacters.
    pattern = r'\b(?:' + '|'.join(re.escape(phrase) for phrase in unwanted_phrases) + r')\b'
    return re.sub(pattern, '', text, flags=re.IGNORECASE)

PDFs often contain words broken across lines with hyphens (exami-\nnation -> examination).

In [None]:
def fix_hyphenation(text):
    """Re-join words that were split across a line break with a hyphen.

    A word fragment followed directly by ``-`` and a newline, then another
    word fragment, is merged into one word (hyphen and newline dropped).
    Hyphens that are not immediately followed by a newline are untouched.

    Args:
        text: The string to repair.

    Returns:
        The repaired string.
    """
    broken_word = r'(\w+)-\n(\w+)'
    joined = r'\1\2'
    return re.sub(broken_word, joined, text)

Large text blocks aren’t effective for retrieval, so we could split the text into smaller semantic chunks, but for the sake of generating descriptions for each document we will keep the texts as they are.

#### Combining all the functions into one

In [None]:
def preprocess_text(text):
    """Run the full cleaning pipeline on one document's raw text.

    The order of the steps matters:

    1. ``fix_hyphenation`` needs the original ``-\\n`` sequences, so it runs
       before anything removes newlines or hyphens.
    2. ``remove_boilerplate`` runs before lowercasing so capitalised
       header/footer words are still recognisable.
    3. ``remove_unwanted_chars`` drops punctuation outside the allow-list.
    4. ``to_lowercase`` normalises case.
    5. ``clean_whitespace`` runs last so gaps left by the removals above are
       collapsed as well.

    Args:
        text: Raw text extracted from a document.

    Returns:
        A one-element list holding the cleaned text. The list wrapper is
        kept because the callers in this file ``append``/``extend`` it.
    """
    text = fix_hyphenation(text)
    text = remove_boilerplate(text)
    text = remove_unwanted_chars(text)
    text = to_lowercase(text)
    text = clean_whitespace(text)
    return [text]

Since we have plenty of folders containing pdf and docx documents we need to somehow loop through each and every single one of them to extract information. For this purpose we will create new functions that process pdf and docx documents.

#### Installing important packages

In [None]:
!pip install langchain
!pip install -U langchain-community
!pip install langchain-openai
!pip install transformers
!pip install torch
!pip install docx2txt

#### Importing libraries

In [None]:
import os
import langchain
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Neo4jVector
from langchain.embeddings import OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import WebBaseLoader
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from transformers import AutoTokenizer, AutoModel
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import pandas as pd
import json
import torch

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [None]:
# Combined list that will hold every processed document
docs = []

# Per-folder lists (one for the PDF forms, one for the DOCX forms)
docs1, docs2 = [], []

In [None]:
# For PDF, docx and doc files
import os
import fitz  # pymupdf for better Greek text extraction

def process_pdfs(pdf_folder):
    """Extract and pre-process the text of every ``.pdf`` file in a folder.

    Args:
        pdf_folder: Path of the directory to scan (not recursive).

    Returns:
        A tuple ``(pdf_docs, pdf_files)``. ``pdf_files`` is the list of file
        names found; ``pdf_docs`` has one entry per file, in the same order.
        Each entry is the list returned by ``preprocess_text`` (it is
        appended as-is, so ``pdf_docs`` is a list of lists). A file that
        cannot be read still gets an entry, built from an empty string, so
        both lists stay index-aligned.
    """
    pdf_docs = []
    pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")]

    for pdf_file in pdf_files:
        file_path = os.path.join(pdf_folder, pdf_file)

        try:
            # The context manager closes the document handle even when text
            # extraction raises, so no file handles leak across the loop.
            with fitz.open(file_path) as doc:
                extracted_pdf_text = " ".join(page.get_text("text") for page in doc)  # Extract text
        except Exception as e:
            # Best effort: report the failure and continue with empty text
            print(f"Error processing {pdf_file}: {e}")
            extracted_pdf_text = ""

        # Round-trip through UTF-8, dropping anything that cannot be encoded
        extracted_pdf_text = extracted_pdf_text.encode("utf-8", "ignore").decode("utf-8")

        cleaned_pdf_text = preprocess_text(extracted_pdf_text)
        pdf_docs.append(cleaned_pdf_text)

    return pdf_docs, pdf_files

def process_docx(docx_folder):
    """Extract and pre-process the text of every ``.docx`` file in a folder.

    Args:
        docx_folder: Path of the directory to scan (not recursive).

    Returns:
        A tuple ``(docx_docs, docx_files)``. ``docx_files`` is the list of
        file names found; ``docx_docs`` is a flat list extended with the
        result of ``preprocess_text`` for each file, in the same order.
    """
    # Keep only the directory entries whose name ends in .docx
    docx_files = [name for name in os.listdir(docx_folder) if name.endswith(".docx")]
    docx_docs = []

    for name in docx_files:
        loaded_parts = Docx2txtLoader(os.path.join(docx_folder, name)).load()

        # Merge the loaded parts into one string before cleaning
        merged_text = " ".join(part.page_content for part in loaded_parts)
        docx_docs.extend(preprocess_text(merged_text))

    return docx_docs, docx_files

# Paths where PDF and DOCX files are stored
pdf_folder = '/Users/a35797/Documents/DSC514/Team Project/pdf_forms'
docx_folder = '/Users/a35797/Documents/DSC514/Team Project/docx_forms'

# Processing files separately
pdf_docs, pdf_files = process_pdfs(pdf_folder)
docx_docs, docx_files = process_docx(docx_folder)

# Merging everything into a single `docs` list
docs1 = pdf_docs
docs2 = docx_docs

print(f"Total documents processed: {len(docs1)}")
print(f"Total documents processed: {len(docs2)}")

Total documents processed: 31
Total documents processed: 24


In [None]:
docs1

[['ονομασεπωντμο : ........ σίηινοθέζε: ......... κοπο σαξιδιοτ: ................ αα ηκεξνκελία ώξα αξηζκόο δηαλπθηεξ. ναι ουι 1 αναχώπηζη αθιξη 2 αναχώπηζη αθιξη 3 αναχώπηζη αθιξη 4 αναχώπηζη αθιξη αλλα εξοδα: λεπηνκέξεηεοείδνο πνζό ζε επξώ  ύλνιν  1 αεροπορικό ειζιηήριο οηθνλνκηθήοπξώηεο ζέζεο  .. πιεξσκή από αηηεηή πιεξσκή πξνο σαμηδησηηθό γξαθείν δηαγξάςηε όηη δελ ηζρύεη 2  .. πιεξσκή από αηηεηή πιεξσκή απεπζείαο από παλεπηζηήκην κύπξνπ πξνο δηνξγαλσηέο δηαγξάςηε όηη δελ ηζρύεη 3 μεηαθνξηθά έμνδα: α από και ππορ ζηα αεποδπόμια ζηην κύππο με απόδειξη  .. φωπίρ απόδειξη  .. β από και ππορ αεποδπόμια ζηο εξωηεπικό μόνο με απόδειξη  .. 4 άιια έμνδα   .. ύλνιν απνδεκίσζεο .. υξεκαηνδνηείηαηκαιύπηεηαη κεξηθώο ή νιηθώο από άιιν φνξέα; ναι  ουι πχ. διαμονή, διαηποθή κλπ μείνλ πξνθαηαβνιή . .. όνομα φορέα: ... πνζό πξνο πιεξσκή ποσό: . αριθμός ημερών: . άλλη κάλσψη: .  ..... υπογπαθή αιηηηή ημεπομηνία υπευθυνη δηλωση επιδομα τνσηρηη εξωσερικοτ για υρηη απο οικονομικε τπηρειε εντυπο ουγπ3 δη

In [None]:
docs2

['registration in courses students from gaza to be able to attend postgraduate courses at the university of cyprus, you should hold a degree from a recognized academic institution and meet the admission criteria of the university of cyprus. please complete this form and send it to ms. stella sotiriou sotiriou.stellaucy.ac.cy at the graduate school. to be completed by the student semester: fall spring identity card no: __________________________________ level of studies: master programme name  surname: ___________________________________________ telephone no: _____________________________ department: __________________________________ postal address  code: __________________________________ please select from the list below the courses you would like to attend. course code course title ects  arc 670 applications of geoinformatics in archeology 10 fes 730 méthodologie de la didactique du fle 10 fes 731 tice en didactique du fle 10 fes 734 didactique de la grammaire du fle 10 fes 775 euro

After extracting the text from the pdf and docx files we will generate descriptions for every document we have and add them to a csv file that contains their names and the links used to find them. In order to do that we will create a prompt with the use of OllamaLLM. It may not be the best choice among LLMs since GPT can be more effective, but for the sake of the course we can compromise with what is accessible to us. (Note: the cells below currently use ChatOpenAI with the gpt-4o model.)

In [None]:
# Creating 2 lists, one containing the names of the docs and another containing the extracted texts
# (PDF entries come first in both lists, so names and texts stay index-aligned)

docs = docs1 + docs2
files = pdf_files + docx_files

In [None]:
# Checking if the length of the list with the docs names matches the list with the extracted texts
# (the two lists are paired up by index later on, so the counts must be equal)

print("Number of docs:", len(docs))
print("Number of files:", len(files))

Number of docs: 55
Number of files: 55


In [None]:
# Creating a dictionary that contains the names and the text parts from the pdfs

project_dict = {}

In [None]:
# Map every file name to the extracted text stored at the same index in `docs`
for position, file_name in enumerate(files):
    project_dict[file_name] = docs[position]

In [None]:
project_dict

{'Entypo-Ypobolis-Exodwn-Exoterikou-GP3.pdf': ['ονομασεπωντμο : ........ σίηινοθέζε: ......... κοπο σαξιδιοτ: ................ αα ηκεξνκελία ώξα αξηζκόο δηαλπθηεξ. ναι ουι 1 αναχώπηζη αθιξη 2 αναχώπηζη αθιξη 3 αναχώπηζη αθιξη 4 αναχώπηζη αθιξη αλλα εξοδα: λεπηνκέξεηεοείδνο πνζό ζε επξώ  ύλνιν  1 αεροπορικό ειζιηήριο οηθνλνκηθήοπξώηεο ζέζεο  .. πιεξσκή από αηηεηή πιεξσκή πξνο σαμηδησηηθό γξαθείν δηαγξάςηε όηη δελ ηζρύεη 2  .. πιεξσκή από αηηεηή πιεξσκή απεπζείαο από παλεπηζηήκην κύπξνπ πξνο δηνξγαλσηέο δηαγξάςηε όηη δελ ηζρύεη 3 μεηαθνξηθά έμνδα: α από και ππορ ζηα αεποδπόμια ζηην κύππο με απόδειξη  .. φωπίρ απόδειξη  .. β από και ππορ αεποδπόμια ζηο εξωηεπικό μόνο με απόδειξη  .. 4 άιια έμνδα   .. ύλνιν απνδεκίσζεο .. υξεκαηνδνηείηαηκαιύπηεηαη κεξηθώο ή νιηθώο από άιιν φνξέα; ναι  ουι πχ. διαμονή, διαηποθή κλπ μείνλ πξνθαηαβνιή . .. όνομα φορέα: ... πνζό πξνο πιεξσκή ποσό: . αριθμός ημερών: . άλλη κάλσψη: .  ..... υπογπαθή αιηηηή ημεπομηνία υπευθυνη δηλωση επιδομα τνσηρηη εξωσερικοτ γι

#### Generating Descriptions

In [None]:
# Import ChatOpenAI from the maintained `langchain_openai` package rather than
# the deprecated `langchain.chat_models` path.
from getpass import getpass
from langchain_openai import ChatOpenAI

# Do not hard-code the key, and do not overwrite an existing one with an empty
# string: reuse OPENAI_API_KEY when it is already set, otherwise prompt for it
# without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from langchain_core.prompts import ChatPromptTemplate
import langchain
from langchain_core.output_parsers import StrOutputParser

# Chat model used to generate the document descriptions (temperature 0, model "gpt-4o")
llm = ChatOpenAI(temperature=0, model="gpt-4o")

  llm = ChatOpenAI(temperature=0, model="gpt-4o")


In [None]:
# Prompt template for description generation. `{document}` is its only
# placeholder and receives the document text when the chain is invoked.
desc = """ I will give you a document that will be used for similarity search in a RAG system.
Generate a description for this document so the system retrieves it more effectively, be sure to mention its purpose, go heavy on the keywords that will help the system to retrieve this specific document. Return the descriptions in English.
Don't give me any narration or verbose, just output the description and the keywords and avoid using any 'description:' or 'document description:'
File: {document}"""

In [None]:
# Turn the raw template string into a chat prompt object that can be piped into the model
prompt = ChatPromptTemplate.from_template(desc)

In [None]:
# Pipeline: fill the prompt -> call the LLM -> pass the model output through the string output parser
chain = prompt | llm | StrOutputParser()

In [None]:
def doc_description(doc):
    """Generate a retrieval-oriented description for one document.

    Args:
        doc: The extracted document text; it is substituted for the
            ``{document}`` placeholder of the prompt.

    Returns:
        The result of ``chain.invoke`` for that input.
    """
    payload = {"document": doc}
    return chain.invoke(payload)

In [None]:
# Testing how the prompt works

doc_description(docs1[0])

'This document is a financial reimbursement form used for travel expenses related to attending a conference. It includes fields for personal information, travel itinerary, and various expense categories such as airfare, transportation, and other costs. The form requires details on payment methods, proof of expenses, and any additional funding sources. It is intended for individuals seeking reimbursement from an organization or institution for costs incurred during professional travel. \n\nKeywords: financial reimbursement, travel expenses, conference attendance, airfare, transportation costs, expense categories, payment methods, proof of expenses, funding sources, professional travel, reimbursement form, travel itinerary, personal information, organization, institution.'

Since we have a lot of documents we need to loop through all of them in order to generate descriptions for every document available in the folders.

In [None]:
# NOTE(review): this passes a slice of five DOCX texts as ONE prompt input, so the
# model returns a single combined description instead of one per document.
# Iterate over the slice to check per-document output.
doc_description(docs2[:5])

'This document is a comprehensive guide and application form for students and researchers from Gaza seeking to attend postgraduate courses at the University of Cyprus. It includes detailed instructions for registration, admission criteria, and course selection, as well as information on obtaining entry and residence permits in Cyprus. The document also covers the legal framework for aliens and immigration, including the conditions of entry and residence for third-country nationals for purposes such as research, studies, and training. Additionally, it provides guidelines on the processing of personal data in compliance with EU regulations.\n\nKeywords: University of Cyprus, postgraduate courses, registration, admission criteria, Gaza students, entry permit, residence permit, aliens and immigration, third-country nationals, research, studies, training, personal data processing, EU regulations, course selection, application form, civil registry, migration department, Cyprus.'

In [None]:
# Saving the file-name -> extracted-text dictionary as a CSV so the extraction
# step does not have to be repeated every time we run the code

import csv

csv_file = "dsc514_project_dictionary.csv"

# Two columns: filename, content (header row first, then one row per dictionary entry)
with open(csv_file, "w", newline="", encoding="utf-8") as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerow(["filename", "content"])
    csv_out.writerows([name, body] for name, body in project_dict.items())

In [None]:
from IPython.display import display, FileLink

display(FileLink(csv_file))

In [None]:
descriptions = []

# One record per document: its file name, its extracted text and the generated description.
# NOTE(review): documents whose extraction produced no text are still sent to the model.
for position, text in enumerate(docs):
    record = {'File': files[position], 'Text': text}
    record['Description'] = doc_description(text)
    descriptions.append(record)

In [None]:
# One row per document, with the columns File, Text and Description
df_desc = pd.DataFrame.from_dict(descriptions)

# index=False keeps the positional index out of the file; otherwise it comes
# back as a spurious "Unnamed: 0" column when the CSV is read again.
df_desc.to_csv('descriptions.csv', index=False)

In [None]:
df_desc

Unnamed: 0,File,Text,Description
0,Entypo-Ypobolis-Exodwn-Exoterikou-GP3.pdf,[ονομασεπωντμο : ........ σίηινοθέζε: ...........,This document is a financial reimbursement for...
1,Recommendation Letter Form for Admission to th...,[letter of recommendation to the applicant ple...,"This document is a ""Letter of Recommendation F..."
2,Entypo-Apozimiwsis-Filoksenoumenwn-GP4.pdf,[ενστπο οτγπ4 προ: οικονομικέρ τπηπεζίερ σο: f...,This document is a financial services form rel...
3,Payment Order for Extra Pay Through the Payrol...,[1. payment to: 2. 3. position: 4. 5.a it doe...,This document is a payment order form used by ...
4,Declaration Claim Form for I.R.59.pdf,[declaration claim form to: university of cypr...,Declaration Claim Form for Income Tax Allowanc...
5,Entypo-Bebaiwsis-Paroxis-Ypiresiwn-GP5.pdf,[εντυπο ουγπ5 .. ονοματεπωνυμο: ... αρ. ταυτοτ...,This document is a form used for the declarati...
6,MA-DissertationForm-2.pdf,[μαστερ πολιτικης επιστημης διπλωματικη εργασι...,Master's thesis proposal for Political Science...
7,Timesheet_EN.pdf,[ναμε month id no s.i. no year position depart...,This document is a timesheet template used for...
8,Special-Course-Approval-Form.pdf,[],It seems there is no content in the file you p...
9,Entypo-Pliromis-Timologiou-me-Fwtoantigrafo-GP...,[ενστπο οτγπ6 προ: οικονομικε τπηρειε απο:. σι...,This document is a financial services report r...


In [None]:
from IPython.display import FileLink
FileLink('descriptions.csv')

#### PDFs csv

In [None]:
import pandas as pd

# Read the CSV for PDFs
df_pdfs = pd.read_csv('/Users/a35797/Documents/DSC514/Team Project/pdfs.csv')

In [None]:
df_pdfs

Unnamed: 0,Names,Links
0,Timesheet_EN.pdf,https://drive.google.com/file/d/143GDgWLQ2vcol...
1,Submission Form of Supporting Documents for th...,https://drive.google.com/file/d/1Cly8FoLDhvxu1...
2,Recommendation Letter Form for Admission to th...,https://drive.google.com/file/d/1Nk5xDd69GZ_zR...
3,Recommendation Letter Form for Admission to th...,https://drive.google.com/file/d/1RUtNi75kvHtTh...
4,recommendationLetterEN_Politechnikis.pdf,https://drive.google.com/file/d/1CfgUTpvQCbv0i...
5,Recommendation_Form_-_English.pdf,https://drive.google.com/file/d/14ux76MVPTAHeW...
6,Phd-degree-award-sumbission-form-eng.pdf,https://drive.google.com/file/d/1AkYmfgmYjaHXb...
7,MA-DissertationForm-2.pdf,https://drive.google.com/file/d/1dzpCpulPoJhlt...
8,HostingAgreementUCY-ResearcherCitizen3rdCountr...,https://drive.google.com/file/d/1I4v_9w8jefMT7...
9,Entipo-Aitisis-Isdoxis-se-Defterevon-Ptixio-MI...,https://drive.google.com/file/d/1PyQaAdA8rUpVR...


In [None]:
import pandas as pd

# Load the generated descriptions (the columns used below are 'File' and 'Description')
df_desc = pd.read_csv('descriptions.csv')

# Attach each description to its PDF entry by matching 'Names' in df_pdfs
# against 'File' in df_desc. The inner join keeps only names present in both
# tables; 'File' duplicates 'Names' afterwards, so it is dropped.
description_columns = df_desc[['File', 'Description']]
df_merged = df_pdfs.merge(
    description_columns, left_on='Names', right_on='File', how='inner'
).drop(columns=['File'])

# Save the merged DataFrame
df_merged.to_csv('merged_documents.csv', index=False)

df_merged

Unnamed: 0,Names,Links,Description
0,Timesheet_EN.pdf,https://drive.google.com/file/d/143GDgWLQ2vcol...,This document is a timesheet template used for...
1,Submission Form of Supporting Documents for th...,https://drive.google.com/file/d/1Cly8FoLDhvxu1...,This document is a submission form for support...
2,Recommendation Letter Form for Admission to th...,https://drive.google.com/file/d/1Nk5xDd69GZ_zR...,"This document is a ""Letter of Recommendation F..."
3,Recommendation Letter Form for Admission to th...,https://drive.google.com/file/d/1RUtNi75kvHtTh...,Graduate Programs in Psychology Recommendation...
4,recommendationLetterEN_Politechnikis.pdf,https://drive.google.com/file/d/1CfgUTpvQCbv0i...,"This document is a ""Letter of Recommendation F..."
5,Recommendation_Form_-_English.pdf,https://drive.google.com/file/d/14ux76MVPTAHeW...,Graduate Programs in Psychology Recommendation...
6,Phd-degree-award-sumbission-form-eng.pdf,https://drive.google.com/file/d/1AkYmfgmYjaHXb...,This document outlines the submission process ...
7,MA-DissertationForm-2.pdf,https://drive.google.com/file/d/1dzpCpulPoJhlt...,Master's thesis proposal for Political Science...
8,HostingAgreementUCY-ResearcherCitizen3rdCountr...,https://drive.google.com/file/d/1I4v_9w8jefMT7...,"This document outlines the ""Hosting Agreement""..."
9,Entipo-Aitisis-Isdoxis-se-Defterevon-Ptixio-MI...,https://drive.google.com/file/d/1PyQaAdA8rUpVR...,Application for Admission to Minor Degree Prog...


In [None]:
from IPython.display import FileLink
FileLink('merged_documents.csv')

#### DOCX csv

In [None]:
import pandas as pd

# Read the CSV for DOCX files
df_docs = pd.read_csv('/Users/a35797/Documents/DSC514/Team Project/doc.csv')

In [None]:
df_docs

Unnamed: 0,Names,Links
0,DECLARATION_for_ASYLUM-1.docx,https://drive.google.com/file/d/1YuPyqbMwbyqud...
1,ΕΚΠΤΩΣΗ_ΔΙΔΑΚΤΡΩΝ_english.docx,https://drive.google.com/file/d/1GQhqR_kS5iw2l...
2,registrationformTESI.docx,https://drive.google.com/file/d/1ORVxcWnFG4Jvn...
3,C_Δήλωση-ενδιαφέροντος-ΠΑ_2022-4.docx,https://drive.google.com/file/d/1nyKI4SVR0HbXw...
4,Entypo-Allagis-Bathmologias-english.docx,https://drive.google.com/file/d/1RcxuRd3zupqf2...
5,(Application for the grant of the Entry Permit...,https://drive.google.com/file/d/1tCIdZxFjrvtyc...
6,Έντυπο-Συστατικής-Επιστολής-ΣΜΣ-Πανεπιστήμιο-Κ...,https://drive.google.com/file/d/1afErlGaZ-R2FP...
7,entypo-gia-eggrafi-se-peran-ton-45-english.docx,https://drive.google.com/file/d/1bkV40UcefBl9E...
8,CourseDescription_EN.docx,https://drive.google.com/file/d/1x_VFKwgRHgghn...
9,EXTERNAL_ATTENDANT_AUDIT_STUDENTS_FORM.docx,https://drive.google.com/file/d/1V5-6u3RR-Cy1i...


In [None]:
import pandas as pd

# Load the generated descriptions (the columns used below are 'File' and 'Description')
df_desc2 = pd.read_csv('descriptions.csv')

# Attach each description to its DOCX entry by matching 'Names' in df_docs
# against 'File' in df_desc2. The freshly loaded df_desc2 is used here, so the
# cell does not depend on a `df_desc` left over from an earlier cell.
# The inner join keeps only names present in both tables.
df_merged2 = pd.merge(df_docs, df_desc2[['File', 'Description']], left_on='Names', right_on='File', how='inner')

# 'File' duplicates 'Names' after the join, so drop it
df_merged2 = df_merged2.drop(columns=['File'])

# Save the merged DataFrame
df_merged2.to_csv('merged_documents2.csv', index=False)

df_merged2

Unnamed: 0,Names,Links,Description
0,DECLARATION_for_ASYLUM-1.docx,https://drive.google.com/file/d/1YuPyqbMwbyqud...,Declaration form for entry into the Republic o...
1,ΕΚΠΤΩΣΗ_ΔΙΔΑΚΤΡΩΝ_english.docx,https://drive.google.com/file/d/1GQhqR_kS5iw2l...,Scholarship Announcement for Postgraduate Stud...
2,registrationformTESI.docx,https://drive.google.com/file/d/1ORVxcWnFG4Jvn...,Document for students from Gaza seeking regist...
3,C_Δήλωση-ενδιαφέροντος-ΠΑ_2022-4.docx,https://drive.google.com/file/d/1nyKI4SVR0HbXw...,This document is an application form for parti...
4,Entypo-Allagis-Bathmologias-english.docx,https://drive.google.com/file/d/1RcxuRd3zupqf2...,Graduate school grade change form for students...
5,(Application for the grant of the Entry Permit...,https://drive.google.com/file/d/1tCIdZxFjrvtyc...,This document is a comprehensive guide and app...
6,Έντυπο-Συστατικής-Επιστολής-ΣΜΣ-Πανεπιστήμιο-Κ...,https://drive.google.com/file/d/1afErlGaZ-R2FP...,This document is a recommendation letter form ...
7,entypo-gia-eggrafi-se-peran-ton-45-english.docx,https://drive.google.com/file/d/1bkV40UcefBl9E...,Special Permission for Enrolment in More than ...
8,CourseDescription_EN.docx,https://drive.google.com/file/d/1x_VFKwgRHgghn...,This document outlines the comprehensive detai...
9,EXTERNAL_ATTENDANT_AUDIT_STUDENTS_FORM.docx,https://drive.google.com/file/d/1V5-6u3RR-Cy1i...,External Attendant Audit Students Form for Pos...


In [None]:
from IPython.display import FileLink
FileLink('merged_documents2.csv')