In [1]:
import fitz
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter import ttk
# from pdf2image import convert_from_bytes
# from PIL import Image, ImageTk
from transformers import pipeline
from PIL import Image, ImageTk


# # Function to display PDF file
# def display_pdf(pdf_file):
#     doc = fitz.open(pdf_file)
#     for page in doc:
#         img = convert_from_bytes(page.get_page_bytes(), dpi=100)[0]
#         img = ImageTk.PhotoImage(img)
#         label = ttk.Label(frame_pdf_viewer, image=img)
#         label.image = img
#         label.pack(padx=10, pady=10)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initializing the model
# Each `pipeline(...)` call below builds a question-answering pipeline for the
# named checkpoint and binds it to a module-level name used by later cells.
# oracle = pipeline(model="deepset/roberta-base-squad2")
oracle = pipeline("question-answering", model="deepset/roberta-base-squad2")
# mdeberta = pipeline("question-answering", "timpal0l/mdeberta-v3-base-squad2")
# NOTE(review): `whole_word` is loaded here but is not part of the `models`
# list (`models = [oracle]`), so nothing in this notebook calls it — confirm
# whether it is still needed before paying the load cost.
whole_word = pipeline("question-answering", model="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad")


Some weights of the model checkpoint at google-bert/bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Pipelines that submit_clicked runs, in order; each entry's list index is
# passed to highlight_pdf as `model_index` and ends up in the output file name.
models = [oracle]
# Running number used in output file names; highlight_pdf increments it after
# every save.
file_name_counter = 0

In [9]:
# Function to convert PDF to text
def pdf_to_text(pdf_file):
    """Return the text of every page of ``pdf_file``, concatenated in page order.

    The document is opened with ``fitz.open`` inside a ``with`` block, so it
    is closed again before the function returns.
    """
    with fitz.open(pdf_file) as pdf_document:
        page_texts = [pdf_page.get_text() for pdf_page in pdf_document]
    return "".join(page_texts)

# Function to find and highlight the relevant texts in the pdf file
def highlight_pdf(relevant_texts, pdf_file, model_index):
    """Highlight every occurrence of the given phrases and save an annotated copy.

    Args:
        relevant_texts: Iterable of phrases to search for on each page.
            Phrases shorter than three characters are ignored.
        pdf_file: Path of the PDF to open with ``fitz.open``.
        model_index: Number embedded in the output file name.

    Returns:
        The name of the file that was written,
        ``Model{model_index}_{file_name_counter}.pdf`` (using the counter
        value at the time of the call).

    Side effects:
        Saves the annotated document under that name and then increments the
        global ``file_name_counter``. The counter is left untouched if
        searching or saving raises.
    """
    global file_name_counter
    # Filter once up front instead of once per page; materialising the list
    # also keeps one-shot iterables usable for every page.
    phrases = [text for text in relevant_texts if len(text) >= 3]
    output_file = f'Model{model_index}_{file_name_counter}.pdf'
    # The context manager closes the document even when a search or the save
    # raises, so the file handle is not leaked.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            for phrase in phrases:
                for inst in page.search_for(phrase):
                    highlight = page.add_highlight_annot(inst)
                    highlight.update()
        doc.save(output_file)
    file_name_counter += 1
    return output_file

def display_modified_pdf(pdf_file):
    """Render every page of ``pdf_file`` as an image label inside ``frame_pdf_viewer``.

    Args:
        pdf_file: Path of the PDF to open with ``fitz.open``.

    Side effects:
        Creates one ``ttk.Label`` per page as a child of the module-level
        ``frame_pdf_viewer`` and packs it. Existing children are not removed
        here; the caller clears the frame first if it wants a fresh view.
    """
    zoom = 1.0  # Adjust the zoom level as needed
    mat = fitz.Matrix(zoom, zoom)

    # The context manager closes the document once all pages are rendered
    # (or if rendering fails part-way), so the file handle is not leaked.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            # The bytes are decoded as "RGB" below, so ask explicitly for a
            # pixmap without an alpha channel.
            pix = page.get_pixmap(matrix=mat, alpha=False)
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            photo_image = ImageTk.PhotoImage(image)
            label = ttk.Label(frame_pdf_viewer, image=photo_image)
            # Keep a reference on the widget so the image object stays alive
            # for as long as the label exists.
            label.image = photo_image
            label.pack(padx=10, pady=10)


In [5]:
# Function to run machine learning model
def run_model(text, prompt, model, top_k=20):
    """Ask ``model`` the question ``prompt`` about ``text`` and return its answers.

    Args:
        text: Context handed to the model as ``context``.
        prompt: Question handed to the model as ``question``.
        model: Callable accepting ``question``, ``context`` and ``top_k``
            keyword arguments.
        top_k: Maximum number of candidate answers to request. Defaults to 20.

    Returns:
        A list of answer records. Each record is printed as a side effect.
    """
    all_relevant_texts = model(question=prompt, context=text, top_k=top_k)
    # A question-answering pipeline may hand back a single record as a bare
    # dict rather than a one-element list; wrap it so callers can always
    # iterate over records (iterating a dict would yield its keys).
    if isinstance(all_relevant_texts, dict):
        all_relevant_texts = [all_relevant_texts]
    for line in all_relevant_texts:
        print(line)
    return all_relevant_texts

In [10]:
# Function to handle button click event
def submit_clicked():
    """Handle the Submit button: run every model on the chosen PDF and show the result.

    Reads the PDF path and the prompt from the ``entry_pdf`` and
    ``entry_prompt`` widgets and shows an error dialog if either is empty.
    Otherwise it extracts the PDF text, runs each entry of ``models`` on it,
    passes the answer phrases to ``highlight_pdf``, clears
    ``frame_pdf_viewer`` and displays the annotated file written for the last
    model.

    Side effects:
        Stores the extracted text in the global ``resume_text``.
    """
    global resume_text
    pdf_file = entry_pdf.get()
    prompt = entry_prompt.get()

    if pdf_file == "":
        messagebox.showerror("Error", "Please select a PDF file")
        return

    if prompt == "":
        messagebox.showerror("Error", "Please enter a prompt")
        return

    # Convert PDF to text
    text = pdf_to_text(pdf_file)
    resume_text = text

    # Name of the most recently written annotated PDF, if any.
    modified_pdf_file = None

    # Run models
    for index, model in enumerate(models):
        model_returns = run_model(text, prompt, model)
        # A lone answer may arrive as a bare dict; treat it as a one-item list.
        if isinstance(model_returns, dict):
            model_returns = [model_returns]

        # Answers can span several lines; split them so each line is searched
        # on its own, and drop fragments shorter than three characters.
        relevant_texts = set()
        for answer in model_returns:
            for phrase in answer['answer'].split('\n'):
                if len(phrase) >= 3:
                    relevant_texts.add(phrase)

        relevant_texts = list(relevant_texts)
        print('Passing the relevant texts to highlight: ', relevant_texts)

        # highlight_pdf builds its output name from the counter value at call
        # time and bumps the counter afterwards, so capture the name first.
        output_name = f'Model{index}_{file_name_counter}.pdf'
        highlight_pdf(relevant_texts, pdf_file, index)
        modified_pdf_file = output_name

    # Display the modified PDF in the UI
    for widget in frame_pdf_viewer.winfo_children():
        widget.destroy()
    # Nothing was written when `models` is empty, so there is nothing to show.
    if modified_pdf_file is not None:
        display_modified_pdf(modified_pdf_file)

In [7]:
# Creating the GUI

# Create main window
root = tk.Tk()
root.title("PDF Modifier")

# Create style
style = ttk.Style(root)
style.theme_use("clam")


def browse_pdf():
    """Ask the user for a file and put its path into the PDF entry.

    The entry is cleared first so that browsing a second time replaces the
    previous path instead of appending to it. A cancelled dialog leaves the
    entry untouched.
    """
    selected = filedialog.askopenfilename(
        filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")]
    )
    if selected:
        entry_pdf.delete(0, tk.END)
        entry_pdf.insert(0, selected)


# Create widgets
label_pdf = ttk.Label(root, text="PDF File:")
label_pdf.grid(row=0, column=0, padx=5, pady=5, sticky="w")
entry_pdf = ttk.Entry(root, width=40)
entry_pdf.grid(row=0, column=1, padx=5, pady=5)
button_browse = ttk.Button(root, text="Browse", command=browse_pdf)
button_browse.grid(row=0, column=2, padx=5, pady=5)

label_prompt = ttk.Label(root, text="Prompt:")
label_prompt.grid(row=1, column=0, padx=5, pady=5, sticky="w")
entry_prompt = ttk.Entry(root, width=40)
entry_prompt.grid(row=1, column=1, padx=5, pady=5)

button_submit = ttk.Button(root, text="Submit", command=submit_clicked)
button_submit.grid(row=2, column=1, padx=5, pady=5)

# PDF Viewer Frame
frame_pdf_viewer = tk.Frame(root)
frame_pdf_viewer.grid(row=3, columnspan=3, padx=10, pady=10)

# Run the Tkinter event loop (blocks this cell until the window is closed)
root.mainloop()


{'score': 0.00021333503536880016, 'start': 1439, 'end': 1453, 'answer': '13-hour event\n'}
{'score': 6.471630331361666e-05, 'start': 216, 'end': 244, 'answer': '\nRELEVANT EXPERIENCE\nIntern\n'}
{'score': 5.44532376807183e-05, 'start': 1238, 'end': 1286, 'answer': 'LEADERSHIP EXPERIENCE\nPhilanthropy Events Chair\n'}
{'score': 4.995187191525474e-05, 'start': 1260, 'end': 1286, 'answer': 'Philanthropy Events Chair\n'}
{'score': 4.793393236468546e-05, 'start': 1238, 'end': 1285, 'answer': 'LEADERSHIP EXPERIENCE\nPhilanthropy Events Chair'}
{'score': 4.3971485865768045e-05, 'start': 1260, 'end': 1285, 'answer': 'Philanthropy Events Chair'}
{'score': 3.75119416275993e-05, 'start': 216, 'end': 217, 'answer': '\n'}
{'score': 2.8752032449119724e-05, 'start': 91, 'end': 127, 'answer': 'Bachelor of Arts in Criminal Justice'}
{'score': 2.7860785849043168e-05, 'start': 1439, 'end': 1446, 'answer': '13-hour'}
{'score': 2.5122864826698788e-05, 'start': 1439, 'end': 1452, 'answer': '13-hour event'}


In [36]:
# Ad-hoc query against the text of the last PDF submitted through the GUI.
# `resume_text` only exists after submit_clicked has run at least once.
prompt = 'what are some things related to marketing?'
# run_model requires the model as its third argument; use the first
# configured pipeline, the same one the GUI runs first.
returns = run_model(resume_text, prompt, models[0])
# print(returns)

{'score': 0.0018495976692065597, 'start': 108, 'end': 152, 'answer': 'Bachelor of Arts in Marketing; Spanish minor'}
{'score': 0.0014535044319927692, 'start': 2756, 'end': 2758, 'answer': '\nH'}
{'score': 0.0014326439704746008, 'start': 1544, 'end': 1556, 'answer': 'Olive Garden'}
{'score': 0.0009395668748766184, 'start': 108, 'end': 124, 'answer': 'Bachelor of Arts'}
{'score': 0.0008295539300888777, 'start': 139, 'end': 152, 'answer': 'Spanish minor'}
{'score': 0.0008050044998526573, 'start': 1544, 'end': 1557, 'answer': 'Olive Garden\n'}
{'score': 0.0006648484850302339, 'start': 2692, 'end': 2758, 'answer': 'Southeastern Marketing Association, Member\nAugust 2019 – Present\nH'}
{'score': 0.0006501433090306818, 'start': 108, 'end': 137, 'answer': 'Bachelor of Arts in Marketing'}
{'score': 0.0006339185638353229, 'start': 2692, 'end': 2726, 'answer': 'Southeastern Marketing Association'}
{'score': 0.0005308256368152797, 'start': 206, 'end': 244, 'answer': '\nRELEVANT EXPERIENCE\nMarketi