In [1]:
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import os
import google.generativeai as genai
import fitz

# Characters that survive text cleaning: space, digits, ASCII letters, newline
# and a fixed set of punctuation. Tabs and non-ASCII characters are not included.
printable = set(
    ' ,`'
    '0123456789'
    '-=~!@#$%^&*()_+'
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    ':;"\'/\n[]\\{}|?><.'
)

# Common English filler words. pdf_to_text discards an extracted line when the
# whole line is exactly one of these (the comparison is case-sensitive).
bloat = set("""
    per and but the for are was were be been with you this his
    from they say her she will one all would there their what out about
    who get which when make can like time just into year your good some
    could them see other than then now look only come its over think also
    back after use two how our work first well way even new want because
    any these give day most
""".split())
# Load environment variables via python-dotenv before the key is read below.
load_dotenv()
# os.getenv returns None when GEMINI_API_KEY is unset; nothing here validates it.
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
genai.configure(api_key=GEMINI_API_KEY)

# Shared sentence-embedding model; phrases_by_relevance reads it as a global.
theta = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

  from tqdm.autonotebook import tqdm, trange




In [2]:
def delimiter(phraset):
    """Split scored phrases into three consecutive tiers of phrase texts.

    Each entry of ``phraset`` is indexed as ``entry[0]`` (the phrase) and
    ``entry[1]`` (its score). The list is presumably the best-first output of
    ``phrases_by_relevance`` -- confirm against the caller.

    Tier boundaries are computed as:

    * ``l0 = 1 + (# of the first 3 entries scoring > 0.62)``
    * ``l1 = l0 + 1 + (# of the 10 entries starting at l0 scoring > 0.60)``
    * ``l2 = l1 + 1 + (# of the 20 entries starting at l1 scoring >= 0.52)``

    Every tier therefore spans at least one slot, even when nothing passes
    its threshold.

    Inputs shorter than the look-ahead windows (including an empty list) are
    handled by examining only the entries that exist, instead of raising
    ``IndexError``.

    Returns:
        ``[tier1, tier2, tier3]`` where each tier is a list of phrases taken
        from ``phraset[:l0]``, ``phraset[l0:l1]`` and ``phraset[l1:l2]``.
    """

    def hits(start, width, accept):
        # Slicing clamps to the sequence bounds, so a short input simply
        # yields a shorter (possibly empty) window.
        return sum(1 for entry in phraset[start:start + width] if accept(entry[1]))

    l0 = 1 + hits(0, 3, lambda score: score > 0.62)
    l1 = l0 + 1 + hits(l0, 10, lambda score: score > 0.60)
    # Note: the last tier uses an inclusive threshold, unlike the first two.
    l2 = l1 + 1 + hits(l1, 20, lambda score: score >= 0.52)

    i1 = [entry[0] for entry in phraset[:l0]]
    i2 = [entry[0] for entry in phraset[l0:l1]]
    i3 = [entry[0] for entry in phraset[l1:l2]]
    return [i1, i2, i3]


def phrases_by_relevance(text, prompts):
    """Rank phrases by cosine similarity to the first prompt.

    Args:
        text: Sequence of phrases to score; encoded with the global ``theta``.
        prompts: Iterable of prompt strings. All of them are encoded, but only
            the similarity row of the first prompt is used for the ranking.

    Returns:
        A list of ``(phrase, score)`` tuples sorted by score, highest first.
        An empty ``text`` yields an empty list without calling the encoder.
    """
    # Nothing to rank: return early rather than encoding an empty batch.
    if len(text) == 0:
        return []

    qembed = theta.encode([*prompts])
    embeddings = theta.encode(text)
    # Row 0 of the similarity matrix holds every phrase scored against prompts[0].
    scores = cos_sim(qembed, embeddings)[0].tolist()
    return sorted(zip(text, scores), key=lambda pair: pair[1], reverse=True)


def cleaned(t):
    """Return ``t`` with every character that is not in ``printable`` removed."""
    kept = filter(printable.__contains__, t)
    return "".join(kept)


def pdf_to_text(pdf, verbose=True):
    """Extract the cleaned, filtered text lines of a PDF.

    Every page's text is passed through ``cleaned`` and split on newlines; the
    lines of all pages are concatenated in page order.

    Args:
        pdf: Path (or whatever ``fitz.open`` accepts) of the PDF to read.
        verbose: When true (the default, matching the previous behaviour),
            print every extracted line before filtering. Pass ``False`` to
            avoid dumping the whole document into the cell output.

    Returns:
        The extracted lines, excluding those that are two characters or
        shorter, whitespace-only, or exactly equal to an entry of ``bloat``.
    """
    with fitz.open(pdf) as doc:
        lines = [
            line
            for page in doc
            for line in cleaned(page.get_text()).split("\n")
        ]

    if verbose:
        print("found text:", lines)

    return [k for k in lines if len(k) > 2 and not k.isspace() and k not in bloat]

In [9]:
import torch
import numpy as np
def pooling(outputs: torch.Tensor, inputs: dict,  strategy: str = 'cls') -> np.ndarray:
    """Reduce per-token model outputs to one vector per sequence.

    Args:
        outputs: Token-level outputs; assumed to be (batch, seq, hidden) --
            TODO confirm against the caller.
        inputs: Mapping that provides ``"attention_mask"``; only read when
            ``strategy`` is ``'mean'``.
        strategy: ``'cls'`` keeps the vector at sequence position 0;
            ``'mean'`` averages token vectors weighted by the attention mask.

    Returns:
        The pooled vectors as a NumPy array (detached and moved to the CPU).

    Raises:
        NotImplementedError: For any other ``strategy``.
    """
    if strategy not in ('cls', 'mean'):
        raise NotImplementedError

    if strategy == 'cls':
        pooled = outputs[:, 0]
    else:
        mask = inputs["attention_mask"]
        # Zero out masked positions, then divide by the mask total of each row.
        masked_total = (outputs * mask[:, :, None]).sum(dim=1)
        token_counts = mask.sum(dim=1, keepdim=True)
        pooled = masked_total / token_counts

    return pooled.detach().cpu().numpy()

In [4]:
def process_pdf(pdf_path='Final_Resumes/Resume_of_ID_0.pdf'):
    """Extract the filtered text lines of one PDF and print them.

    Args:
        pdf_path: PDF to read, forwarded to ``pdf_to_text``. Defaults to the
            sample resume this notebook was written against, so a bare
            ``process_pdf()`` call behaves as before.

    Returns:
        None. The lines are only printed, prefixed with ``"context:"``.
    """
    context = pdf_to_text(pdf_path)

    print("context:", context)


process_pdf()

found text: ['BILINGUAL LANGUAGE ARTS SIXTH GRADE TEACHER', 'Summary', "Dedicated and enthusiastic professional with over four years' experience in education. Proven expertise in establishing rapport and building trust", 'among students, parents, administrators and community members. Possess strong communication skills and ability to partner across departments', 'within and outside of an organization to meet the needs of students. Motivating students School improvement committee Interactive', 'teaching/learning Interdisciplinary teaching Innovative lesson planning Effectively work with parents', 'Professional Experience', '08/2014 to Current', 'Bilingual Language Arts Sixth Grade Teacher Company Name  City , State', 'Developed and implemented interesting and interactive learning mediums to increase student understanding of course materials and build', 'community within the classroom Participate in the development of intervention plans for students within the classroom setting, during M

In [15]:
from typing import Dict

import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sentence_transformers.util import cos_sim

# Queries used for retrieval need this prompt prefix. Find out more in the model's blog post.
def transform_query(query: str) -> str:
    """Return ``query`` with the retrieval instruction prefix prepended."""
    return 'Represent this sentence for searching relevant passages: {}'.format(query)

# CLS pooling (the default) works really well with this model; mean pooling is supported too.
def pooling(outputs: torch.Tensor, inputs: Dict,  strategy: str = 'cls') -> np.ndarray:
    """Collapse token-level outputs into one embedding per input sequence.

    Args:
        outputs: Token-level model outputs (this notebook passes the model's
            ``last_hidden_state``).
        inputs: Tokenizer output; its ``"attention_mask"`` entry is read only
            for the ``'mean'`` strategy.
        strategy: ``'cls'`` selects the vector at position 0 of each sequence;
            ``'mean'`` takes the attention-mask-weighted average of all
            token vectors.

    Returns:
        A NumPy array of pooled embeddings (detached, on the CPU).

    Raises:
        NotImplementedError: If ``strategy`` is neither ``'cls'`` nor ``'mean'``.
    """
    if strategy == 'cls':
        pooled = outputs[:, 0]
    elif strategy == 'mean':
        attention = inputs["attention_mask"]
        # Masked positions contribute zero to the numerator and the denominator.
        numerator = torch.sum(outputs * attention[:, :, None], dim=1)
        denominator = torch.sum(attention, dim=1, keepdim=True)
        pooled = numerator / denominator
    else:
        raise NotImplementedError

    as_numpy = pooled.detach().cpu().numpy()
    return as_numpy

# 1. load model
model_id = 'mixedbread-ai/mxbai-embed-large-v1'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)


# Index 0 is the prompt-prefixed query; the remaining entries are the text
# lines extracted from the resume.
docs = [
    transform_query('01/2001'),
] + pdf_to_text('Final_Resumes/Resume_of_ID_0.pdf')

# 2. encode
inputs = tokenizer(docs, padding=True, return_tensors='pt')
# Inference only: no_grad avoids building an autograd graph for the forward
# pass. The former `inputs[k] = v` loop reassigned each value to itself and
# has been dropped.
with torch.no_grad():
    outputs = model(**inputs).last_hidden_state
embeddings = pooling(outputs, inputs, 'cls')

# 3. compare the query embedding (row 0) against every document line
similarities = cos_sim(embeddings[0], embeddings[1:])
print('similarities:', similarities)

found text: ['BILINGUAL LANGUAGE ARTS SIXTH GRADE TEACHER', 'Summary', "Dedicated and enthusiastic professional with over four years' experience in education. Proven expertise in establishing rapport and building trust", 'among students, parents, administrators and community members. Possess strong communication skills and ability to partner across departments', 'within and outside of an organization to meet the needs of students. Motivating students School improvement committee Interactive', 'teaching/learning Interdisciplinary teaching Innovative lesson planning Effectively work with parents', 'Professional Experience', '08/2014 to Current', 'Bilingual Language Arts Sixth Grade Teacher Company Name  City , State', 'Developed and implemented interesting and interactive learning mediums to increase student understanding of course materials and build', 'community within the classroom Participate in the development of intervention plans for students within the classroom setting, during M