In [21]:
import spacy
from spacy.tokens import Doc
import en_core_web_sm
import nltk
import re
from spacy.language import Language

nlp = spacy.load("en_core_web_sm")
nlp = en_core_web_sm.load()

@Language.component("senticizer")
def senticizer(doc):
    text = doc.text
    # Fix newlines before hyphens or numbers (for list items like "1." or "-")
    text = re.sub(r'(\n)(-|[0-9])', r"\1\n\2", text)[3:]
    
    # Split on double newlines
    parts = text.split('\n\n')
    
    tokens = []
    for part in parts:
        tokens.extend(nltk.word_tokenize(part))
    
    # Re-create the spaCy Doc from tokens
    return Doc(doc.vocab, words=tokens)

# Add the component to the pipeline
nlp.add_pipe("senticizer", before="tok2vec")



<function __main__.senticizer(doc)>

In [22]:
import voyageai
import os

from dotenv import load_dotenv
load_dotenv()

import os

vo = voyageai.Client(api_key=os.getenv("VOYAGE_API_KEY"))


In [23]:

from turbopuffer import Turbopuffer

TURBOPUFFER_REGION = "aws-us-west-2"
TURBOPUFFER_API_KEY = "tpuf_dQHBpZEvl612XAdP0MvrQY5dbS0omPMy"

TPUF_NAMESPACE_NAME = "aditya_unal"

tpuf = Turbopuffer(
    # Pick the right region https://turbopuffer.com/docs/regions
    region=TURBOPUFFER_REGION,
    # This is the default and can be omitted
    api_key=TURBOPUFFER_API_KEY,
)

ns = tpuf.namespace(TPUF_NAMESPACE_NAME)

# Query nearest neighbors with a vector.



In [24]:
hard_criterias = [
    '''
    1. JD degree from an accredited U.S. law school
    2. 3+ years of experience practicing law
    ''',
    '''
    1. 2-4 years of experience as a Corporate Lawyer at a leading law firm in the USA, Europe, or Canada, or in-house at a major global organization
2. Graduate of a reputed law school in the USA, Europe, or Canada
    ''',
    '''
    1. MD degree from a medical school in the U.S. or India
    ''',
    '''
    1. MD degree from a top U.S. medical school
    2. 2+ years of clinical practice experience in the U.S.
    3. Experience working as a General Practitioner (GP)
    ''',
    '''
    1. Completed undergraduate studies in the U.S., U.K., or Canada
    2. PhD in Biology from a top U.S. university    
    ''',
    '''
    1. PhD (in progress or completed) from a distinguished program in sociology, anthropology, or economics
    2. PhD program started within the last 3 years
    ''',
    ''' 	
    1. Completed undergraduate studies in the U.S., U.K., or Canada
    2. PhD in Mathematics or Statistics from a top U.S. university
    ''',
    '''
    1. MBA from a Prestigious U.S. university (M7 MBA)
    2. 3+ years of experience in quantitative finance, including roles such as risk modeling, algorithmic trading, or financial engineering 
    ''',
    '''
    1. MBA from a U.S. university
    2. 2+ years of prior work experience in investment banking, corporate finance, or M&A advisory
    ''',
    '''
    1. Higher degree in Mechanical Engineering from an accredited university
    2. 3+ years of professional experience in mechanical design, product development, or systems engineering
    '''    
]

soft_criterias = [
    '''
    1. Experience advising clients on tax implications of corporate or financial transactions
    2. Experience handling IRS audits, disputes, or regulatory inquiries
    3. Experience drafting legal opinions or filings related to federal and state tax compliance
    ''',
    '''
    1. Experience supporting Corporate M&A transactions, including due diligence and legal documentation
    2. Experience drafting and negotiating legal contracts or commercial agreements
    3. Familiarity with international business law or advising on regulatory requirements across jurisdictions
    ''',
    '''
    1. Board certification in Radiology (ABR, FRCR, or equivalent) or comparable credential
    2. 3+ years of experience interpreting X-ray, CT, MRI, ultrasound, or nuclear medicine studies
    3. Expertise in radiology reporting, diagnostic protocols, differential diagnosis, or AI applications in medical imaging
    ''',
    '''
    1. Familiarity with EHR systems and managing high patient volumes in outpatient or family medicine settings
    2. Comfort with telemedicine consultations, patient triage, and interdisciplinary coordination
    ''',
    '''
    1. Research experience in molecular biology, genetics, or cell biology, with publications in peer-reviewed journals
    2. Familiarity with experimental design, data analysis, and lab techniques such as CRISPR, PCR, or sequencing
    3. Experience mentoring students, teaching undergraduate biology courses, or collaborating on interdisciplinary research
    ''',
    '''
    1. Demonstrated expertise in ethnographic methods, with substantial fieldwork or case study research involving cultural, social, or economic systems
    2. Strong academic output — published papers, working papers, or conference presentations on anthropological or sociological topics
    3. Experience applying anthropological theory to real-world or interdisciplinary contexts (e.g., migration, labor, technology, development), showing both conceptual depth and practical relevance
    ''',
    '''
    1. Research expertise in pure or applied mathematics, statistics, or probability, with peer-reviewed publications or preprints
    2. Proficiency in mathematical modeling, proof-based reasoning, or algorithmic problem-solving
    ''',
    '''
    1. Experience applying financial modeling techniques to real-world problems like portfolio optimization or derivatives pricing
    2. Proficiency with Python for quantitative analysis and exposure to financial libraries (e.g., QuantLib or equivalent)
    3. Demonstrated ability to work in high-stakes environments such as global investment firms, showing applied knowledge of quantitative methods in production settings
    ''',
    '''
    1. Specialized experience in healthcare-focused investment banking or private equity, including exposure to sub-verticals like biotech, pharma services, or provider networks
    2. Led or contributed to transactions involving healthcare M&A, recapitalizations, or growth equity investments
    3. Familiarity with healthcare-specific metrics, regulatory frameworks, and value creation strategies (e.g., payer-provider integration, RCM optimization)
    ''',
    '''
    1. Experience with CAD tools (e.g., SolidWorks, AutoCAD) and mechanical simulation tools (e.g., ANSYS, COMSOL)
    2. Demonstrated involvement in end-to-end product lifecycle — from concept through prototyping to manufacturing or testing
    3. Domain specialization in areas like thermal systems, fluid dynamics, structural analysis, or mechatronics
    '''
]

criterias = ["tax_lawyer.yml",
             "junior_corporate_lawyer.yml",
             "radiology.yml",
             "doctors_md.yml",
             "biology_expert.yml",
             "anthropology.yml",
             "mathematics_phd.yml",
             "quantitative_finance.yml",
             "bankers.yml",
             "mechanical_engineers.yml"]

soft_criterias_vectors = []
for criteria in soft_criterias:
    soft_criterias_vectors.append(vo.embed(criteria,model="voyage-3").embeddings)

In [25]:
EDU_KEYWORDS = {"undergraduate", "postgraduate", "phd", "bachelor", "master", "mba", "school", "college", "university", "degree", "studied"}
PRESTIGE_KEYWORDS = {"top", "prestigious", "elite", "ivy league", "renowned", "reputed", "world-class", "highly ranked","major","fortune 500", "top tier", "big law", "bulge bracket", "ivy league", "magic circle", "unicorn", "faang",
    "blue chip", "elite", "premier", "tier-1", "top consulting", "top firm","M7"}

# Common academic subjects
SUBJECT_LIST = {
    "biology", "chemistry", "physics", "mathematics", "statistics", "engineering", "mechanical engineering",
    "electrical engineering", "civil engineering", "computer science", "data science", "ai", "artificial intelligence",
    "economics", "finance", "accounting", "business", "management", "marketing", "law", "medicine", "nursing",
    "pharmacy", "psychology", "sociology", "philosophy", "political science", "anthropology", "linguistics",
    "history", "english", "education", "architecture", "design", "graphic design", "fine arts", "journalism",
    "communications", "environmental science", "geology", "agriculture", "astronomy", "robotics"
}

def extract_education_info(sentence: str):
    doc = nlp(sentence)
    sentence_lc = sentence.lower()

    is_education = any(word in sentence_lc for word in EDU_KEYWORDS)

    if not is_education:
        return None
        
    level = None
    for lvl in ["phd", "mba", "postgraduate", "master", "bachelor", "undergraduate"]:
        if lvl in sentence_lc:
            level = lvl
            break

    locations = {ent.text for ent in doc.ents if ent.label_ in {"GPE", "LOC"}}

    subjects = set()
    for subj in SUBJECT_LIST:
        if subj in sentence_lc:
            subjects.add(subj)

    # Fallback subject pattern if static ones weren't found
    if not subjects:
        for token in doc:
            if token.text.lower() == "in":
                next_token = token.nbor()
                if next_token.pos_ in {"NOUN", "PROPN"}:
                    subjects.add(next_token.text)
                    break
            elif token.lemma_ == "study" and token.head == token:
                for child in token.children:
                    if child.pos_ == "NOUN":
                        subjects.add(child.text)
                        break

    is_prestigious = any(word in sentence_lc for word in PRESTIGE_KEYWORDS)


    return {
        "is_education": True,
        "level": level,
        "location": list(locations),
        "is_prestigious": is_prestigious,
        "subject": list(subjects)
    }

In [None]:
WORK_KEYWORDS = ["work", "firm", "practice", "employed", "worked", "company", "organization", 
                     "consulting", "law firm", "investment", "experience", "career", "joined", 
                     "currently", "employer", "employment", "hospital", "clinic", "corporate", "internship", "associate"]
FIELDS = { "biology", "chemistry", "physics", "mathematics", "statistics", "engineering", "mechanical engineering",
    "electrical engineering", "civil engineering", "computer science", "data science", "ai", "artificial intelligence",
    "economics", "finance", "accounting", "business", "management", "marketing", "law", "medicine", "nursing",
    "pharmacy", "psychology", "sociology", "philosophy", "political science", "anthropology", "linguistics",
    "history", "english", "education", "architecture", "design", "graphic design", "fine arts", "journalism",
    "communications", "environmental science", "geology", "agriculture", "astronomy", "robotics","law", "finance", "medicine", "surgery", "data", "software", "accounting",
              "consulting", "investment", "marketing", "design", "architecture", "education",
              "neuroscience", "oncology", "engineering", "cybersecurity"
}
def extract_work_info(sentence: str):
    doc = nlp(sentence)
    sentence_lc = sentence.lower()
    
    is_work_experience = any(kw in sentence_lc for kw in WORK_KEYWORDS)
    if not is_work_experience:
        return None
    locations = {ent.text for ent in doc.ents if ent.label_ in {"GPE", "LOC"}}
    is_prestigious = any(word in sentence_lc for word in PRESTIGE_KEYWORDS)
    fields = set()
    for subj in SUBJECT_LIST:
        if subj in sentence_lc:
            fields.add(subj)

    # Fallback subject pattern if static ones weren't found
    if not fields:
        for token in doc:
            if token.text.lower() == "in":
                next_token = token.nbor()
                if next_token.pos_ in {"NOUN", "PROPN"}:
                    fields.add(next_token.text)
                    break
            elif token.lemma_ == "study" and token.head == token:
                for child in token.children:
                    if child.pos_ == "NOUN":
                        fields.add(child.text)
                        break


In [27]:
vector_results = []
for doc in hard_criterias:
    vector_result = ns.query(
      rank_by=("rerank_summary", "BM25", doc),
      top_k=1000,
      include_attributes=["id","name","rerank_summary","vector"],
    )
    vectors = vector_result.rows
    temp = []
    # print(vectors)
    for vector in vectors:
      temp.append(dict(vector))
    print(len(temp))
    vector_results.append(temp)
    

1000
1000
1000
1000
1000
1000
1000
1000
1000
1000


In [28]:
print(len(vector_results[0]))

1000


In [29]:
def check_education_info(education_info, summary):
    summary_lc = summary.lower()

    level = education_info.get("level")
    if level and level.lower() not in summary_lc:
        return False

    locations = education_info.get("location", [])
    if locations and not any(loc.lower() in summary_lc for loc in locations):
        return False

    subjects = education_info.get("subject", [])
    if subjects and not any(subj.lower() in summary_lc for subj in subjects):
        return False

    if education_info.get("is_prestigious", False):
        PRESTIGE_KEYWORDS = [
            "harvard", "stanford", "oxford", "mit", "cambridge", "yale", "princeton",
            "columbia", "upenn", "berkeley", "caltech", "duke", "ivy league",
            "iit", "iim", "top-ranked", "world-class", "elite university", "tier 1 college"
        ]
        if not any(keyword in summary_lc for keyword in PRESTIGE_KEYWORDS):
            return False

    return True


In [30]:
def filter_edu(sent, profiles):
    education_info = extract_education_info(sent)
    if education_info is None:
        return profiles

    # Use a filtered list instead of popping while iterating
    filtered_profiles = []
    for profile in profiles:
        if check_education_info(education_info, profile["rerank_summary"]):
            filtered_profiles.append(profile)

    return filtered_profiles


In [35]:
def check_work_info(work_info, summary):
    summary_lc = summary.lower()

    # Check location
    locations = work_info.get("locations", [])
    if locations and not any(loc.lower() in summary_lc for loc in locations):
        return False

    # Check field
    fields = work_info.get("fields", [])
    if fields and not any(f.lower() in summary_lc for f in fields):
        return False

    # Check prestige
    if work_info.get("is_prestigious", False):
        PRESTIGIOUS_ORG_KEYWORDS = [
            "mckinsey", "bain", "boston consulting", "goldman sachs", "jp morgan", "blackrock",
            "google", "apple", "amazon", "facebook", "meta", "openai", "microsoft", "palantir",
            "nasa", "world bank", "who", "united nations", "harvard", "oxford", "stanford",
            "pwc", "deloitte", "ey", "kpmg", "accenture", "moelis", "barclays", "credit suisse",
            "linkedin", "tesla", "berkshire hathaway", "twilio", "stripe", "airbnb"
        ]
        PRESTIGE_KEYWORDS = [
            "fortune 500", "top tier", "big law", "bulge bracket", "ivy league", "magic circle", "unicorn", "faang",
            "blue chip", "elite", "premier", "tier-1", "top consulting", "top firm"
        ]
        if not any(p in summary_lc for p in PRESTIGIOUS_ORG_KEYWORDS + PRESTIGE_KEYWORDS):
            return False

    return True


In [36]:
def filter_work(sent, profiles):
    work_info = extract_work_info(sent)
    if work_info is None:
        return profiles

    filtered_profiles = []
    for profile in profiles:
        if check_work_info(work_info, profile["rerank_summary"]):
            filtered_profiles.append(profile)

    return filtered_profiles


In [33]:
print(len(vector_results[0]))

1000


In [None]:
for i, doc in enumerate(hard_criterias):
    doc_nlp = nlp(doc)
    print(i)
    original_profiles = vector_results[i]
    for sent in doc_nlp.sents:
        vector_results[i] = filter_edu(str(sent), vector_results[i])
        vector_results[i] = filter_work(str(sent), vector_results[i])
        if len(vector_results[i]) < 10:
            vector_results[i] = original_profiles


0
After filtering: 713
After filtering: 713
After filtering: 713
1
After filtering: 515
After filtering: 515
2
After filtering: 1000
3
After filtering: 399
After filtering: 157
After filtering: 157
4
After filtering: 30
After filtering: 30
After filtering: 30
5
After filtering: 640
After filtering: 640
6
After filtering: 41
After filtering: 41
After filtering: 41
7
After filtering: 103
After filtering: 103
After filtering: 100
8
After filtering: 249
After filtering: 249
After filtering: 240
9
After filtering: 998
After filtering: 998
After filtering: 998
