In [1]:
import time
from fuzzywuzzy import fuzz
from typing import TypedDict
from typing import List
from fuzzywuzzy import fuzz
import pandas as pd


class ProcessedWikidata(TypedDict):
    """Typed dictionary describing one processed Wikidata record.

    Nothing else in the visible part of this notebook references this type.
    NOTE(review): the field meanings below are inferred from the field names
    only -- confirm against the code that produces these records.
    """
    # Primary name of the entity.
    name: str
    # Alternative names for the entity.
    nicknames: List[str]
    # presumably a single "instance of" value -- TODO confirm
    instance: str
    # presumably the Wikidata identifier (the CSV's wd_id values look like 'Q12432') -- verify
    wd_id: str
    # presumably every "instance of" value for the entity -- TODO confirm
    instances: List[str]

In [2]:
# Hand-written sample of school names, grouped by kind; a later cell feeds
# entries from this list to resolve_entity() as test queries.
# NOTE: `cleaner_schools` and `schools` are bound to the SAME list object, so
# an in-place change made through one name is visible through the other.
cleaner_schools = schools = [
    # Universities
    "Harvard University",
    "Stanford University",
    "University of Oxford",
    "Massachusetts Institute of Technology (MIT)",
    "University of Cambridge",
    "University of California, Berkeley",
    "Yale University",
    "University of Chicago",
    "Princeton University",
    "Columbia University",

    # Colleges
    "Macalester College",
    "Amherst College",
    "Williams College",
    "Swarthmore College",
    "Pomona College",
    "Grinnell College",
    "Wellesley College",
    "Middlebury College",
    "Oberlin College",
    "Bowdoin College",

    # Business Schools
    "Harvard Business School",
    "Stanford Graduate School of Business",
    "Wharton School (University of Pennsylvania)",
    "MIT Sloan School of Management",
    "Kellogg School of Management (Northwestern University)",
    "Columbia Business School",
    "Booth School of Business (University of Chicago)",
    "Tuck School of Business (Dartmouth College)",
    "INSEAD Business School",
    "London Business School",

    # Law Schools
    "Harvard Law School",
    "Yale Law School",
    "Stanford Law School",
    "Columbia Law School",
    "University of Oxford Law School",
    "New York University School of Law",
    "University of Chicago Law School",
    "University of California, Berkeley Law",
    "University of Cambridge Faculty of Law",
    "Georgetown University Law Center",

    # High Schools
    "Phillips Academy Andover",
    "Harvard-Westlake School",
    "The Lawrenceville School",
    "The Hotchkiss School",
    "The Brearley School",
    "The Spence School",
    "Trinity School (New York City)",
    "The Dalton School",
    "St. Paul's School",
    "The Chapin School",

    # Colleges within Universities
    "College of William & Mary (part of William & Mary)",
    "College of Arts & Sciences (University of Washington)",
    "College of Engineering (University of Michigan)",
    "College of Fine Arts (University of Texas)",
    "College of Charleston (University of Charleston)",
    "College of Education (University of Pennsylvania)",
    "College of Liberal Arts (Texas A&M University)",
    "College of Public Health (University of Iowa)",
    "College of Medicine (University of Florida)",
    "College of Business (University of Nebraska–Lincoln)"
]

In [16]:
# Path of the CSV that the next cell loads into `df` with pd.read_csv.
# NOTE(review): the numeric suffix looks like an epoch timestamp written by an
# export step -- confirm, and point this at the intended snapshot.
csv_file_path = 'processed_schools_min_cols_1732548646.9518888.csv'

In [17]:
import ast
from collections import defaultdict


def _parse_nicknames(raw):
    """Turn one raw 'nickname' CSV cell into a list of normalised nicknames.

    The cell is expected to hold the repr of a Python list of strings. It is
    parsed with ast.literal_eval so that items quoted with double quotes
    (e.g. "St. Paul's") are split correctly; if the cell is not a valid
    literal, the simple split on "', '" is used as a fallback.

    Returns:
        list[str]: stripped, lower-cased, non-empty nicknames ([] for NaN).
    """
    if pd.isna(raw):
        return []
    text = str(raw)
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        pieces = [item for item in parsed if isinstance(item, str)]
    else:
        # Fallback: drop the surrounding brackets and split on the quote/comma
        # separator a list repr uses between single-quoted items.
        pieces = [
            piece.strip().strip("'")
            for piece in text[1:-1].split("', '")
            if piece.strip()
        ]
    return [piece.strip().lower() for piece in pieces if piece.strip()]


def _unique_ids(index):
    """Return a plain dict copy of `index` with duplicate ids removed.

    dict.fromkeys keeps first-seen order, so the id lists are identical from
    run to run (a set-based dedupe would not guarantee that).
    """
    return {key: list(dict.fromkeys(ids)) for key, ids in index.items()}


# Read the CSV into a dataframe; every column is kept as a string.
df = pd.read_csv(csv_file_path, dtype=str)

# label (lower-cased name or nickname) -> list of wd_ids carrying that label
name_to_ids = defaultdict(list)
# Same mapping, restricted to rows whose 'instance of' text contains the keyword.
law_name_to_ids = defaultdict(list)
business_name_to_ids = defaultdict(list)
med_name_to_ids = defaultdict(list)
eng_name_to_ids = defaultdict(list)

# categories[i] is the keyword that routes a row into category_dicts[i].
categories = ['law', 'business', 'medical', 'engineer']
category_dicts = [law_name_to_ids, business_name_to_ids, med_name_to_ids, eng_name_to_ids]

for _, row in df.iterrows():
    wd_id = row['wd_id']
    # Normalize name and nicknames for consistency
    name = row['name'].strip().lower()
    nicknames = _parse_nicknames(row['nickname'])
    labels = [name, *nicknames]

    # Add wd_id under the main name and every nickname
    for label in labels:
        name_to_ids[label].append(wd_id)

    instance_of = row['instance of']
    if pd.isna(instance_of):
        # A missing cell is a float NaN; a substring test on it would raise TypeError.
        continue
    for category, category_index in zip(categories, category_dicts):
        if category in instance_of:
            for label in labels:
                category_index[label].append(wd_id)

# Ensure unique wd_ids for each name/nickname
string_id_map = _unique_ids(name_to_ids)
# Explicit assignments keep the names previously created through globals().
string_id_map_law = _unique_ids(law_name_to_ids)
string_id_map_business = _unique_ids(business_name_to_ids)
string_id_map_medical = _unique_ids(med_name_to_ids)
string_id_map_engineer = _unique_ids(eng_name_to_ids)


In [18]:
# Dump the whole label -> wd_id-list mapping built in the previous cell.
# This prints every entry, so the output is long.
print(string_id_map)

{'michigan technological university': ['Q12432'], 'university of massachusetts amherst': ['Q15142'], 'university of massachusetts boston': ['Q15144'], 'university of saint joseph': ['Q24399'], 'kansas state university': ['Q31249'], 'charter oak state college': ['Q32287'], 'northeastern university': ['Q37548'], 'allen university': ['Q49041'], 'university of maryland eastern shore': ['Q49089'], 'massachusetts institute of technology': ['Q49108'], 'boston university': ['Q49110'], 'boston college': ['Q49118'], 'brandeis university': ['Q49119'], 'tufts university': ['Q49120'], 'wesleyan university': ['Q49167'], 'smith college': ['Q49204'], 'wellesley college': ['Q49205'], 'rensselaer polytechnic institute': ['Q49211'], 'texas a&m university': ['Q49212'], 'virginia tech': ['Q65379'], 'new york university tandon school of engineering': ['Q75478'], 'st. bonaventure university': ['Q93662'], 'university of alaska southeast': ['Q94279'], 'university of louisiana at lafayette': ['Q116485'], "unive

In [19]:
from rapidfuzz import process, fuzz

def fetch_csv_properties(entity_str):
    """
    Fuzzy-match an entity string against the name/nickname indexes.

    The index to search is picked from keywords in the entity string: the
    general `string_id_map` by default, or one of the category indexes when
    the string mentions medicine, law, engineering or business. The checks run
    in that order, so when several keywords are present the last one wins.

    Parameters:
        entity_str (str): The entity string (e.g., 'Harvard University').

    Returns:
        list[tuple[str, float, list[str]]]: (matched_key, score, wd_ids) for
        each retained match. If the best match scores 100, only the matches
        scoring 100 are returned; otherwise every match scoring at least 86.
        Empty when nothing qualifies. The wd_ids always come from
        `string_id_map`, even when a category index was searched.
    """
    folded = entity_str.casefold()

    str_to_search = string_id_map
    # NOTE: the bare "med" test also covers " medical", and it matches any
    # string containing those three letters (e.g. "media").
    if "med" in folded:
        str_to_search = med_name_to_ids
    if " law" in folded:
        str_to_search = law_name_to_ids
    if " engineering" in folded:
        str_to_search = eng_name_to_ids
    if " business" in folded:
        str_to_search = business_name_to_ids

    matches = process.extract(entity_str, str_to_search.keys(), scorer=fuzz.WRatio, limit=50)
    if not matches:
        # An empty index yields no matches; indexing matches[0] would raise.
        return []

    # Perfect hits crowd out everything else; otherwise keep the close matches.
    min_score = 100 if matches[0][1] == 100 else 86
    return [
        (matched_key, score, string_id_map[matched_key])
        for matched_key, score, _ in matches
        if score >= min_score
    ]

In [20]:
def retrieve_school_object(match_id: str):
    """Return the first row of `df` whose 'wd_id' equals `match_id`, as a dict.

    When no row matches, a message is printed and None is returned.
    """
    rows = df.loc[df['wd_id'] == match_id]

    if rows.empty:
        print(f"No match found for wd_id: {match_id}")
        return None

    first_row = rows.iloc[0]
    return first_row.to_dict()

In [21]:
from rapidfuzz import fuzz
import re

def custom_scoring_function(query, candidate, *, score_cutoff=None):
    """Score how well `candidate` matches `query`.

    Both strings are lower-cased, stripped, and have parenthesised parts
    removed. The score is a weighted blend of:
      * 0.3 x fuzz.token_sort_ratio of the two strings,
      * 0.3 x percentage of query words present in the candidate,
      * 0.4 x the same percentage computed after dropping generic words,
    minus 2 points per word of length difference, plus 20 points when the
    cleaned query is a substring of the cleaned candidate. The result is
    floored at 0 and is not capped, so it can exceed 100.

    Returns 0 when `score_cutoff` is given and the score falls below it.
    """
    generic_words = {'school', 'college', 'law', 'business', 'medical', 'of', 'the'}
    parenthetical = r'\(.*?\)'

    query = re.sub(parenthetical, '', query.lower().strip()).strip()
    candidate = re.sub(parenthetical, '', candidate.lower().strip()).strip()

    q_words = set(query.split())
    c_words = set(candidate.split())
    q_key_words = q_words - generic_words
    c_key_words = c_words - generic_words

    similarity = fuzz.token_sort_ratio(query, candidate)

    if q_words:
        coverage = len(q_words & c_words) / len(q_words) * 100
    else:
        coverage = 0

    if q_key_words:
        key_coverage = len(q_key_words & c_key_words) / len(q_key_words) * 100
    else:
        key_coverage = 0

    # Two points per word of difference in length.
    size_penalty = abs(len(q_words) - len(c_words)) * 2

    # Reward candidates that contain the whole query verbatim.
    if query in candidate:
        substring_bonus = 20
    else:
        substring_bonus = 0

    total = (0.3 * similarity + 0.3 * coverage + 0.4 * key_coverage) - size_penalty + substring_bonus

    below_cutoff = score_cutoff is not None and total < score_cutoff
    if below_cutoff:
        return 0
    return max(0, total)

In [28]:
def choose_best_id(match, score, ids):
    """Pick one wd_id for a matched label, or return all of them if ambiguous.

    Parameters:
        match (str): The matched index key (a normalised name or nickname).
        score (float): Score of the match. Unused here; it is accepted so
            callers can unpack a (key, score, ids) tuple straight into the call.
        ids (list[str]): wd_ids registered under `match`.

    Returns:
        str | list[str]: the only id when `ids` has one entry; otherwise the
        id of the single school whose primary name equals `match`; otherwise
        `ids` unchanged (no school, or more than one, matched by name).
    """
    if len(ids) == 1:
        return ids[0]

    # Index keys are built with .strip().lower(), so the comparison must use
    # the same normalisation or names with stray whitespace never match.
    wanted = match.strip().lower()

    name_matched_ids = []
    for candidate_id in ids:
        school = retrieve_school_object(candidate_id)
        if not school:
            continue
        if wanted == school['name'].strip().lower():
            name_matched_ids.append(school['wd_id'])

    # Prefer an id matched on its primary name over ids matched on a nickname.
    if len(name_matched_ids) == 1:
        return name_matched_ids[0]
    return ids

In [29]:
from typing import Tuple

def find_best_match(query: str, data: List[Tuple[str, float, List[str]]]):
    """Reduce the fuzzy-match candidates for `query` to a single best result.

    Parameters:
        query (str): The original search string.
        data: Candidate tuples of (matched_key, score, wd_ids).

    Returns:
        str | list[str] | None: a wd_id when the choice is unambiguous, a list
        of wd_ids when several entities share the winning label and none can
        be preferred, or None when `data` is empty or no candidate passes
        re-scoring.
    """
    if not data:
        return None

    if len(data) == 1:
        if len(data[0][2]) == 1 and data[0][1] > 95:
            return data[0][2][0]
        else:
            return choose_best_id(*data[0])

    # If every candidate string refers to the same entity there is nothing to
    # rank. A set of the ids decides this in one pass; a retry loop around the
    # same check would spin forever whenever the ids do all agree.
    distinct_ids = {wd_id for _, _, ids in data for wd_id in ids}
    if len(distinct_ids) == 1:
        print('all ids the same. returning.')
        return next(iter(distinct_ids))

    # Re-score the candidate strings with the stricter custom scorer.
    strings_to_match = [datum[0] for datum in data]
    new_scores = process.extract(query, strings_to_match, scorer=custom_scoring_function, score_cutoff=70)
    if not new_scores:
        return None

    ranked = [
        (matched_key, score, string_id_map[matched_key])
        for matched_key, score, _ in new_scores
    ]

    print('new scores')
    max_score = max(score for _, score, _ in ranked)
    print('max scores:', max_score)
    max_scores = [entry for entry in ranked if entry[1] >= max_score]

    print(max_scores)

    # Ties on the top score are resolved in favour of the first entry.
    return choose_best_id(*max_scores[0])


In [30]:
# Main function to resolve entity
def resolve_entity(query):
    """Resolve a free-text school name via the fuzzy-match pipeline.

    The case-folded query is used to fetch candidates, which are printed for
    inspection. The original query and the candidates are then passed to
    find_best_match, whose result is returned unchanged. Returns None when no
    candidates were found.
    """
    candidates = fetch_csv_properties(query.casefold())
    print('candidates', len(candidates), candidates)
    if not candidates:
        return None
    return find_best_match(query, candidates)

In [31]:
# Example query. Note that the next cell rebinds `query` to 'ucla' before it
# calls resolve_entity, so this value is not the one that gets resolved there.
query = 'harvard law'

In [32]:
# Resolve a single query and report how long the lookup took.
query = 'ucla'
# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock, which can be adjusted mid-run.
start_time = time.perf_counter()
match = resolve_entity(query)
end_time = time.perf_counter()
print(match)
print(end_time-start_time)

candidates 49 [('ucla school of dentistry', 90.0, ['Q14684196']), ('ucla school of public health', 90.0, ['Q7864045']), ('ucla division of humanities', 90.0, ['Q104801142']), ('ucla division of life sciences', 90.0, ['Q105427200']), ('ucla department of psychology', 90.0, ['Q105445551']), ('ucla school of nursing', 90.0, ['Q7864042']), ('ucla school of law', 90.0, ['Q8012898']), ('ucla lab school', 90.0, ['Q7864033']), ('hillel at ucla', 90.0, ['Q107318538']), ('ucla music library', 90.0, ['Q60644689']), ('arts library, ucla', 90.0, ['Q105105567']), ('ucla department of philosophy', 90.0, ['Q7864021']), ('ucla department of anthropology', 90.0, ['Q45137679']), ('ucla department of linguistics', 90.0, ['Q101024352']), ('ucla department of mathematics', 90.0, ['Q101024353']), ('ucla department of medicine', 90.0, ['Q101024355']), ('ucla department of neurobiology', 90.0, ['Q101024357']), ('ucla department of neurology', 90.0, ['Q101024359']), ('ucla department of neurosurgery', 90.0, ['Q

In [33]:
import random
# Short free-text queries kept for ad-hoc experiments; the loop below does not
# iterate over this list.
queries = [
    "macalester",
    "UMichigan",
    "Harvard Kennedy School",
    "university of minnesota",
    "Harvard",
    "harvard law",
    "harvard medical",
    "umn",
    "stanford",
    "Dunwoody College of Technology",
]

# Fixed seed so a rerun evaluates the same random half of the school list.
SAMPLE_SEED = 42

# Six hand-picked names first, then a random half of cleaner_schools.
# Random.sample leaves cleaner_schools untouched, so the list (and its alias
# `schools`) keeps its order and rerunning this cell is idempotent.
shortened_cleaner_schools = ['College of Liberal Arts (Texas A&M University)', 'Trinity School (New York City)', 'The Spence School', 'Booth School of Business (University of Chicago)', 'Harvard-Westlake School', 'College of Public Health (University of Iowa)']
shortened_cleaner_schools.extend(
    random.Random(SAMPLE_SEED).sample(cleaner_schools, len(cleaner_schools) // 2)
)

for query in shortened_cleaner_schools:
    print(query)
    # perf_counter is the monotonic clock intended for timing intervals.
    start_time = time.perf_counter()
    entity = resolve_entity(query)
    end_time = time.perf_counter()
    print(entity)
    print(f"{end_time-start_time}s to execute\n")


College of Liberal Arts (Texas A&M University)
candidates 2 [('texas a&m university college of liberal arts', 92.88888888888887, ['Q101077444']), ('texas a&m university', 90.0, ['Q49212'])]
new scores
max scores: 104.59701492537313
[('texas a&m university college of liberal arts', 104.59701492537313, ['Q101077444'])]
Q101077444
0.2250502109527588s to execute

Trinity School (New York City)
candidates 2 [('trinity school', 90.0, ['Q7842959', 'Q7842961', 'Q111943190']), ('trinity schools', 86.89655172413794, ['Q16191978'])]
new scores
max scores: 120.0
[('trinity school', 120.0, ['Q7842959', 'Q7842961', 'Q111943190'])]
['Q7842959', 'Q7842961', 'Q111943190']
0.18815898895263672s to execute

The Spence School
candidates 2 [('spence school', 95.0, ['Q7575953']), ('the peck school', 87.5, ['Q7756454'])]
new scores
max scores: 84.0
[('spence school', 84.0, ['Q7575953'])]
Q7575953
0.16609477996826172s to execute

Booth School of Business (University of Chicago)
candidates 1 [('booth school of 