In [69]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import json

# Load pre-trained model tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained BERT model
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get embeddings for a paragraph
def get_paragraph_embedding(paragraph):
    encoded_input = tokenizer(paragraph, return_tensors='pt', truncation=True, max_length=512)

    with torch.no_grad():
        outputs = model(**encoded_input)

    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return cls_embedding

In [70]:
def read_paragraphs_from_csv(file_path, name_column, paragraph_column, researchers_name):
    df = pd.read_csv(file_path)
    paragraphs = []
    for name in researchers_name:
        for _, row in df.iterrows():
            if pd.notna(row[name_column]) and name in row[name_column]:
                if pd.notna(row[paragraph_column]):
                    paragraphs.append((name, row[paragraph_column], row["Research Interests"]))
                break
    return paragraphs

def read_paragraphs_from_json(file_path, key):
    with open(file_path, 'r') as file:
        data = json.load(file)
    paragraphs = [(entry["Researcher"],entry[key], entry["Research Interests"]) for entry in data if key in entry]  # Extract paragraphs using the specified key
    return paragraphs

In [71]:
csv_file_path = '/content/columbia_research_faculty_extracted.csv'
json_file_path = '/content/GPT4o_summarized_researcher_profile.json'
csv_paragraph_column = "Research Introduction"
csv_name_column = "Name"
json_key = "Research Overview"
researchers_name = [
    "Joseph A. Gogos", "Jacqueline Gottlieb", "Richard S. Mann", "Mimi Shirasu-Hiza",
    "Jane Dodd", "Larry Abbott", "Christoph Kellendonk", "Vincent P. Ferrera", "Robert D. Hawkins",
    "Nikolaus Kriegeskorte", "Henry Colecraft", "Eric A. Schon", "Eric Kandel", "Jonathan A. Javitch",
    "Rui M. Costa", "Wayne Hendrickson", "Stavros Lomvardas", "Steven A. Siegelbaum", "David Sulzer",
    "Laura Landweber"
]
csv_paragraphs = read_paragraphs_from_csv(csv_file_path, csv_name_column, csv_paragraph_column, researchers_name)
csv_paragraphs.sort()
# Read paragraphs from JSON
json_paragraphs = (read_paragraphs_from_json(json_file_path, json_key))
csv_paragraphs.sort()


In [72]:
for i in range(len(csv_paragraphs)):
    embedding1 = get_paragraph_embedding(csv_paragraphs[i][1])
    embedding2 = get_paragraph_embedding(json_paragraphs[i][1])
    print(csv_paragraphs[i][2].split(' '))

    # Calculate cosine similarity
    overview_similarity = cosine_similarity([embedding1], [embedding2])
    embedding1 = get_paragraph_embedding(csv_paragraphs[i][2])
    embedding2 = get_paragraph_embedding(" ".join(json_paragraphs[i][2]))
    interests_similarity = cosine_similarity([embedding1], [embedding2])
    print("Name:", csv_paragraphs[i][0])
    print(f"Research Overview Cosine Similarity: {overview_similarity[0][0]}")
    print("Research Interest Cosine Similarity:", interests_similarity[0][0])



['Models', 'of', 'Psychiatric', 'Disorders', 'Neurobiology', 'of', 'Cognitive', 'and', 'Motivated', 'Behaviors', 'Neurobiology', 'of', 'Disease', 'Neurobiology', 'of', 'Learning', 'and', 'Memory', 'Neurobiology', 'of', 'Psychiatric', 'Disorders', 'Neurogenetics', 'Thalamo-Cortical', 'and', 'Basal', 'Ganglia', 'Circuitry']
Name: Christoph Kellendonk
Research Overview Cosine Similarity: 0.8695370554924011
Research Interest Cosine Similarity: 0.91710955
['Cell', 'biology', 'Learning', 'and', 'behavior', 'Neurodegeneration', 'and', 'repair', 'Neuroimmunology', 'Synapses', 'and', 'Circuits', 'Grants', 'Four', 'current', 'R01s', '(NIMH.', 'NINDS.', 'NIDA)', 'Lead', 'PI,', 'ASAP', 'Foundation', 'award', 'for', 'research', 'in', "Parkinson's", 'Multiple', 'private', 'foundation', 'awards']
Name: David Sulzer
Research Overview Cosine Similarity: 0.8097734451293945
Research Interest Cosine Similarity: 0.7999315
['Mitochondrial', 'genetics', 'and', 'the', 'molecular', 'basis', 'of', 'human', 'mit

# **Research Interest Semantic Similarity Comparison Based On Ontology**

In [73]:
pip install owlready2 nltk



In [95]:
import owlready2
import nltk
from owlready2 import get_ontology
from math import log
from itertools import product

# Load information content file from NLTK
nltk.download('wordnet_ic')
brown_ic = nltk.corpus.wordnet_ic.ic('ic-brown.dat')

# Load the Gene Ontology
ontology = get_ontology("http://purl.obolibrary.org/obo/go.owl").load()

# Function to get the IC of a term
def get_information_content(term, ic_dict):
    if term in ic_dict:
        return ic_dict[term]
    else:
        return -log(1e-10)

# Function to find the most informative common ancestor (MICA)
def get_mica(term1, term2, ic_dict):
    ancestors1 = set(term1.ancestors())
    ancestors2 = set(term2.ancestors())
    common_ancestors = ancestors1.intersection(ancestors2)
    if not common_ancestors:
        return None, 0.0
    mica = max(common_ancestors, key=lambda ancestor: get_information_content(ancestor, ic_dict))
    return mica, get_information_content(mica, ic_dict)

# Function to calculate Resnik similarity between two terms
def resnik_similarity(term1, term2, ic_dict):
    mica, ic = get_mica(term1, term2, ic_dict)
    return ic

# Precompute the information content dictionary from ontology
ic_dict = {}
for term in ontology.classes():
    ic_dict[term] = get_information_content(term, brown_ic)

# Function to map terms to ontology classes
def map_to_ontology(term, ontology):
    return ontology.search_one(label=term)

# Iterate through the paragraphs and compute similarities
for i in range(len(csv_paragraphs)):
    name = csv_paragraphs[i][0]

    # Extract and clean human research interests
    interests_terms = csv_paragraphs[i][2].split(" ")
    interests_terms_human = [term for term in interests_terms if term not in ["and", "the", "of"]]
    print("Human",interests_terms_human)

    interests_terms_gpt = json_paragraphs[i][2]
    print("GPT",interests_terms_gpt)

    # Map terms to ontology classes
    mapped_terms1 = [map_to_ontology(term, ontology) for term in interests_terms_human]
    mapped_terms2 = [map_to_ontology(term, ontology) for term in interests_terms_gpt]
    print(mapped_terms1)
    print(mapped_terms2)
    pairwise_similarities = []
    for term1, term2 in product(mapped_terms1, mapped_terms2):
        if term1 and term2:
            similarity = resnik_similarity(term1, term2, ic_dict)
            pairwise_similarities.append(similarity)
    # print(pairwise_similarities)
    if pairwise_similarities:
        overall_similarity = sum(pairwise_similarities) / len(pairwise_similarities)
    else:
        overall_similarity = 0.0

    print(f"Overall similarity between Research Interests for {name}: {overall_similarity}")


[nltk_data] Downloading package wordnet_ic to /root/nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!


Human ['Models', 'Psychiatric', 'Disorders', 'Neurobiology', 'Cognitive', 'Motivated', 'Behaviors', 'Neurobiology', 'Disease', 'Neurobiology', 'Learning', 'Memory', 'Neurobiology', 'Psychiatric', 'Disorders', 'Neurogenetics', 'Thalamo-Cortical', 'Basal', 'Ganglia', 'Circuitry']
GPT ['Neural Circuitry', 'Neurotransmitter Systems', 'Cognitive Neuroscience', 'Motor Function', 'Neurodevelopment', 'Neuroplasticity', 'Psychiatric Disorders', 'Molecular Neuroscience', 'Behavioral Neuroscience', 'Synaptic Transmission']
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
[None, None, None, None, None, None, None, None, None, None]
Overall similarity between Research Interests for Christoph Kellendonk: 0.0
Human ['Cell', 'biology', 'Learning', 'behavior', 'Neurodegeneration', 'repair', 'Neuroimmunology', 'Synapses', 'Circuits', 'Grants', 'Four', 'current', 'R01s', '(NIMH.', 'NINDS.', 'NIDA)', 'Lead', 'PI,', 'ASAP', 'Foundation