In [1]:
!git clone https://github.com/DanielLiangAjj/DBMI_research_profile_crawler.git

Cloning into 'DBMI_research_profile_crawler'...
remote: Enumerating objects: 2222, done.[K
remote: Counting objects: 100% (773/773), done.[K
remote: Compressing objects: 100% (720/720), done.[K
remote: Total 2222 (delta 53), reused 767 (delta 50), pack-reused 1449[K
Receiving objects: 100% (2222/2222), 46.71 MiB | 15.42 MiB/s, done.
Resolving deltas: 100% (123/123), done.
Updating files: 100% (2146/2146), done.


In [2]:
!pip install nltk rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4a1a9940c4bd88d0da065f9ef9bb8b0ef52d62be49ed4e0ee87169b79112ba5e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


# **Data Preparation & Parsing**

In [31]:
import nltk
import json
import re
import csv
import os
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

nltk.download('punkt')
reference_path = '/content/DBMI_research_profile_crawler/researchers_files(Yilu_format)'
candidate_path = '/content/DBMI_research_profile_crawler/Research Overview'
comparison_csv_path = '/content/DBMI_research_profile_crawler/columbia_research_faculty_extracted.csv'
def parse_MeSh_keyword(path):
    keywords = []
    mesh_terms = []

    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for article in data:
        keywords.extend(article.get('Keywords', []))
        mesh_terms.extend(article.get('MeSH terms', []))

    return keywords, mesh_terms

def normalize_name(name):
    # Remove titles like "PhD", "MD", etc.
    name = re.sub(r',?\s*(PhD|MD|Dr|Prof|MS|mfa|ph.d|d.|mph|msw|mba|cgc|l|mbe|ma|otr/l|bcb|m.|llb|ch.b|gpd)\.?', '', name, flags=re.IGNORECASE)
    # Remove middle names/initials
    name = re.sub(r'\b[A-Z]\.\b', '', name)
    # Remove extra whitespace and convert to lower case
    name = re.sub(r'\s+', ' ', name).strip().lower()
    return name

comparison_names = {}
with open(comparison_csv_path, mode='r', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        comparison_names[row['Name']] = row['Research Introduction']

normalized_comparison_names = {normalize_name(name): intro for name, intro in comparison_names.items()}
file_names = []
file_names_dict = {}
for filename in os.listdir(candidate_path):
    if filename.endswith('.txt'):
        name, _ = os.path.splitext(filename)
        file_names.append(name.lower())
        name = name.lower()
        name = name.split(" ")
        first_name, last_name = name[0], name[-1]
        file_names_dict[(first_name, last_name)] = filename

for i in range(len(file_names)):
    name = file_names[i].split(" ")
    first_name, last_name = name[0], name[-1]
    file_names[i] = (first_name, last_name)

matches = {}
for i in normalized_comparison_names:
    name = i.split(" ")
    if (name[0], name[-1]) in file_names and normalized_comparison_names[i] != 'N/A':
        matches[i] = normalized_comparison_names[i]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **BLEU & ROUGE**

In [37]:
for test_name in matches:
    test_name_copy = test_name
    test_name = test_name.split(" ")
    test_first_name, test_last_name = test_name[0], test_name[-1]
    test_path = candidate_path+'/'+file_names_dict[(test_first_name, test_last_name)]
    with open(test_path, 'r', encoding='utf-8') as file:
        test_contents = file.read()
    # print(matches[test_name_copy])
    # Tokenize the paragraphs into sentences
    reference_sentences = nltk.sent_tokenize(matches[test_name_copy])
    candidate_sentences = nltk.sent_tokenize(test_contents)

    # Tokenize sentences into words
    reference_tokens = [nltk.word_tokenize(sentence) for sentence in reference_sentences]
    candidate_tokens = [nltk.word_tokenize(sentence) for sentence in candidate_sentences]

    # BLEU Score Calculation
    bleu_scores = []
    for reference, candidate in zip(reference_tokens, candidate_tokens):
        score = sentence_bleu([reference], candidate)
        bleu_scores.append(score)

    average_bleu_score = sum(bleu_scores) / len(bleu_scores)
    print(f"Average BLEU score for {test_name_copy}: {average_bleu_score:.4f}")

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = []
    for reference, candidate in zip(reference_sentences, candidate_sentences):
        scores = scorer.score(reference, candidate)
        rouge_scores.append(scores)

    # Calculate average ROUGE scores
    average_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    average_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    average_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    print(f"ROUGE Score for {test_name_copy}:")
    print(f"Average ROUGE-1 score: {average_rouge1:.2f}")
    print(f"Average ROUGE-2 score: {average_rouge2:.2f}")
    print(f"Average ROUGE-L score: {average_rougeL:.2f}")
    print("==================================================")

Average BLEU score for stephen p. goff: 0.0000
ROUGE Score for stephen p. goff:
Average ROUGE-1 score: 0.13
Average ROUGE-2 score: 0.01
Average ROUGE-L score: 0.11
Average BLEU score for eric c. greene: 0.0000
ROUGE Score for eric c. greene:
Average ROUGE-1 score: 0.13
Average ROUGE-2 score: 0.00
Average ROUGE-L score: 0.11
Average BLEU score for barry honig: 0.0000
ROUGE Score for barry honig:
Average ROUGE-1 score: 0.20
Average ROUGE-2 score: 0.01
Average ROUGE-L score: 0.16
Average BLEU score for peter kwong: 0.0000
ROUGE Score for peter kwong:
Average ROUGE-1 score: 0.19
Average ROUGE-2 score: 0.02
Average ROUGE-L score: 0.13
Average BLEU score for chia-wei cheng: 0.0000
ROUGE Score for chia-wei cheng:
Average ROUGE-1 score: 0.20
Average ROUGE-2 score: 0.01
Average ROUGE-L score: 0.15
Average BLEU score for jean gautier: 0.0000
ROUGE Score for jean gautier:
Average ROUGE-1 score: 0.16
Average ROUGE-2 score: 0.02
Average ROUGE-L score: 0.13
Average BLEU score for vincenzo aessano ge