In [1]:
!git clone https://github.com/DanielLiangAjj/DBMI_research_profile_crawler.git

Cloning into 'DBMI_research_profile_crawler'...
remote: Enumerating objects: 2473, done.[K
remote: Counting objects: 100% (1024/1024), done.[K
remote: Compressing objects: 100% (920/920), done.[K
remote: Total 2473 (delta 167), reused 920 (delta 101), pack-reused 1449 (from 1)[K
Receiving objects: 100% (2473/2473), 55.97 MiB | 22.68 MiB/s, done.
Resolving deltas: 100% (237/237), done.
Updating files: 100% (2278/2278), done.


In [2]:
!pip install nltk rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=287f364481c7b7795693f807147a98d346cba2b77a0045171fee648c997ad080
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


# **Data Preparation & Parsing**

In [3]:
import nltk
import json
import re
import csv
import os
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
candidate_path = '/content/DBMI_research_profile_crawler/Research Overview'
comparison_csv_path = '/content/DBMI_research_profile_crawler/columbia_research_faculty_extracted.csv'
ed_gpt_path = '/content/DBMI_research_profile_crawler/results_gpt_ed.json'
ed_scraper_path = '/content/DBMI_research_profile_crawler/results_scraper_ed.json'

# parsing out the keyword and the MeSH Term
def parse_MeSh_keyword(path):
    keywords = []
    mesh_terms = []

    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for article in data:
        keywords.extend(article.get('Keywords', []))
        mesh_terms.extend(article.get('MeSH terms', []))

    return keywords, mesh_terms

def normalize_name(name):
    if "," in name:
        comma_index = name.find(",")
        return name[:comma_index]
    return name

def parse_ed_data():
    with open(ed_gpt_path, 'r') as gpt_file, open(ed_scraper_path, 'r') as scraper_file:
        gpt_data = json.load(gpt_file)
        scraper_data = json.load(scraper_file)

    gpt_dict = {}
    scraper_dict = {}
    for name in scraper_data:
        if len(scraper_data[name].split()) > 10:
            scraper_dict[normalize_name(name)] = scraper_data[name]


    for researcher in gpt_data:
        name = researcher["Name"]
        name = name.split(", ")
        name[0], name[-1] = name[-1], name[0]
        name = " ".join(name)
        name = normalize_name(name)
        content = researcher.get('DivConq', "")
        gpt_dict[name] = content

    return scraper_dict, gpt_dict
# def normalize_name(name):
#     # Remove titles like "PhD", "MD", etc.
#     name = re.sub(r',?\s*(PhD|MD|Dr|Prof|MS|mfa|ph.d|d.|mph|msw|mba|cgc|l|mbe|ma|otr/l|bcb|m.|llb|ch.b|gpd)\.?', '', name, flags=re.IGNORECASE)
#     # Remove middle names/initials
#     name = re.sub(r'\b[A-Z]\.\b', '', name)
#     # Remove extra whitespace and convert to lower case
#     name = re.sub(r'\s+', ' ', name).strip().lower()
#     return name

# function to get rid of the titles following after the name



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [4]:
from tkinter import FALSE
comparison_names = {}
# read from the csv file and extract the researcher name with their research introduction scraped by the web scraper
with open(comparison_csv_path, mode='r', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        if row['Research Introduction'] != 'N/A':
            comparison_names[row['Name']] = row['Research Introduction']
# scraped content
normalized_comparison_names = {normalize_name(name): intro for name, intro in comparison_names.items()}
# print(normalized_comparison_names.keys())
scraper_dict, gpt_dict = parse_ed_data()

# add Ed's data into the dictionary
for name in scraper_dict:
    found = False
    for i in normalized_comparison_names:
        if (name[0], name[-1]) == (i[0], i[-1]):
            found = True
            break
    if not found:
        capitalized_name = ""
        for i in range(len(name)):
            if i != 0 and name[i-1] == " ":
                capitalized_name += name[i].upper()
            elif i == 0:
                capitalized_name += name[i].upper()
            else:
                capitalized_name += name[i]
        normalized_comparison_names[capitalized_name] = scraper_dict[name]

# print(normalized_comparison_names)
#names extracted from the json file
file_names = []
# GPT summarized content based on Keywords and MeSH Terms
file_names_dict = {}
for filename in os.listdir(candidate_path):
    if filename.endswith('.txt'):
        name, _ = os.path.splitext(filename)
        # file_names.append(name.lower())
        file_names.append(name)
        # name = name.lower()
        name = name.split(" ")
        first_name, last_name = name[0], name[-1]
        file_names_dict[(first_name, last_name)] = filename
print(len(file_names))
for i in range(len(file_names)):
    name = file_names[i].split(" ")
    first_name, last_name = name[0], name[-1]
    file_names[i] = (first_name, last_name)

matches = {}
for i in normalized_comparison_names:
    name = i.split(" ")
    if (name[0], name[-1]) in file_names and normalized_comparison_names[i] != 'N/A':
        matches[i] = normalized_comparison_names[i]

def remove_last_sentence_if_link(paragraph):
    # Regular expression to match sentences
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')

    # Regular expression to detect URLs
    url_pattern = re.compile(r'http[s]?://\S+')
    sentences = sentence_endings.split(paragraph.strip())

    last_sentence = sentences[-1]
    if url_pattern.search(last_sentence):
        sentences.pop()
    new_paragraph = ' '.join(sentences)
    return new_paragraph
for name in matches:
    matches[name] = remove_last_sentence_if_link(matches[name])
    # print(matches[name])
abstract_match = {}
for i in gpt_dict:
    i_split = i.split(" ")
    for name in normalized_comparison_names:
        name_split = name.split(' ')
        if i == name and normalized_comparison_names[name] != 'N/A':
            abstract_match[name] = gpt_dict[i]
print("Abstract based:", len(abstract_match))
print("MeSH Term based:", len(matches))


714
Abstract based: 119
MeSH Term based: 214


# **GPT Text Quality Evaluation**

In [None]:
!pip install openai

Collecting openai
  Downloading openai-1.40.6-py3-none-any.whl.metadata (22 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.40.6-py3-none-any.whl (361 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m361.3/361.3 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━

In [None]:
from openai import OpenAI
for test_name in matches:
    test_name_copy = test_name
    test_name = test_name.split(" ")
    test_first_name, test_last_name = test_name[0], test_name[-1]
    test_path = candidate_path + '/' + file_names_dict[(test_first_name, test_last_name)]

    with open(test_path, 'r', encoding='utf-8') as file:
        test_contents = file.read()

    # Tokenize the paragraphs into sentences
    # reference_sentences = nltk.sent_tokenize(matches[test_name_copy])
    # candidate_sentences = nltk.sent_tokenize(test_contents)
    GPT_API_KEY = ""
    client = OpenAI(api_key=GPT_API_KEY)
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
              {"role": "system",
               "content": "You are a helpful Researcher Profile Summarization Quality evaluator."},
              {"role": "user", "content": f"""Based on the given evaluation metrics, evluate the research overview about a researcher given below.
Evluation Metrics: (CONTENT) Grade each on a scale of 1 to 10 for how well it represents a researcher. For each abstract, rate:
Excellent Profile Description (8.1-10.0): The profile provides a comprehensive and compelling overview of the researcher’s work. It clearly highlights the significance, originality, and impact of their research. The description is well-organized, engaging, and includes specific achievements, methodologies, and outcomes. It effectively communicates the researcher’s expertise and contributions to the field.
Good Profile Description (6.1-8.0): The profile gives a clear and informative summary of the researcher’s work. It includes relevant details about their research focus, methodologies, and contributions. While the description is solid and informative, it may lack some depth or clarity in certain areas. It qualifies as a strong representation of the researcher’s achievements and expertise.
Fair Profile Description (4.1-6.0): The profile provides a basic overview of the researcher’s work but lacks detail or clarity in some aspects. It may include general information about the research focus but lacks specific examples or a clear explanation of the impact and significance. The description is sufficient for basic understanding but does not effectively highlight the researcher’s unique contributions or achievements.
Poor Profile Description (2.1-4.0): The profile provides minimal information about the researcher’s work. It lacks detail, specificity, and clarity, making it difficult to understand the research focus or its significance. The description may be too vague or incomplete, failing to convey the researcher’s contributions effectively.
Inadequate Profile Description (0.0-2.0): The profile does not provide meaningful information about the researcher’s work. It is either extremely vague, misleading, or irrelevant, with little to no useful content about the researcher’s contributions or achievements. The description fails to communicate the research focus and is not helpful for understanding the researcher’s expertise.

(QUALITY) Judge each description on a 10 point scale (1 being poor, 10 being great) for coherence, factual consistency, comprehensiveness, and harmfulness/misrepresentation. For each abstract, rate
Relevance (10): Perfectly relevant, up-to-date, and directly aligned with the researcher’s current focus.
Precision (10): Assesses the level of precision in describing specific contributions, methodologies, and outcomes.
Coherence (10): The description is exceptionally clear and logically organized. It presents information in a seamless, engaging manner, making it easy to grasp the research focus and its significance.
Factual Consistency (10): The description is entirely accurate and consistently factual. It provides a reliable and precise representation of the researcher’s work without any errors or misleading information.
Comprehensive (10): The description is exceptionally comprehensive. It includes detailed information about all significant aspects of the research providing a deep understanding of the researcher’s work.
Harmfulness (10): The description is entirely accurate and represents the researcher’s work honestly and transparently. It is well-crafted to prevent any potential harm or misrepresentation.

Research Overview to be evaluated: {test_contents}
"""
              }


            ]
        )
    content = response.choices[0].message.content
    print(content)
    break

APIConnectionError: Connection error.

# **BLEU, ROUGE, and METEOR MeSH & Keywords**

In [None]:
average_bleu_score_for_researchers = 0
average_rouge1_score_for_researchers = 0
average_rouge2_score_for_researchers = 0
average_rougeL_score_for_researchers = 0
average_meteor_score_for_researchers = 0
for test_name in matches:
    test_name_copy = test_name
    test_name = test_name.split(" ")
    test_first_name, test_last_name = test_name[0], test_name[-1]
    test_path = candidate_path + '/' + file_names_dict[(test_first_name, test_last_name)]
    with open(test_path, 'r', encoding='utf-8') as file:
        test_contents = file.read()
    if average_bleu_score_for_researchers == 0:
        print(matches[test_name_copy])
        print(test_contents)
    # Tokenize the paragraphs into sentences
    reference_sentences = nltk.sent_tokenize(matches[test_name_copy])
    candidate_sentences = nltk.sent_tokenize(test_contents)

    # Tokenize sentences into words for BLEU and METEOR
    reference_tokens = [nltk.word_tokenize(sentence) for sentence in reference_sentences]
    candidate_tokens = [nltk.word_tokenize(sentence) for sentence in candidate_sentences]

    # BLEU Score Calculation
    bleu_scores = []
    for reference, candidate in zip(reference_tokens, candidate_tokens):
        score = sentence_bleu([reference], candidate)
        bleu_scores.append(score)

    average_bleu_score = sum(bleu_scores) / len(bleu_scores)
    # print(f"Average BLEU score for {test_name_copy}: {average_bleu_score}")
    average_bleu_score_for_researchers += average_bleu_score

    # ROUGE Score Calculation
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = []
    for reference, candidate in zip(reference_sentences, candidate_sentences):
        scores = scorer.score(reference, candidate)
        rouge_scores.append(scores)

    average_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    average_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    average_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    # print(f"ROUGE Score for {test_name_copy}:")
    # print(f"Average ROUGE-1 score: {average_rouge1:.2f}")
    # print(f"Average ROUGE-2 score: {average_rouge2:.2f}")
    # print(f"Average ROUGE-L score: {average_rougeL:.2f}")
    average_rouge1_score_for_researchers += average_rouge1
    average_rouge2_score_for_researchers += average_rouge2
    average_rougeL_score_for_researchers += average_rougeL

    meteor_scores = []
    for reference, candidate in zip(reference_sentences, candidate_sentences):
        reference_tokens = nltk.word_tokenize(reference)
        candidate_tokens = nltk.word_tokenize(candidate)
        score = meteor_score([reference_tokens], candidate_tokens)
        meteor_scores.append(score)

    average_meteor_score = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0
    # print(f"METEOR Score for {test_name_copy}: {average_meteor_score:.2f}")
    average_meteor_score_for_researchers += average_meteor_score
average_bleu_score_for_researchers /= len(matches)
average_rouge1_score_for_researchers /= len(matches)
average_rouge2_score_for_researchers /= len(matches)
average_rougeL_score_for_researchers /= len(matches)
average_meteor_score_for_researchers /= len(matches)
print(f"Average BLEU Score for all researchers: {average_bleu_score_for_researchers}")
print(f"Average ROUGE1 Score for all researchers: {average_rouge1_score_for_researchers}")
print(f"Average ROUGE2 Score for all researchers: {average_rouge2_score_for_researchers}")
print(f"Average ROUGEL Score for all researchers: {average_rougeL_score_for_researchers}")
print(f"Average METEOR Score for all researchers: {average_meteor_score_for_researchers}")



Al-Hashimi is interested in developing a deep, quantitative, and predictive understanding of cellular processes based on the fundamental behaviors of nucleic acids and their interactions with protein binding partners.  Over the past two decades, Al-Hashimi and his trainees developed approaches combining NMR spectroscopy, computational modeling, optical melting experiments, and chemical probing to determine 3D dynamic ensembles of RNA and DNA molecules at atomic resolution.  Using dynamic ensembles of nucleic acids, the Al-Hashimi group has developed quantitative and predictive models for several fundamental biological processes, including DNA replication fidelity, Tat-dependent HIV-1 transcriptional activation, RNA folding, and the impact of post-transcriptional modifications such as m6A and Nm on translation, splicing, and RNA-protein interactions.  These studies have reshaped structural biology, revealing dynamic ensembles as the fundamental behavior of biomolecules needed to underst

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU Score for all researchers: 0.0016423131706962465
Average ROUGE1 Score for all researchers: 0.16907457378191698
Average ROUGE2 Score for all researchers: 0.01619315209962701
Average ROUGEL Score for all researchers: 0.12856735317222412
Average METEOR Score for all researchers: 0.1356082739583685


# **BLEU, ROUGE and METEOR Abstract**

In [None]:
average_bleu_score_for_researchers = 0
average_rouge1_score_for_researchers = 0
average_rouge2_score_for_researchers = 0
average_rougeL_score_for_researchers = 0
average_meteor_score_for_researchers = 0
for test_name in abstract_match:
    test_contents = abstract_match[test_name]
    if average_bleu_score_for_researchers == 0:
        print(test_name)
        print(matches[test_name])
        print(test_contents)
    # Tokenize the paragraphs into sentences
    reference_sentences = nltk.sent_tokenize(matches[test_name])
    candidate_sentences = nltk.sent_tokenize(test_contents)

    # Tokenize sentences into words for BLEU and METEOR
    reference_tokens = [nltk.word_tokenize(sentence) for sentence in reference_sentences]
    candidate_tokens = [nltk.word_tokenize(sentence) for sentence in candidate_sentences]

    # BLEU Score Calculation
    bleu_scores = []
    for reference, candidate in zip(reference_tokens, candidate_tokens):
        score = sentence_bleu([reference], candidate)
        bleu_scores.append(score)

    average_bleu_score = sum(bleu_scores) / len(bleu_scores)
    # print(f"Average BLEU score for {test_name_copy}: {average_bleu_score}")
    average_bleu_score_for_researchers += average_bleu_score

    # ROUGE Score Calculation
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = []
    for reference, candidate in zip(reference_sentences, candidate_sentences):
        scores = scorer.score(reference, candidate)
        rouge_scores.append(scores)

    average_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    average_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    average_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    # print(f"ROUGE Score for {test_name_copy}:")
    # print(f"Average ROUGE-1 score: {average_rouge1:.2f}")
    # print(f"Average ROUGE-2 score: {average_rouge2:.2f}")
    # print(f"Average ROUGE-L score: {average_rougeL:.2f}")
    average_rouge1_score_for_researchers += average_rouge1
    average_rouge2_score_for_researchers += average_rouge2
    average_rougeL_score_for_researchers += average_rougeL

    meteor_scores = []
    for reference, candidate in zip(reference_sentences, candidate_sentences):
        reference_tokens = nltk.word_tokenize(reference)
        candidate_tokens = nltk.word_tokenize(candidate)
        score = meteor_score([reference_tokens], candidate_tokens)
        meteor_scores.append(score)

    average_meteor_score = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0
    # print(f"METEOR Score for {test_name_copy}: {average_meteor_score:.2f}")
    average_meteor_score_for_researchers += average_meteor_score
average_bleu_score_for_researchers /= len(matches)
average_rouge1_score_for_researchers /= len(matches)
average_rouge2_score_for_researchers /= len(matches)
average_rougeL_score_for_researchers /= len(matches)
average_meteor_score_for_researchers /= len(matches)
print(f"Average BLEU Score for all researchers: {average_bleu_score_for_researchers}")
print(f"Average ROUGE1 Score for all researchers: {average_rouge1_score_for_researchers}")
print(f"Average ROUGE2 Score for all researchers: {average_rouge2_score_for_researchers}")
print(f"Average ROUGEL Score for all researchers: {average_rougeL_score_for_researchers}")
print(f"Average METEOR Score for all researchers: {average_meteor_score_for_researchers}")



Patrick Ryan
Patrick Ryan, PhD is Vice President, Observational Health Data Analytics at Janssen Research and Development, where he is leading efforts to develop and apply analysis methods to better understand the real-world effects of medical products. He is an original collaborator in Observational Health Data Sciences and Informatics (OHDSI), a multi-stakeholder, interdisciplinary collaborative to create open-source solutions that bring out the value of observational health data through large-scale analytics. He served as a principal investigator of the Observational Medical Outcomes Partnership (OMOP), a public-private partnership chaired by the Food and Drug Administration, where he led methodological research to assess the appropriate use of observational health care data to identify and evaluate drug safety issues. Patrick received his undergraduate degrees in Computer Science and Operations Research at Cornell University, his Master of Engineering in Operations Research and Ind

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU Score for all researchers: 0.0003975090880958469
Average ROUGE1 Score for all researchers: 0.07913616483109243
Average ROUGE2 Score for all researchers: 0.0050418057695733355
Average ROUGEL Score for all researchers: 0.05921020142988679
Average METEOR Score for all researchers: 0.06576000344827623


# **Rhetorical Strucutre Theory**

# **Part-of-speech Tagging**

In [5]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m99.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [8]:
def pos_distribution(doc):
    '''
    This function counts the occurrences of each PoS tag in the document and returns a
    dictionary where the keys are the PoS tags and the values are their respective counts.
    '''
    pos_counts = doc.count_by(spacy.attrs.POS)
    return {doc.vocab[k].text: v for k, v in pos_counts.items()}

def lexical_diversity_by_pos(doc):
    '''
    This function creates a dictionary where each key is a PoS tag, and the value is the
    number of unique words (lexical diversity) associated with that tag in the document.
    '''
    pos_words = {}
    for token in doc:
        pos = token.pos_
        if pos not in pos_words:
            pos_words[pos] = set()
        pos_words[pos].add(token.text)
    return {pos: len(words) for pos, words in pos_words.items()}

def dependency_tree_depth(doc):
    '''
    The depth of the dependency tree is determined by the longest path from any token
    to the root of the sentence. This function finds the maximum depth among all tokens
    in the document.
    '''
    return max([len(list(token.ancestors)) for token in doc])

def head_dependent_pairs(doc):
    '''
    This function identifies the syntactic relationships between words by returning a list
    of tuples where each tuple consists of a head word and its dependent word.
    '''
    return [(token.head.text, token.text) for token in doc if token.head != token]

def modifiers_analysis(doc):
    '''
    This function identifies modifiers (adjectives and adverbs) in the document and
    returns a list of tuples where each tuple contains the head word, the modifier,
    and the type of modification (adjectival or adverbial).
    '''
    modifiers = []
    for token in doc:
        if token.dep_ in ['amod', 'advmod']:
            modifiers.append((token.head.text, token.text, token.dep_))
    return modifiers

def syntactic_complexity(doc):
    '''
    Syntactic complexity is measured by averaging the length of dependency paths (the depth
    of the tree) for all tokens in the document. This gives an indication of how complex
    the sentence structures are.
    '''
    return sum(len(list(token.ancestors)) for token in doc) / len(doc)

def syntactic_ambiguity(doc):
    """
    Identify potential syntactic ambiguities in the document.
    Ambiguity is flagged if a token has multiple potential heads or ambiguous modifiers.
    """
    ambiguities = []

    for token in doc:
        if len(list(token.children)) > 1:
            # Checking for ambiguity in modifiers
            modifiers = [child for child in token.children if child.dep_ in ['amod', 'advmod']]
            if len(modifiers) > 1:
                ambiguities.append((token.text, "Ambiguous Modifiers", [mod.text for mod in modifiers]))

        # Ambiguity in conjunctions (coordinating conjunctions)
        if token.dep_ == 'cc' and token.head.dep_ in ['conj']:
            ambiguities.append((token.text, "Ambiguous Conjunction", token.head.text))

    return ambiguities



In [9]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
counter = 0
for test_name in matches:
    print(f"Processing {test_name}")
    test_name_copy = test_name
    test_name = test_name.split(" ")
    test_first_name, test_last_name = test_name[0], test_name[-1]
    test_path = candidate_path + '/' + file_names_dict[(test_first_name, test_last_name)]

    with open(test_path, 'r', encoding='utf-8') as file:
        test_contents = file.read()

    # Tokenize the paragraphs into sentences
    reference_sentences = nlp(matches[test_name_copy]) # human written
    candidate_sentences = nlp(test_contents) # GPT Generated

    human_written_pos_dist = pos_distribution(reference_sentences)
    gpt_generated_pos_dist = pos_distribution(candidate_sentences)

    print("PoS Distribution for Human written text:", dict(sorted(human_written_pos_dist.items(), key=lambda item: item[1], reverse = True)))
    print("PoS Distribution for GPT generate text:", dict(sorted(gpt_generated_pos_dist.items(), key=lambda item: item[1], reverse = True)))
    print("====================================")

    human_written_lexi_diver = lexical_diversity_by_pos(reference_sentences)
    gpt_generated_lexi_diver = lexical_diversity_by_pos(candidate_sentences)


    print("Lexical Diveristy for Human written text:", dict(sorted(human_written_lexi_diver.items(), key=lambda item: item[1], reverse = True)))
    print("Lexical Diveristy for GPT generate text:", dict(sorted(gpt_generated_lexi_diver.items(), key=lambda item: item[1], reverse = True)))
    print("====================================")

    human_written_tree_depth = dependency_tree_depth(reference_sentences)
    gpt_generated_tree_depth = dependency_tree_depth(candidate_sentences)

    print("Dependency Tree Depth for Human written text:", human_written_tree_depth)
    print("Dependency Tree Depth for GPT generated text:", gpt_generated_tree_depth)
    print("====================================")

    human_written_head_dependent = head_dependent_pairs(reference_sentences)
    gpt_generated_head_dependent = head_dependent_pairs(candidate_sentences)

    print("Head-Dependent Pairs in Human written text:", human_written_head_dependent)
    print("Head-Dependent Pairs in GPT generated text:", gpt_generated_head_dependent)
    print("====================================")

    human_written_modifiers = modifiers_analysis(reference_sentences)
    gpt_generated_modifiers = modifiers_analysis(candidate_sentences)

    print("Modifiers in Human written text:", human_written_modifiers)
    print("Modifiers in GPT generated text:", gpt_generated_modifiers)
    print("====================================")


    human_written_complexity = syntactic_complexity(reference_sentences)
    gpt_generated_complexity = syntactic_complexity(candidate_sentences)

    print("Syntactic Complexity for Human written text:", human_written_complexity)
    print("Syntactic Complexity for GPT generated text:", gpt_generated_complexity)
    print("====================================")


    human_written_ambiguities = syntactic_ambiguity(reference_sentences)
    gpt_generated_ambiguities = syntactic_ambiguity(candidate_sentences)

    print("Syntactic ambiguities for Human written text:", human_written_ambiguities)
    print("Syntactic ambiguities for GPT generated text:", gpt_generated_ambiguities)
    print("====================================")

    # print the result of PoS
    # for token in reference_sentences:
    #     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
    #             token.shape_, token.is_alpha, token.is_stop)
    # for token in candidate_sentences:
    #     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
    #             token.shape_, token.is_alpha, token.is_stop)
    reference_sentences_spans = list(reference_sentences.sents)
    candidate_sentences_spans = list(candidate_sentences.sents)
    # display PoS relationship with graph
    # displacy.serve(reference_sentences_spans, style="dep")
    # displacy.serve(candidate_sentences_spans, style="dep")
    if counter == 0:
        break


Processing Hashim M. Al-Hashimi
PoS Distribution for Human written text: {'NOUN': 108, 'ADJ': 61, 'PUNCT': 55, 'ADP': 44, 'VERB': 43, 'PROPN': 30, 'CCONJ': 25, 'DET': 20, 'ADV': 10, 'SPACE': 7, 'PART': 7, 'AUX': 6, 'PRON': 6, 'NUM': 3}
PoS Distribution for GPT generate text: {'NOUN': 51, 'ADP': 23, 'PUNCT': 22, 'ADJ': 18, 'PROPN': 16, 'VERB': 13, 'DET': 9, 'CCONJ': 8, 'ADV': 5, 'PRON': 5, 'SPACE': 2, 'PART': 2}
Lexical Diveristy for Human written text: {'NOUN': 87, 'ADJ': 39, 'VERB': 35, 'PROPN': 16, 'ADP': 13, 'ADV': 9, 'DET': 7, 'PRON': 5, 'PUNCT': 3, 'AUX': 3, 'NUM': 3, 'CCONJ': 1, 'SPACE': 1, 'PART': 1}
Lexical Diveristy for GPT generate text: {'NOUN': 44, 'ADJ': 15, 'VERB': 13, 'PROPN': 10, 'ADP': 9, 'ADV': 5, 'PUNCT': 4, 'PRON': 4, 'PART': 2, 'DET': 2, 'SPACE': 1, 'CCONJ': 1}
Dependency Tree Depth for Human written text: 16
Dependency Tree Depth for GPT generated text: 10
Head-Dependent Pairs in Human written text: [('Hashimi', 'Al'), ('Hashimi', '-'), ('is', 'Hashimi'), ('is', '

# **Dependency Parsing**

In [None]:
import spacy
from spacy import displacy
counter = 0
nlp = spacy.load("en_core_web_sm")
for test_name in matches:
    print(f"Processing {test_name}")
    test_name_copy = test_name
    test_name = test_name.split(" ")
    test_first_name, test_last_name = test_name[0], test_name[-1]
    test_path = candidate_path + '/' + file_names_dict[(test_first_name, test_last_name)]

    with open(test_path, 'r', encoding='utf-8') as file:
        test_contents = file.read()

    # Tokenize the paragraphs into sentences
    reference_sentences = nlp(matches[test_name_copy]) # human written
    candidate_sentences = nlp(test_contents) # GPT Generated

    for token in reference_sentences:
        print(f"Token: {token.text}, Head: {token.head.text}, Dep: {token.dep_}, POS: {token.pos_}")
    for token in candidate_sentences:
        print(f"Token: {token.text}, Head: {token.head.text}, Dep: {token.dep_}, POS: {token.pos_}")

    # Visualizing the dependency parse tree
    displacy.render(reference_sentences, style="dep", jupyter=True, options={'distance': 90})
    displacy.render(candidate_sentences, style="dep", jupyter=True, options={'distance': 90})
    if counter == 0:
        break

Processing Hashim M. Al-Hashimi
Token: Al, Head: Hashimi, Dep: compound, POS: PROPN
Token: -, Head: Hashimi, Dep: punct, POS: PUNCT
Token: Hashimi, Head: is, Dep: nsubj, POS: PROPN
Token: is, Head: is, Dep: ROOT, POS: AUX
Token: interested, Head: is, Dep: acomp, POS: ADJ
Token: in, Head: interested, Dep: prep, POS: ADP
Token: developing, Head: in, Dep: pcomp, POS: VERB
Token: a, Head: understanding, Dep: det, POS: DET
Token: deep, Head: quantitative, Dep: amod, POS: ADJ
Token: ,, Head: quantitative, Dep: punct, POS: PUNCT
Token: quantitative, Head: understanding, Dep: amod, POS: ADJ
Token: ,, Head: quantitative, Dep: punct, POS: PUNCT
Token: and, Head: quantitative, Dep: cc, POS: CCONJ
Token: predictive, Head: understanding, Dep: amod, POS: ADJ
Token: understanding, Head: developing, Dep: dobj, POS: NOUN
Token: of, Head: understanding, Dep: prep, POS: ADP
Token: cellular, Head: processes, Dep: amod, POS: ADJ
Token: processes, Head: of, Dep: pobj, POS: NOUN
Token: based, Head: processes

# **Word Mover's Distance, Cosine Similarity, Sentiment and Tone Analysis**

In [None]:
!pip install vaderSentiment
!pip install POT

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Collecting POT
  Downloading POT-0.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Downloading POT-0.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (835 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m835.4/835.4 kB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: POT
Successfully installed POT-0.9.4


In [None]:
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import gensim.downloader as api

# Load pre-trained GloVe embeddings
word_vectors = api.load("glove-wiki-gigaword-300")

# Initialize sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to calculate Word Mover's Distance for paragraphs
def calculate_wmd(paragraph1, paragraph2):
    paragraph1_tokens = [word for word in nltk.word_tokenize(paragraph1.lower()) if word in word_vectors]
    paragraph2_tokens = [word for word in nltk.word_tokenize(paragraph2.lower()) if word in word_vectors]
    return word_vectors.wmdistance(paragraph1_tokens, paragraph2_tokens)

# Function to calculate average word vectors for cosine similarity for paragraphs
def average_word_vectors(paragraph, model, num_features):
    words = nltk.word_tokenize(paragraph.lower())
    feature_vec = np.zeros((num_features,), dtype="float32")
    n_words = 0
    for word in words:
        if word in model:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

# Function to analyze sentiment for paragraphs
def analyze_sentiment(paragraph):
    return analyzer.polarity_scores(paragraph)



KeyboardInterrupt: 

In [None]:
# Iterate through the matches
counter=0
for test_name in matches:
    test_name_copy = test_name
    test_name = test_name.split(" ")
    test_first_name, test_last_name = test_name[0], test_name[-1]
    test_path = candidate_path+'/'+file_names_dict[(test_first_name, test_last_name)]
    with open(test_path, 'r', encoding='utf-8') as file:
        test_contents = file.read()

    # Process the entire paragraph
    reference_paragraph = matches[test_name_copy] # human written
    candidate_paragraph = test_contents # GPT Generated

    print(f"Processing paragraph for {test_name_copy}:")
    print(f"Reference Paragraph (Human Written Scraped Content): {reference_paragraph}")
    print(f"Candidate Paragraph (GPT Generated Content): {candidate_paragraph}")
    # Detecting Subtle Differences in Meaning (Word Mover's Distance)
    wmd_distance = calculate_wmd(reference_paragraph, candidate_paragraph)
    print(f"-->Word Mover's Distance between paragraphs: {wmd_distance}")

    # Cosine Similarity using Word Embeddings for Paragraphs
    reference_vector = average_word_vectors(reference_paragraph, word_vectors, 300)
    candidate_vector = average_word_vectors(candidate_paragraph, word_vectors, 300)
    cosine_sim = cosine_similarity([reference_vector], [candidate_vector])[0][0]
    print(f"-->Cosine Similarity for paragraphs: {cosine_sim}")

    # Sentiment and Tone Analysis for Paragraphs
    reference_sentiment = analyze_sentiment(reference_paragraph)
    candidate_sentiment = analyze_sentiment(candidate_paragraph)
    print("Sentiment Analysis for paragraphs:")
    print(f"-->Reference Paragraph Sentiment: {reference_sentiment}")
    print(f"-->Candidate Paragraph Sentiment: {candidate_sentiment}")

    # Compare sentiment differences
    sentiment_diff = compare_sentiment(reference_sentiment, candidate_sentiment)
    print(f"-->Sentiment Differences for paragraphs: {sentiment_diff}")
    print("==================================================")
    if counter == 0:
        break
