In [1]:
from wikipedia2vec import Wikipedia2Vec
import time
from sentence_transformers import SentenceTransformer

from helper_functions.cosine_similarity import cosine_similarity
from helper_functions.obtain_articles import get_similar_articles, get_headings
from settings import *

In [2]:
# Load both models once at notebook start; the functions defined in later cells
# read `wiki2vec` and `sentence_transformer` as module-level globals.
print("Starting Load")
# NOTE(review): `wikipedia_model_file` and `sentence_transformers_model` are not
# defined in this notebook — presumably they come from `from settings import *`.
wiki2vec = Wikipedia2Vec.load(wikipedia_model_file)
sentence_transformer = SentenceTransformer(sentence_transformers_model)
print("Loaded models")

Starting Load
Loaded models


In [3]:
def get_sectioned_headers(article_name):
    """Gather the headings of every article reported as similar to ``article_name``.

    Returns a list with one ``[headings, score]`` entry per item yielded by
    ``get_similar_articles(article_name, wiki2vec)``: ``headings`` is the result
    of ``get_headings`` on the item's first element and ``score`` is the item's
    second element (presumably the wiki2vec similarity — confirm in the helper).
    """
    print("Getting section headers (Using wiki2vec similarity)...")
    sectioned_headers = []
    for similar_article in get_similar_articles(article_name, wiki2vec):
        headings = get_headings(similar_article[0])
        sectioned_headers.append([headings, similar_article[1]])
    return sectioned_headers

In [4]:
def get_header_set(sectioned_headers):
    """Accumulate a score for every distinct header across the given articles.

    Args:
        sectioned_headers: iterable of ``[headers, score]`` entries, where
            ``headers`` is an iterable of header names and ``score`` is the
            number added to each of those headers.

    Returns:
        A list of mutable ``[header, total_score]`` lists in first-seen order,
        e.g. ``[["symptoms", 14.3234], ["history", 11.4321]]``.
    """
    header_scores = {}
    for article in sectioned_headers:
        # Every header of the article receives that article's score.
        for header in article[0]:
            # Direct dict membership is a hash lookup; building a list of the
            # keys for each header made this loop accidentally quadratic.
            if header in header_scores:
                header_scores[header] += article[1]
            else:
                header_scores[header] = article[1]

    return [[header, score] for header, score in header_scores.items()]

In [5]:
def similarity_pair_combine(header_set, header_requirement):
    """Merge similar headers of ``header_set`` in place until few enough remain.

    Every unordered pair of ``[header, score]`` entries is scored with
    ``cosine_similarity`` on the header names. Pairs whose similarity exceeds
    ``cosine_similarity_requirement`` are processed from most to least similar:
    the entry with the strictly larger score absorbs the other's score and the
    other entry is removed (on a tie the second entry of the pair is kept).
    Merging stops as soon as ``header_set`` holds ``header_requirement`` entries
    or fewer. Nothing is returned; ``header_set`` itself is modified.
    """
    # [(["symptoms", 14.3234], ["signs and symptoms", 15.3221], 0.994), etc.]
    scored_pairs = []
    for position, first in enumerate(header_set):
        for second in header_set[position + 1:]:
            similarity = cosine_similarity(first[0], second[0], sentence_transformer)
            scored_pairs.append((first, second, similarity))

    # Most similar pairs first, then drop the ones below the threshold.
    scored_pairs.sort(key=lambda scored: scored[2], reverse=True)
    merge_candidates = [scored for scored in scored_pairs if scored[2] > cosine_similarity_requirement]

    for first, second, _ in merge_candidates:
        if len(header_set) <= header_requirement:
            break

        # Skip pairs where one side was already merged away.
        if first not in header_set or second not in header_set:
            continue

        if first[1] > second[1]:
            kept, absorbed = first, second
        else:
            kept, absorbed = second, first

        # Fold the smaller score into the bigger one and drop the smaller entry.
        header_set[header_set.index(kept)][1] += header_set[header_set.index(absorbed)][1]
        header_set.remove(absorbed)

In [6]:
def header_cutoff(header_set, header_requirement):
    """Return the names of the highest-scored headers, best first.

    Args:
        header_set: list of ``[header, score]`` entries.
        header_requirement: maximum number of headers to keep.

    Returns:
        Header names sorted by descending score, truncated to
        ``header_requirement`` entries.
    """
    ranked = sorted(header_set, key=lambda entry: entry[1], reverse=True)
    # Truncate AFTER ranking: slicing first would keep whichever headers happen
    # to come first in the list instead of the best-scored ones.
    return [entry[0] for entry in ranked[:header_requirement]]

In [7]:
def get_precision_and_recall(article_name, generated_headings, gt_headings,
                             similarity_fn=None, similarity_threshold=None):
    """Score generated headings against the ground-truth headings.

    Each generated heading scans the ground-truth headings in order and claims
    the first still-unclaimed one it matches. Two headings match when one is a
    substring of the other or when their similarity exceeds the threshold.
    A generated heading that claims nothing counts as a false positive; every
    ground-truth heading left unclaimed counts as a false negative.

    Args:
        article_name: unused; kept so existing callers keep working.
        generated_headings: headings to evaluate, in ranked order.
        gt_headings: ground-truth headings of the article.
        similarity_fn: optional ``(heading, heading) -> score`` callable.
            Defaults to ``cosine_similarity`` with the notebook's
            ``sentence_transformer``.
        similarity_threshold: optional minimum similarity (exclusive) for a
            match. Defaults to ``cosine_similarity_requirement``.

    Returns:
        ``(precision, recall, ordered_keywords, keyword_matches)`` where
        ``ordered_keywords`` lists the matched generated headings in order and
        ``keyword_matches`` maps each of them to its ground-truth heading.
        Precision and recall are ``0.0`` when their denominator is zero.
    """
    if similarity_fn is None:
        similarity_fn = lambda first, second: cosine_similarity(first, second, sentence_transformer)
    if similarity_threshold is None:
        similarity_threshold = cosine_similarity_requirement

    false_positive = 0
    keyword_matches = {}
    ordered_keywords = []
    claimed_gt_headings = set()

    for keyword in generated_headings:
        matched_heading = None
        for gt_heading in gt_headings:
            if gt_heading in claimed_gt_headings:
                continue
            # Cheap substring test first; the model is only consulted when
            # it fails. Either condition alone is enough for a match.
            if (keyword in gt_heading or gt_heading in keyword
                    or similarity_fn(keyword, gt_heading) > similarity_threshold):
                matched_heading = gt_heading
                break

        if matched_heading is None:
            false_positive += 1
        else:
            claimed_gt_headings.add(matched_heading)
            keyword_matches[keyword] = matched_heading
            ordered_keywords.append(keyword)

    true_positive = len(ordered_keywords)
    # Ground-truth headings nobody claimed. The previous code subtracted a
    # counter that was never incremented, which capped recall at 0.5.
    false_negative = len(gt_headings) - true_positive

    evaluated = true_positive + false_positive
    relevant = true_positive + false_negative
    precision = true_positive / evaluated if evaluated else 0.0
    recall = true_positive / relevant if relevant else 0.0
    return (precision, recall, ordered_keywords, keyword_matches)

In [8]:
def ordered_headings(ordered_keywords, keyword_matches, gt_headings):
    """Measure how well the matched keywords preserve the ground-truth order.

    For every pair of keywords (earlier, later) taken in the order given by
    ``ordered_keywords``, the pair counts as correctly ordered when the
    ground-truth heading matched to the earlier keyword appears strictly before
    the one matched to the later keyword in ``gt_headings``.

    Returns the fraction of correctly ordered pairs, or ``0`` when there are
    fewer than two keywords and therefore no pairs.
    """
    keyword_pairs = [
        (earlier, later)
        for position, earlier in enumerate(ordered_keywords)
        for later in ordered_keywords[position + 1:]
    ]

    print("Keyword matches: " + str(keyword_matches))

    if not keyword_pairs:
        return 0

    def gt_position(keyword):
        # Position of the keyword's matched heading within the ground truth.
        return gt_headings.index(keyword_matches[keyword])

    correctly_ordered = sum(
        1 for earlier, later in keyword_pairs
        if gt_position(earlier) < gt_position(later)
    )
    return correctly_ordered / len(keyword_pairs)

In [9]:
def get_pr_values(article_name, sectioned_headers):
    """Evaluate generated headings for one article at increasing cut-offs.

    Builds the scored header set from ``sectioned_headers``, merges similar
    headers, ranks them, and then, for every cut-off from ``header_min_bound``
    up to the number of ground-truth headings, computes precision, recall and
    the heading-order metric of the top-ranked generated headings. The loop
    stops early when fewer headings were generated than the cut-off asks for.

    Args:
        article_name: article whose ground-truth headings are fetched with
            ``get_headings``.
        sectioned_headers: list of ``[headings, score]`` entries for the
            similar articles.

    Returns:
        ``(precisions, recalls, ordering_metrics)``: three parallel lists with
        one value per evaluated cut-off.
    """
    header_set = get_header_set(sectioned_headers)
    print(header_set)

    gt_headings = get_headings(article_name)

    # Merges in place: header_set shrinks towards len(gt_headings) entries.
    similarity_pair_combine(header_set, len(gt_headings))
    print(f"Header set: {header_set}")

    generated_headings = header_cutoff(header_set, len(gt_headings))

    # No similar articles means nothing to average; report 0 instead of
    # failing with a division by zero.
    total_headers = sum(len(article_headers[0]) for article_headers in sectioned_headers)
    average_headers = total_headers // len(sectioned_headers) if sectioned_headers else 0
    print(f"Average number of headers in similar articles: {average_headers}")

    print(f"Generated headings: {generated_headings}")

    precisions = []
    recalls = []
    ordering_metrics = []
    for header_requirement in range(header_min_bound, len(gt_headings)+1):
        if len(generated_headings) < header_requirement:
            break
        precision, recall, ordered_keywords, keyword_matches = \
        get_precision_and_recall(article_name, generated_headings[:header_requirement], gt_headings)

        precisions.append(precision)
        recalls.append(recall)

        ordering_metric = ordered_headings(ordered_keywords, keyword_matches, gt_headings)
        ordering_metrics.append(ordering_metric)

        print(f"Precision {precision}")
        print(f"Recall {recall}")
        print(f"Heading order {ordering_metric}")

    return precisions, recalls, ordering_metrics

In [10]:
# Bulk of time is here!
# Fetch the similar-article headings for every configured article first, then
# evaluate each article and collect its metric series in parallel lists.
sectioned_headers_list = list(map(get_sectioned_headers, article_names))
print(sectioned_headers_list)

precisions_list, recalls_list, ordering_metrics_list = [], [], []

for current_article, current_headers in zip(article_names, sectioned_headers_list):
    print(f"Article: {current_article}")
    article_metrics = get_pr_values(current_article, current_headers)
    article_precisions, article_recalls, article_orderings = article_metrics
    precisions_list.append(article_precisions)
    recalls_list.append(article_recalls)
    ordering_metrics_list.append(article_orderings)

Getting section headers (Using wiki2vec similarity)...
Similar articles: [['Measles', 0.53669715], ['Orthomyxoviridae', 0.5243176], ['Influenzavirus_C', 0.51077414], ['1918_flu_pandemic', 0.50997454], ['Swine_influenza', 0.50933975], ['Influenza_A_virus_subtype_H5N1', 0.5092514], ['Avian_influenza', 0.50008357]]
Getting section headers (Using wiki2vec similarity)...
Similar articles: [['Panama_City', 0.6307721], ['List_of_cities_in_Panama', 0.62381727], ['Olá_District', 0.61495125], ['Coclé_Province', 0.61470526], ['Olá', 0.60882664], ['Chiriquí_Province', 0.60825735], ['Protected_areas_of_Panama', 0.60351366]]
Getting section headers (Using wiki2vec similarity)...
Similar articles: [['Exact_algorithm', 0.6048363], ['Hybrid_algorithm', 0.59737164], ['Time_complexity#Strongly_and_weakly_polynomial_time', 0.5917734], ['Sequential_algorithm', 0.58928], ['Las_Vegas_algorithm', 0.5849046], ['Output-sensitive_algorithm', 0.5847205], ['Time_complexity#Table_of_common_time_complexities', 0.582

Header set: [['pathophysiology', 0.53669715], ['epidemiology', 1.0459485], ['history', 6.691625], ['society and culture', 4.673109], ['genome', 0.5243176], ['epidemiology and pathology', 0.50997454], ['mortality', 0.50997454], ['sex differences in mortality', 2.5659814], ['virology', 0.50933975], ['genetics', 1.009335], ['global impact', 9.25209]]
Average number of headers in similar articles: 7
Generated headings: ['global impact', 'history', 'society and culture', 'sex differences in mortality', 'epidemiology', 'genetics', 'pathophysiology', 'genome', 'epidemiology and pathology', 'mortality', 'virology']


ValueError: too many values to unpack (expected 4)

In [None]:
# Dump the collected precision, recall and ordering-metric series, one per line.
print(precisions_list, recalls_list, ordering_metrics_list, sep="\n")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import PrecisionRecallDisplay

In [None]:
# Plot and save one precision-recall curve per evaluated article.
# zip() stops at the shortest list, so a partially completed evaluation run
# still plots the articles that did finish.
for article_name, article_precisions, article_recalls in zip(article_names, precisions_list, recalls_list):
    disp = PrecisionRecallDisplay(precision=article_precisions, recall=article_recalls)
    disp.plot()
    disp.ax_.set_title(f"PR Curve for {article_name}")
    # Save BEFORE plt.show(): once the figure has been shown, pyplot's current
    # figure is a fresh empty one, and saving that writes a blank image.
    disp.figure_.savefig(pr_curve_graphs + f"/{article_name}.png")
    plt.show()
    # Release the figure so a long article list does not accumulate open figures.
    plt.close(disp.figure_)