In [12]:
import jsonpickle
import matplotlib.pyplot as plt
import numpy as np
from ddr import DDR
from random import shuffle
import pandas as pd

In [15]:
# Load the article dictionary. The file is opened in text mode and decoded
# with jsonpickle, despite its ".pickle" extension.
# NOTE(review): jsonpickle.decode can rebuild arbitrary Python objects, so
# this should only ever be pointed at a trusted file.
with open('./data/all_dict.pickle', 'r') as f:
    all_dict = jsonpickle.decode(f.read())

# Collect paragraph
key_list = list(all_dict.keys())
paragraph_collection = []
for key in key_list:
    for paragraph in all_dict[key]['text'].split("\n"):
        # Keep paragraphs of 20 to 60 whitespace-separated words (inclusive).
        if 20 <= len(paragraph.split()) <= 60:
            paragraph_collection.append(paragraph)
print('Total paragraph number', len(paragraph_collection))

Total paragraph number 33849


### Sample 10 articles to evaluate annotation candidates

In [18]:
# Sample 10 articles: one pinned article plus 9 picked at random.
# NOTE(review): the shuffle is not seeded, so the random picks differ between runs.
with open('./data/all_dict.pickle', 'r') as f:
    all_dict = jsonpickle.decode(f.read())
pinned_key = 'IssoabdaotVrTtsopsoiPotIVftfttbnmr-20230914'
key_list = list(all_dict.keys())
shuffle(key_list)
# The pinned key is still part of the shuffled list, so skip it when taking the
# random picks; otherwise it could be drawn a second time and the sample would
# contain a duplicate article.
sample_keys = [pinned_key] + [key for key in key_list if key != pinned_key][:9]

# For each article, split the text on newlines and keep one stripped
# paragraph per row (blank lines become empty rows).
list_df = []
for key in sample_keys:
    text = all_dict[key]['text']
    paragraphs = [p.strip() for p in text.split("\n")]
    df = pd.DataFrame({'paragraphs': paragraphs})
    list_df.append(df)

# Write one sheet per sampled article, named article_0 ... article_9 in the
# order of sample_keys. The output path is kept exactly as it was (spelling
# included) so anything that already refers to it keeps working.
with pd.ExcelWriter('./data/annotaion_sample.xlsx') as writer:
    for i, df in enumerate(list_df):
        df.to_excel(writer, sheet_name=f'article_{i}')

In [None]:
"""Output to excel for manual labelling"""
# with pd.ExcelWriter(save_dir) as writer:
#     df.to_excel(writer, sheet_name='Annotation_v1')

"""Use MFD2 counts as benchmark"""
"""Use DDR"""

### Make annotation spreadsheet

In [2]:
# Reload the article dictionary (text file decoded with jsonpickle).
with open('./data/all_dict.pickle', 'r') as f:
    all_dict = jsonpickle.decode(f.read())

# Collect paragraph
key_list = list(all_dict.keys())
paragraph_collection = []
for key in key_list:
    for paragraph in all_dict[key]['text'].split("\n"):
        # Keep paragraphs of 20 to 60 whitespace-separated words (inclusive).
        if 20 <= len(paragraph.split()) <= 60:
            paragraph_collection.append(paragraph)

# Compute cosine similarity based on DDR
# Takes < 3 mins
ddr = DDR()
# One result per paragraph, in the same order as paragraph_collection.
mf_ddr_result = [ddr.compute_similarity(paragraph) for paragraph in paragraph_collection]

In [28]:
# Rank sentences based on similarity of DDR
# For every key of the first DDR result, store the ascending rank of all
# paragraphs, the mean and standard deviation of the scores, and the pool of
# paragraph indices to sample from.
mf_list = list(mf_ddr_result[0].keys())
mf_ranks = dict()
for mf in mf_list:
    # Gather this key's score for every paragraph once, rather than
    # rebuilding the same list for each statistic.
    scores = np.asarray([mf_ddr[mf] for mf_ddr in mf_ddr_result])
    mean = np.mean(scores)
    stdev = np.std(scores)

    # Obtaining the sampling pool: paragraphs whose score lies within one
    # standard deviation of the mean (both bounds inclusive).
    in_band = (scores >= mean - stdev) & (scores <= mean + stdev)

    # Save information
    mf_ranks[mf] = {
        'ascend_rank': np.argsort(scores),
        'mean': mean,
        'stdev': stdev,
        # Plain Python list of ints so it can be shuffled in place later.
        'sample_idx_pool': np.flatnonzero(in_band).tolist(),
    }


In [29]:
# Sampling sentences for annotation.
# Per foundation: the top_n highest-scoring paragraphs plus sampling_num
# paragraphs drawn at random from the sampling pool, in shuffled order.
top_n = 50
sampling_num = 50

annotation_target_idx_collection = dict()
for mf in mf_list:
    # The rank is ascending, so the highest scores sit at the tail.
    top_indices = [int(idx) for idx in mf_ranks[mf]['ascend_rank'][-top_n:]]
    top_index_set = set(top_indices)

    # Shuffle the pool in place, then take the first sampling_num entries
    # that are not already among the top-ranked paragraphs, so that no
    # paragraph is selected twice for the same foundation.
    shuffle(mf_ranks[mf]['sample_idx_pool'])
    sample_indices = [
        idx for idx in mf_ranks[mf]['sample_idx_pool'] if idx not in top_index_set
    ][:sampling_num]

    output_indices = top_indices + sample_indices
    shuffle(output_indices)
    assert len(set(output_indices)) == len(output_indices), (
        f'duplicate paragraph indices selected for {mf}'
    )

    annotation_target_idx_collection[mf] = output_indices

In [30]:
# Mix annotation sentences across foundations, and output to an excel file.
# NOTE(review): an index chosen for more than one foundation will appear more
# than once in the sheet -- confirm whether that is intended.

# Flatten the per-foundation index lists into the paragraphs they point to.
annotation_units = [
    paragraph_collection[idx]
    for idx_list in annotation_target_idx_collection.values()
    for idx in idx_list
]
# Shuffle so the foundations are interleaved in the sheet.
shuffle(annotation_units)
annotation_collection = {'Annotation unit': annotation_units}
df = pd.DataFrame(annotation_collection)

save_dir = './data/annotation.xlsx'
with pd.ExcelWriter(save_dir) as writer:
    df.to_excel(writer, sheet_name='Annotation_v1')