In [1]:
import jsonpickle
import matplotlib.pyplot as plt
import numpy as np
from ddr import DDR
from random import shuffle
import pandas as pd



In [2]:
def _load_jsonpickle(path):
    """Read the text file at `path` and return the object jsonpickle decodes from it.

    NOTE(security): jsonpickle.decode can reconstruct arbitrary Python objects,
    so it should only ever be pointed at files from a trusted source.
    """
    with open(path, 'r') as f:
        return jsonpickle.decode(f.read())


# One decoded object per search query file.
voice_dict = _load_jsonpickle('./data/pickle_240227/voice.pickle')
yes_dict = _load_jsonpickle('./data/pickle_240227/yes.pickle')
no_dict = _load_jsonpickle('./data/pickle_240227/no.pickle')
wa_dict = _load_jsonpickle('./data/pickle_240227/wa.pickle')

In [3]:
# Publications whose articles are kept when building the paragraph pool.
# Each entry is a 3-tuple: (publication title, leaning label, reach label).
# Only the title (element 0) is read by the cells visible in this notebook; it is
# compared for exact equality with an article's 'pub_title', so the strings must
# stay exactly as they appear in the data.
# NOTE(review): the second and third elements look like political leaning and
# geographic reach — confirm against whichever analysis consumes them.
target_media = (
    # ABC: assumed to have national reach. 230 articles.
    ('ABC Premium News; Sydney', 'neutral', 'national'),
    ('7.30; Sydney', 'neutral', 'national'),

    # ACT. 951 articles. The Australian is treated as having national reach.
    ('The Australian (Online); Canberra, A.C.T.', 'right', 'national'),
    ('The Canberra Times; Canberra, A.C.T.', 'left', 'ACT'),
    
    # NSW
    ('News.com.au; Sydney, N.S.W.', 'right', 'national'),
    ('Sydney Morning Herald; Sydney, N.S.W.', 'left', 'NSW'),
    ('Sun-Herald; Sydney, N.S.W.', 'left', 'NSW'),
    ('The Daily Telegraph (Online); Surrey Hills, N.S.W.', 'right', 'NSW'),

    # VIC
    # NOTE(review): 'Herald Sun' is labelled 'left' between the two Age titles —
    # confirm this is intended and not a copy-paste of the neighbouring label.
    ('The Age; Melbourne, Vic.', 'left', 'VIC'),
    ('Herald Sun; Melbourne, Vic.', 'left', 'VIC'),
    ('Sunday Age; Melbourne, Vic.', 'left', 'VIC'),

    # SA
    ('The Advertiser; Adelaide, S. Aust.', 'right', 'SA'),
    
    # QLD
    ('The Courier - Mail; Brisbane, Qld.', 'right', 'QLD'),
    ('The Cairns Post; Cairns, Qld.', 'right', 'QLD'),
    
    # NT
    ('The Northern Territory News; Darwin, N.T.', 'unclear', 'NT'),

    # TAS
    ('Advocate; Burnie, Tas.', 'unclear', 'TAS'),
    ('The Examiner; Launceston, Tas.', 'unclear', 'TAS'),
    ('The Mercury (Online); Hobart Town', 'unclear', 'TAS'),

    # WA
    ('The West Australian', 'unclear', 'WA'),
    ('WAToday.com.au', 'unclear', 'WA')
)



In [15]:
def remove_duplicate(dic):
    """Re-key `dic` by the first and third '-'-separated fields of each key.

    Every key is split on '-' and replaced by "<field 0>-<field 2>"; all other
    fields are ignored. When several keys collapse onto the same new key, only
    the value of the first one encountered (in iteration order) is kept.

    Args:
        dic: Mapping whose keys are strings with at least three
            '-'-separated fields.

    Returns:
        A new dict mapping each collapsed key to the first value seen for it.

    Raises:
        IndexError: If a key has fewer than three '-'-separated fields.
    """
    deduplicated = {}
    for original_key, value in dic.items():
        fields = original_key.split('-')
        # setdefault only stores the value when the collapsed key is new.
        deduplicated.setdefault('-'.join((fields[0], fields[2])), value)
    return deduplicated
    

# De-duplicated copy of each loaded dictionary; the originals are left untouched.
voice_dict2, yes_dict2, no_dict2, wa_dict2 = (
    remove_duplicate(loaded)
    for loaded in (voice_dict, yes_dict, no_dict, wa_dict)
)

In [17]:
# Number of duplicates in `test_dict`: how many keys collapse onto an
# already-seen "<field 0>-<field 2>" key once the middle field is dropped.
test_dict = voice_dict
collapsed_keys = {
    fields[0] + '-' + fields[2]
    for fields in (key.split('-') for key in test_dict)
}
len(test_dict) - len(collapsed_keys)

394

In [20]:
# True when voice_dict2 and wa_dict2 have no key in common.
# Comparing len(a) + len(b) with the length of the concatenated key lists is
# always True, whatever the keys are, so the key sets themselves are compared.
voice_dict2.keys().isdisjoint(wa_dict2.keys())

True

In [24]:
# Merge the de-duplicated dictionaries into one. When the same key occurs in
# several of them, the entry from the dictionary listed last wins.
dict_collection = [voice_dict2, yes_dict2, no_dict2, wa_dict2]
final_dict = {}
for source_dict in dict_collection:
    final_dict.update(source_dict)

key_list = list(final_dict)
media_set = {entry[0] for entry in target_media}

# Collect the paragraphs (newline-separated chunks) of every article published
# by one of the target media, keeping only those of 20 to 60 whitespace-separated words.
paragraph_collection = []
for key in key_list:
    article = final_dict[key]
    if article['pub_title'] not in media_set:
        continue
    for paragraph in article['text'].split("\n"):
        if 20 <= len(paragraph.split()) <= 60:
            paragraph_collection.append(paragraph)
len(paragraph_collection)

33313

In [26]:
# Score every collected paragraph with DDR, one result per paragraph in the
# same order as `paragraph_collection`. Takes < 3 mins.
ddr = DDR()
mf_ddr_result = [ddr.compute_similarity(paragraph) for paragraph in paragraph_collection]

Abnormal tokens:  ['.', '.', '.', '-0.1573', '-0.29517', '0.30453', '-0.54773', '0.098293', '-0.1776', '0.21662', '0.19261', '-0.21101', '0.53788', '-0.047755', '0.40675', '0.023592', '-0.32814', '0.046858', '0.19367', '0.25565', '-0.021019', '-0.15957', '-0.1023', '0.20303', '-0.043333', '0.11618', '-0.18486', '0.0011948', '-0.052301', '0.34587', '0.052335', '0.16774', '-0.21384', '0.055947', '0.24934', '-0.12179', '0.16749', '0.28922', '-0.033739', '0.3015', '-0.13241', '0.092635', '0.37155', '-0.2884', '-0.0052731', '-0.001005', '-0.51153', '-0.28476', '-0.20139', '0.11837']
Abnormal tokens:  ['at', 'name@domain.com', '0.0061218', '0.39595', '-0.22079', '0.78149', '0.38759', '0.28888', '0.18495', '-0.37328', '-0.60018', '0.19625', '0.42975', '0.17942', '0.06375', '-0.44127', '0.72035', '0.50539', '0.17985', '-0.71305', '0.11122', '0.19733', '0.063884', '0.023288', '0.017074', '0.04756', '-0.083167', '0.14506', '-0.21856', '-0.07979', '-0.058909', '-0.79864', '0.65868', '-0.45031', '

In [27]:
# Score names reported for a paragraph, read off the first result.
mf_list = [name for name in mf_ddr_result[0].keys()]
mf_list

['care.virtue',
 'care.vice',
 'fairness.virtue',
 'fairness.vice',
 'loyalty.virtue',
 'loyalty.vice',
 'authority.virtue',
 'authority.vice',
 'sanctity.virtue',
 'sanctity.vice']

In [28]:
# For every foundation, rank the paragraphs by their DDR similarity score and
# record which paragraphs score within one standard deviation of the mean.
mf_ranks = dict()
for mf in mf_list:
    # Scores of all paragraphs for this foundation, in paragraph order.
    scores = [result[mf] for result in mf_ddr_result]

    ascend_rank = np.argsort(scores)  # paragraph indices, lowest score first
    mean = np.mean(scores)
    stdev = np.std(scores)

    # Sampling pool: indices whose score lies in [mean - stdev, mean + stdev].
    lower_bound = mean - stdev
    upper_bound = mean + stdev
    sample_idx_pool = [
        i for i, score in enumerate(scores)
        if lower_bound <= score <= upper_bound
    ]

    mf_ranks[mf] = {
        'ascend_rank': ascend_rank,
        'mean': mean,
        'stdev': stdev,
        'sample_idx_pool': sample_idx_pool,
    }


In [29]:
# Choose the paragraphs to annotate for every foundation: the `top_n`
# highest-scoring paragraphs plus up to `sampling_num` paragraphs drawn at
# random from that foundation's sampling pool.
top_n = 50
sampling_num = 50
sampling_seed = 0  # fixed so that re-running the notebook draws the same sample

rng = np.random.default_rng(sampling_seed)

annotation_target_idx_collection = dict()
for mf in mf_list:
    # 'ascend_rank' is ordered by ascending score, so its tail holds the top scorers.
    # top_n == 0 is handled explicitly because a bare [-0:] slice returns everything.
    ranked = mf_ranks[mf]['ascend_rank']
    top_indices = [int(idx) for idx in ranked[-top_n:]] if top_n > 0 else []

    # Draw from a filtered copy of the pool: the stored pool is left untouched
    # (so this cell can be re-run safely) and a paragraph already picked as a
    # top scorer cannot be picked a second time.
    already_picked = set(top_indices)
    candidates = [idx for idx in mf_ranks[mf]['sample_idx_pool'] if idx not in already_picked]
    sample_indices = [int(idx) for idx in rng.permutation(candidates)[:sampling_num]]

    # Mix top scorers and random samples so their origin is not visible from the order.
    combined = top_indices + sample_indices
    output_indices = [combined[pos] for pos in rng.permutation(len(combined))]
    assert len(set(output_indices)) == len(output_indices), \
        'duplicate paragraph indices selected for ' + str(mf)

    annotation_target_idx_collection[mf] = output_indices

In [30]:
# Pool the paragraphs selected for all foundations, mix them across
# foundations, and write them to an Excel file.
# NOTE(review): a paragraph selected for several foundations is written once
# per selection — confirm whether repeated rows should be dropped.
shuffle_seed = 0  # fixed so that re-running the notebook writes the same row order

annotation_units = [
    paragraph_collection[idx]
    for idx_list in annotation_target_idx_collection.values()
    for idx in idx_list
]
# Permute positions instead of the strings so the texts stay plain str objects.
row_order = np.random.default_rng(shuffle_seed).permutation(len(annotation_units))
annotation_collection = {'Annotation unit': [annotation_units[pos] for pos in row_order]}

df = pd.DataFrame(annotation_collection)
save_dir = './data/annotation.xlsx'

with pd.ExcelWriter(save_dir) as writer:
    df.to_excel(writer, sheet_name='Annotation_v1')