# Select candidates for annotation

Select $n=1000$ candidate pairs from each group:

- Random pairs
- Vector similarity
- Next sentence prediction

Total $n_{total}=3000$ pairs


In [1]:
import json
import pandas as pd
import numpy as np
import pickle
from smart_open import open
import os
import sys
import random
import pickle
import logging
from tqdm.notebook import tqdm
from IPython.core.display import display
from collections import defaultdict
import itertools

#sys.path.append(os.path.dirname(os.getcwd()))

from experiments.environment import get_env

# Configure root logging: timestamp, level, logger name and message at INFO level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# `get_env` is project-local (experiments.environment); presumably it detects the
# machine-specific environment configuration - TODO confirm against that module.
env = get_env()

/data/experiments/raring/semantic_storytelling/malte-candidates/environments
Environment detected: gpu_server2 (in default.yml)


In [2]:
# Number of rows sampled from the thresholded similarity / NSP tables *before* the
# document and category filters are applied; also embedded in the output file names.
n = 20_000
# Minimum value of the `similarity` column for a pair to be eligible for sampling.
similarity_threshold = 0.92
# Minimum relative margin (is_next_sentence - is_not_next_sentence) / is_next_sentence
# for an NSP pair to be eligible for sampling.
nsp_threshold = 0.95

# Directory holding the sentence metadata and the similarity / NSP pair tables.
data_dir = '/data/experiments/hensel/storytelling-candidates/data/final'
# Directory holding the per-document metadata that includes categories.
with_categories_data_dir = '/data/experiments/hensel/storytelling-candidates/data/with_categories'


In [3]:
# Per-document metadata (title and comma-separated categories); the first of the
# selected columns becomes the index.
docs_df = pd.read_csv(
    os.path.join(with_categories_data_dir, 'meta_data.docs.tsv'),
    sep='\t',
    index_col=0,
    usecols=['doc_id', 'title', 'categories'],
)
docs_df

Unnamed: 0_level_0,title,categories
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1
736,President of China lunches with Brazilian Pres...,"Politics and conflicts,South America,Asia,Braz..."
741,Palestinians to elect new president on January 9,"Palestine,Elections,Mahmoud Abbas,Yasser Arafa..."
743,Brazilian delegation returns from Arafat funeral,"Palestine,Brazil,Politics and conflicts,Middle..."
764,Hearing begins over David Hookes death,"Australia,Cricket,Crime and law,Oceania"
797,Brazilian soccer player's mother has been kidn...,"South America,Brazil,Football (soccer),Crime a..."
...,...,...
2909791,"Mohsen Fakhrizadeh, leader of Iranian nuclear ...","Iran,Asia,Middle East,Nuclear technology,Obitu..."
2909805,Former US national security advisor Michael Fl...,"United States,North America,Crime and law,Poli..."
2909818,"Wikinews interviews Sandra Jephcott, Sustainab...","Australia,Elections,Climate change,COVID-19,Qu..."
2909884,"Wikinews interviews Craig Farquharson, Liberal...","Australia,Elections,Queensland,Democracy,Polit..."


In [4]:
# Build both directions of the document <-> category mapping.
category_to_doc_ids = defaultdict(list)
doc_id_to_categories = {}

for doc_id, raw_categories in docs_df['categories'].items():
    # Only string values carry categories; anything else (e.g. a missing value) is skipped.
    if not isinstance(raw_categories, str):
        continue

    categories = raw_categories.split(',')
    doc_id_to_categories[doc_id] = set(categories)

    for category in categories:
        category_to_doc_ids[category].append(doc_id)

logger.info(f'Categories found: {len(category_to_doc_ids)}')

2021-06-11 14:44:47 - INFO - __main__ -   Categories found: 3334


In [5]:
# Keep only categories whose document count lies within [min, max] (both inclusive).
min_docs_per_category = 10
max_docs_per_category = 100

needed_categories = set()
for category, category_doc_ids in category_to_doc_ids.items():
    if min_docs_per_category <= len(category_doc_ids) <= max_docs_per_category:
        needed_categories.add(category)

logger.info(f'Categories matching criteria: {len(needed_categories)}')

2021-06-11 14:44:47 - INFO - __main__ -   Categories matching criteria: 1156


In [6]:
# Sentence-level metadata, enriched with the title/categories of the owning document
# (left join on doc_id) and indexed by sentence id.
sent_df = (
    pd.read_csv(os.path.join(data_dir, 'meta_data.tsv'), sep='\t')
    .merge(docs_df, on='doc_id', how='left')
    .set_index('sent_id')
)
sent_df

Unnamed: 0_level_0,doc_id,start,end,text,url,title,categories
sent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
736-0,736,0,37,"Hu Jintao, the President of the People's Repub...",https://en.wikinews.org/wiki?curid=736,President of China lunches with Brazilian Pres...,"Politics and conflicts,South America,Asia,Braz..."
736-1,736,38,49,Lunch was a traditional Brazilian barbecue wit...,https://en.wikinews.org/wiki?curid=736,President of China lunches with Brazilian Pres...,"Politics and conflicts,South America,Asia,Braz..."
741-1,741,0,16,Acting president Rawhi Fattuh has announced to...,https://en.wikinews.org/wiki?curid=741,Palestinians to elect new president on January 9,"Palestine,Elections,Mahmoud Abbas,Yasser Arafa..."
741-2,741,17,53,"Futtuh, head of the Palestinian parliament, wa...",https://en.wikinews.org/wiki?curid=741,Palestinians to elect new president on January 9,"Palestine,Elections,Mahmoud Abbas,Yasser Arafa..."
741-3,741,54,84,New leadership could prove to be the key to re...,https://en.wikinews.org/wiki?curid=741,Palestinians to elect new president on January 9,"Palestine,Elections,Mahmoud Abbas,Yasser Arafa..."
...,...,...,...,...,...,...,...
2909884-44,2909884,1032,1042,"Consequently there are water, road and infrast...",https://en.wikinews.org/wiki?curid=2909884,"Wikinews interviews Craig Farquharson, Liberal...","Australia,Elections,Queensland,Democracy,Polit..."
2909884-45,2909884,1043,1065,"We aim to make Groom a marginal electorate, if...",https://en.wikinews.org/wiki?curid=2909884,"Wikinews interviews Craig Farquharson, Liberal...","Australia,Elections,Queensland,Democracy,Polit..."
2909884-46,2909884,1066,1091,My approach is to listen to the community and ...,https://en.wikinews.org/wiki?curid=2909884,"Wikinews interviews Craig Farquharson, Liberal...","Australia,Elections,Queensland,Democracy,Polit..."
2909884-47,2909884,1092,1109,The principles of Liberal Democrats are the pr...,https://en.wikinews.org/wiki?curid=2909884,"Wikinews interviews Craig Farquharson, Liberal...","Australia,Elections,Queensland,Democracy,Polit..."


In [7]:
# Vector-similarity scores; the first two columns form the (pair) MultiIndex.
sim_df = pd.read_csv(
    os.path.join(data_dir, 'similarity_pairs.tsv'),
    sep='\t',
    index_col=[0, 1],
)
sim_df

Unnamed: 0_level_0,Unnamed: 1_level_0,similarity
sent1_id,sent2_id,Unnamed: 2_level_1
61049-14,45027-2,0.811978
2896804-9,7662-5,0.893290
8685-1,99779-5,0.888850
98820-1,97754-1,0.901756
33586-1,33591-3,0.941778
...,...,...
117956-4,108112-0,0.903401
96322-3,151977-10,0.908700
123997-9,21575-12,0.923508
753804-2,45727-4,0.875747


In [8]:
# Next-sentence-prediction scores; the first two columns form the (pair) MultiIndex.
nsp_df = pd.read_csv(
    os.path.join(data_dir, 'nsp_pairs.csv'),
    index_col=[0, 1],
)
nsp_df

Unnamed: 0_level_0,Unnamed: 1_level_0,is_next_sentence,is_not_next_sentence
sent1_id,sent2_id,Unnamed: 2_level_1,Unnamed: 3_level_1
736-0,736-1,2.445305e-04,1.028482e-11
736-0,741-1,1.119650e-08,1.559122e-06
736-0,741-2,1.085477e-08,1.662852e-06
736-0,741-3,1.084755e-08,1.551225e-06
736-0,741-4,1.123367e-08,1.534388e-06
...,...,...,...
741-4,120767-1,9.893848e-09,1.974904e-06
741-4,120767-2,1.027576e-08,1.819374e-06
741-4,120767-3,1.013443e-08,1.883872e-06
741-4,120767-6,9.978086e-09,1.969033e-06


In [9]:
# Random pairs (from different docs)
#
# Two independently shuffled copies of the sentence ids are walked in lockstep.
# A pair is accepted when both sentences come from different documents that share
# at least one category contained in `needed_categories`.
n_random_pairs = 1000  # target number of accepted random pairs
random_seed = 0  # fixed seed so that Restart & Run All selects the same pairs

random_pairs = []
skipped = 0

# The cheap sentence -> document lookup below requires unique sentence ids.
assert sent_df.index.is_unique, 'sent_id index must be unique'
sent_id_to_doc_id = sent_df['doc_id'].to_dict()

# Local RNG instance: reproducible without touching the global `random` state.
rng = random.Random(random_seed)
a_sent_ids = sent_df.index.tolist()
b_sent_ids = sent_df.index.tolist()
rng.shuffle(a_sent_ids)
rng.shuffle(b_sent_ids)

for a_id, b_id in zip(a_sent_ids, b_sent_ids):
    a_doc_id = sent_id_to_doc_id[a_id]
    b_doc_id = sent_id_to_doc_id[b_id]
    a_categories = doc_id_to_categories.get(a_doc_id)
    b_categories = doc_id_to_categories.get(b_doc_id)

    # Skip: same document, a document without categories, or no shared needed category.
    if (
        a_doc_id == b_doc_id
        or a_categories is None
        or b_categories is None
        or not (a_categories & b_categories & needed_categories)
    ):
        skipped += 1
        continue

    # Full rows are only materialised for accepted pairs (most candidates are rejected).
    a = dict(sent_df.loc[a_id])
    b = dict(sent_df.loc[b_id])

    pair = {f'a_{k}': v for k, v in a.items()}
    pair.update({f'b_{k}': v for k, v in b.items()})
    random_pairs.append(pair)

    if len(random_pairs) >= n_random_pairs:
        break
else:
    # All sentence ids were consumed before the target was reached.
    logger.info('done')

logger.info(f'Skipped: {skipped:,}; Pairs: {len(random_pairs):,}')

2021-06-11 14:45:28 - INFO - __main__ -   Skipped: 161,806; Pairs: 1,000


In [10]:
# Similarity pairs
#
# Sample up to `n` pairs whose similarity reaches `similarity_threshold`, then keep
# those from different documents that share at least one needed category.
sim_pairs = []
skipped = 0

sim_candidates = sim_df[sim_df.similarity >= similarity_threshold]
# Cap the sample size: DataFrame.sample raises if n exceeds the number of rows.
# A fixed random_state makes the selection reproducible across runs.
sim_sample = sim_candidates.sample(n=min(n, len(sim_candidates)), random_state=0)

for a_id, b_id in sim_sample.index:
    a = dict(sent_df.loc[a_id])
    b = dict(sent_df.loc[b_id])

    a_categories = doc_id_to_categories.get(a['doc_id'])
    b_categories = doc_id_to_categories.get(b['doc_id'])

    # Skip: same document, a document without categories, or no shared needed category.
    if (
        a['doc_id'] == b['doc_id']
        or a_categories is None
        or b_categories is None
        or not (a_categories & b_categories & needed_categories)
    ):
        skipped += 1
        continue

    pair = {f'a_{k}': v for k, v in a.items()}
    pair.update({f'b_{k}': v for k, v in b.items()})

    sim_pairs.append(pair)

logger.info(f'Skipped: {skipped:,}; Pairs: {len(sim_pairs):,}')

2021-06-11 14:45:33 - INFO - __main__ -   Skipped: 18,203; Pairs: 1,797


In [11]:
# Next sentence pair
#
# Sample up to `n` pairs whose relative NSP margin exceeds `nsp_threshold`, then keep
# those from different documents that share at least one needed category.
nsp_pairs = []
skipped = 0

# Relative margin of the "is next" score over the "is not next" score.
# NOTE(review): rows with is_next_sentence == 0 should yield NaN/-inf here and thus
# fail the comparison below - confirm that excluding them is intended.
nsp_margin = (nsp_df.is_next_sentence - nsp_df.is_not_next_sentence) / nsp_df.is_next_sentence
nsp_candidates = nsp_df[nsp_margin > nsp_threshold]
# Cap the sample size: DataFrame.sample raises if n exceeds the number of rows.
# A fixed random_state makes the selection reproducible across runs.
nsp_sample = nsp_candidates.sample(n=min(n, len(nsp_candidates)), random_state=0)

for a_id, b_id in nsp_sample.index:
    a = dict(sent_df.loc[a_id])
    b = dict(sent_df.loc[b_id])

    a_categories = doc_id_to_categories.get(a['doc_id'])
    b_categories = doc_id_to_categories.get(b['doc_id'])

    # Skip: same document, a document without categories, or no shared needed category.
    if (
        a['doc_id'] == b['doc_id']
        or a_categories is None
        or b_categories is None
        or not (a_categories & b_categories & needed_categories)
    ):
        skipped += 1
        continue

    pair = {f'a_{k}': v for k, v in a.items()}
    pair.update({f'b_{k}': v for k, v in b.items()})

    nsp_pairs.append(pair)

logger.info(f'Skipped: {skipped:,}; Pairs: {len(nsp_pairs):,}')

2021-06-11 14:45:37 - INFO - __main__ -   Skipped: 18,901; Pairs: 1,099


In [12]:
# Number of accepted pairs per group: (random, similarity, nsp).
tuple(len(group) for group in (random_pairs, sim_pairs, nsp_pairs))

(1000, 1797, 1099)

In [13]:
# write to disk
write_dir = "./output/"
# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs(write_dir, exist_ok=True)

# One CSV per candidate group; `n` (the pre-filter sample size) is part of the file name.
for group_name, group_pairs in (
    ('random', random_pairs),
    ('similarity', sim_pairs),
    ('nsp', nsp_pairs),
):
    out_path = os.path.join(write_dir, f'candidates_{group_name}.with_categories.{n}.csv')
    pd.DataFrame(group_pairs).to_csv(out_path, index=False)


#pd.DataFrame(random_pairs).to_csv(os.path.join(data_dir, f'candidates_random.{n}.csv'), index=False)
#pd.DataFrame(sim_pairs).to_csv(os.path.join(data_dir, f'candidates_similarity.{n}.csv'), index=False)
#pd.DataFrame(nsp_pairs).to_csv(os.path.join(data_dir, f'candidates_nsp.{n}.csv'), index=False)


In [14]:
# Preview the first ten random pairs (a_* / b_* hold the two sentences' fields).
random_pairs[:10]

[{'a_doc_id': 47348,
  'a_start': 92,
  'a_end': 121,
  'a_text': "The President's Emergency Plan for AIDS Relief has done a great deal of good, and President Bush and his team deserve a lot of credit for it, Gates said.",
  'a_url': 'https://en.wikinews.org/wiki?curid=47348',
  'a_title': 'AIDS Conference: Clinton and Gates defend Bush program',
  'a_categories': 'World,United States,Canada,Health,George W. Bush,AIDS,Bill Clinton,Bill Gates,Prostitution',
  'b_doc_id': 102849,
  'b_start': 1453,
  'b_end': 1473,
  'b_text': ":'Melody Thomas Scott': One would think so, but I have not yet seen another show equal to our production values.",
  'b_url': 'https://en.wikinews.org/wiki?curid=102849',
  'b_title': "Popular soap opera 'The Young and the Restless' celebrates 35 years on the air",
  'b_categories': 'North America,United States,Culture and entertainment,Media,Television,Mike Halterman (Wikinewsie),Prostitution'},
 {'a_doc_id': 259826,
  'a_start': 217,
  'a_end': 234,
  'a_text': 

In [15]:
# Preview the first ten similarity pairs (a_* / b_* hold the two sentences' fields).
sim_pairs[:10]

[{'a_doc_id': 161838,
  'a_start': 0,
  'a_end': 38,
  'a_text': 'According to witness and media reports, at least two hundred people in Nigeria have been killed after pastoralists and villagers clashed near the city of , which has been the source of repeated tension between Christians and Muslims.',
  'a_url': 'https://en.wikinews.org/wiki?curid=161838',
  'a_title': 'Clashes in Nigeria kill hundreds, troops on alert',
  'a_categories': 'Tempodivalse (WWC2010),Writing contest 2010,Disasters and accidents,Crime and law,Africa,Goodluck Jonathan,Nigeria',
  'b_doc_id': 177003,
  'b_start': 0,
  'b_end': 23,
  'b_text': 'Fresh violence has broken out in the central Nigerian city of  earlier today, killing at least five people, according to witness reports.',
  'b_url': 'https://en.wikinews.org/wiki?curid=177003',
  'b_title': 'Five dead after continuing violence in Nigeria',
  'b_categories': 'Africa,Politics and conflicts,Crime and law,Nigeria,Religion,Islam,Christianity'},
 {'a_doc_id':

In [16]:
# Preview the first ten next-sentence pairs (a_* / b_* hold the two sentences' fields).
nsp_pairs[:10]

[{'a_doc_id': 741,
  'a_start': 17,
  'a_end': 53,
  'a_text': 'Futtuh, head of the Palestinian parliament, was sworn in hours after the death of Yasser Arafat on Thursday, and Palestinian Basic Law dictates that he may only serve up to two months before elections are held.',
  'a_url': 'https://en.wikinews.org/wiki?curid=741',
  'a_title': 'Palestinians to elect new president on January 9',
  'a_categories': 'Palestine,Elections,Mahmoud Abbas,Yasser Arafat,Fatah,Middle East,Palestinian National Authority',
  'b_doc_id': 4301,
  'b_start': 113,
  'b_end': 138,
  'b_text': "The groups appear to have come to an agreement that they should continue to observe a month's calm, as agreed with Abbas in late January.",
  'b_url': 'https://en.wikinews.org/wiki?curid=4301',
  'b_title': 'Hamas dampens Palestinian-Israeli truce',
  'b_categories': 'Middle East,Israel,Palestine,Politics and conflicts,Mahmoud Abbas,Hamas,Palestinian National Authority,Sharm el-Sheikh,Palestinian Islamic Jihad'},
 {'