#### This notebook preprocesses GoodNewsEveryone dataset to fit into our training pipeline

In [112]:
import json
import re
import csv
import pandas as pd
import numpy as np

In [101]:
with open('gne-release-v1.0.jsonl', 'r') as jfile:
    json_list = list(jfile)

data = []
for json_str in json_list:
    data.append(json.loads(json_str))

data_cleaned = []  # data point with gold cause annotation
for d in data:
    if len(d['annotations']['cause']['gold'][0]) != 0:
        data_cleaned.append(d)

In [96]:
print(data[587]['annotations']['cause']['gold'][0][0])  # cause
print(data[587]['headline'])  # document
print(data[587]['annotations']['dominant_emotion']['gold'])  # emotion

full-term abortion clinic injures woman during botched abortion – report0
Full-Term Abortion Clinic Injures Woman During Botched Abortion – Report
anger


In [60]:
# number of datapoint with cause annotation
len(data_cleaned)

4799

In [62]:
def getIndex(lst1, lst2):
    return next(((i, i + (len(lst2) if len(lst2) > 1 else 0)) for i in range(len(lst1) - len(lst2) + 1) if lst1[i:i + len(lst2)] == lst2), None)

In [110]:
with open('./data/full_data.tsv', 'wt') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['document', 'token_label', 'emotion-label'])
    for d in data_cleaned:
        document = d['headline'].lower()
        document = document.replace('and#039;', ' ')
        document = re.sub(r'[^\w\s]', '', document)
        # print(document)
        cause = d['annotations']['cause']['gold'][0][0].lower()
        cause = cause.replace('and#039', ' ')
        cause = re.sub(r'[^\w\s]', '', cause)
        # print(cause)
        # print('*' * 50)
        emotion = d['annotations']['dominant_emotion']['gold']
        token_label = []
        try:
            start_idx, end_idx = getIndex(document.split(), cause.split())  
            for idx, word in enumerate(document.split()):
                if idx == start_idx:
                    token_label.append('B-CAU')
                elif idx > start_idx and idx <= end_idx - 1:
                    token_label.append('I-CAU')
                else:
                    token_label.append('O')
            assert len(document.split()) == len(token_label)
            tsv_writer.writerow([document, ' '.join(token_label), emotion])
        except:
            print(document)
            print(cause)
            print(len(document.split()))
            print(len(token_label))
            print('*' * 50)

fullterm abortion clinic injures woman during botched abortion  report
fullterm abortion clinic injures woman during botched abortion  report0
9
0
**************************************************
trumps madefortv moment in north korea
tv moment in north korea
6
0
**************************************************
north korea is waiting for trump to blinkor leave office
north korea is waiting for trump to blink or leave office
10
0
**************************************************
biden is toastand so is the party of open borders
and so is the party of open borders
10
0
**************************************************
the us team just won the womens world cupand boosted its case for equal pay
and boosted its case for equal pay
15
0
**************************************************
weve seen the debatesand what could be our future
and what could be our future
9
0
**************************************************
10year yield at 2473 2year yield at 2319 30year yield at 2899
year yi

In [92]:
document = "Teen Shoe Store Employee Goes and#039;Above And Beyondand#039; For Girl With Autism".lower()
cause = "above and beyondand#039"
document = document.replace('and#039;', ' ')
cause = cause.replace('and#039', '')

In [93]:
document = re.sub(r'[^\w\s]', '', document)
cause = re.sub(r'[^\w\s]', '', cause)

In [94]:
print(document)
print(cause)

teen shoe store employee goes  above and beyond  for girl with autism
above and beyond


In [91]:
getIndex(document.split(), cause.split())

(5, 8)

##### Split full_data.csv into train/dev/test

In [128]:
full_data = pd.read_csv('./data/full_data.tsv', delimiter='\t', index_col=False)

In [129]:
train, val, test = np.split(full_data.sample(frac=1, random_state=42), [int(.8 * len(full_data)), int(.9 * len(full_data))])

In [132]:
train.to_csv('./data/train.tsv', index=False, sep="\t")

In [133]:
val.to_csv('./data/val.tsv', index=False, sep="\t")

In [134]:
test.to_csv('./data/test.tsv', index=False, sep="\t")