## In this notebook, we'll create tab-separated files of our training and dev datasets as well as generate negative spans from the news articles. 

## In EDA.ipynb we noted the training data only contains instances of positive spans. However, the rest of the news articles can be treated as negative spans (i.e. segments without propaganda). Since we want to train our classifiers on both positive and negative samples we can assign the negative spans with a label of "No_Propaganda".

In [1]:
import os
import pandas as pd
import spacy

# <span style="color:#FF8800">  Generate training data

In [2]:
PATH = "datasets/train-articles/"

In [3]:
filenames = []
for root, dirs, files in os.walk(PATH, topdown = True):
    filenames = files

In [4]:
nlp = spacy.load("en_core_web_sm", disable = ["ner"]) # don't need NER, can disable

In [5]:
texts = [] # create a single string for each doc & add to list
text_num = [] # capture the number/order in which each doc was processed
counter = 0
for name in filenames:
    text_num.append((counter,name))
    counter += 1
    with open(PATH+name) as f:
        text = f.read()
        texts.append(text)

In [6]:
counter

371

In [7]:
from spacy.tokens import Doc, Token, Span
Span.set_extension("TRAIN_LABEL", default = "No_Label")

In [8]:
docs_as_docs = [] # create list where elements are separate docs
for doc in nlp.pipe(texts):
    docs_as_docs.append(doc)

In [9]:
file_num_name_df = pd.DataFrame(text_num, columns = ["element_in_list", "filename"])

In [10]:
import re

In [11]:
file_num_name_df["filenumber"] = file_num_name_df.apply(lambda x: int(re.findall("\d+", x["filename"])[0]), axis = 1)
file_num_name_df = file_num_name_df[["filenumber","filename","element_in_list"]]

In [12]:
TRAIN_LABELS = "/home/bryan/Documents/Code/si630/semeval/datasets/train-task2-TC.labels"

In [13]:
all_spans_df = pd.read_csv(TRAIN_LABELS, sep = "\t", header = None, names = ["filenumber","label","span_start","span_end"])
all_spans_df = all_spans_df.merge(file_num_name_df, on = "filenumber")

In [14]:
all_spans_df.shape

(6129, 6)

> ### <span style="color:#8800ff"> There are +6,100 positive labeled spans in the training data.

In [15]:
for row in all_spans_df.iterrows():
    try:
        row_temp = row[1]
        docs_as_docs[row_temp["element_in_list"]].char_span(row_temp["span_start"], row_temp["span_end"])._.TRAIN_LABEL = row_temp["label"]
    except Exception as e:
        pass

In [16]:
from intervaltree import Interval, IntervalTree

In [17]:
neg_spans_list = [] # create a list of nested tuples
errors = []
CONTEXT_WINDOW = 75

for entry in all_spans_df["element_in_list"].unique(): # iterate through unique entries in element_in_list
    sub_df = all_spans_df[all_spans_df["element_in_list"] == entry] # this is subset of main df containing only entries for the current doc
    these_intervals = [(x[1]["span_start"], x[1]["span_end"]) for x in sub_df.iterrows()] # get intervals
    this_tree = IntervalTree.from_tuples(these_intervals) # initialize tree with intervals
    
    this_doc = docs_as_docs[entry]
    START_POINT = CONTEXT_WINDOW # char position we can start from
    END_POINT = len(this_doc)-(CONTEXT_WINDOW) # char position we can work up until
    for token in range(START_POINT, END_POINT): # for the range of possible center positions of the CONTEXT_WINDOW
        if this_tree.overlap(token, token+CONTEXT_WINDOW):
            continue
        else:
            try:
                docs_as_docs[entry].char_span(token, token+CONTEXT_WINDOW)._.TRAIN_LABEL = "No_Propaganda"
                neg_spans_list.append((entry, (token,token+CONTEXT_WINDOW)))
            except Exception as e:
                errors.append(entry)

In [18]:
full_list = [(docs_as_docs[i[0]].char_span(i[1][0],i[1][1]).text,"No_Propaganda") for i in neg_spans_list]

In [19]:
for i in all_spans_df.iterrows():
    try:
        full_list.append((docs_as_docs[i[1]["element_in_list"]].char_span(i[1]["span_start"],i[1]["span_end"]).text, i[1]["label"]))
    except Exception as e:
        pass

In [20]:
train_df = pd.DataFrame.from_records(full_list, columns = ['text','label'])
train_df["label_binary"] = train_df.apply(lambda x: "No_Propaganda" if x["label"] == "No_Propaganda" else "Propaganda", axis = 1)

In [21]:
train_df.sample(3)

Unnamed: 0,text,label,label_binary
1167,nations grapple with the outbreak of the bubon...,No_Propaganda,No_Propaganda
13237,"Of course they did, and of course they still do",Loaded_Language,Propaganda
882,"now, we have all heard of Jesus Campos.\nHe’s ...",No_Propaganda,No_Propaganda


In [22]:
train_df["rand_letter"] = "s" # apparently this is needed/expected by BERT
train_data = train_df[["label","label_binary","rand_letter","text"]]
train_data.index = train_data.index.rename("id")
train_data = train_data.sample(frac = 1).reset_index(drop = True) #randomly sorts the rows

In [23]:
train_data.shape

(15928, 4)

> ### <span style="color:#8800ff"> We now have +15,900 labeled spans in our training data. An increase of 160%.

In [23]:
train_data.to_csv("datasets/train_data.tsv", sep = "\t", header = True)

# <span style="color:#FF8800">  Generate dev data
> ### This is essentially the same code as above. I'm not adhering to DRY standards because I only need to run this code twice for this project and, overall, this setup (datasets, labels, etc.) won't generalize well to other projects.

In [24]:
PATH = "datasets/dev-articles/"

In [25]:
filenames = []
for root, dirs, files in os.walk(PATH, topdown=True):
    filenames = files

In [26]:
texts = [] #create a single string for each doc & add to list
text_to_num = []
counter = 0
for name in filenames:
    text_to_num.append((counter,name))
    counter += 1
    with open(PATH+name) as f:
        text = f.read()
        texts.append(text)

In [27]:
counter

75

In [27]:
from spacy.tokens import Doc, Token, Span
Span.set_extension("DEV_LABEL", default = "No_Label")

In [28]:
docs_as_docs = []
for doc in nlp.pipe(texts):
    docs_as_docs.append(doc)

In [29]:
file_num_name_df = pd.DataFrame(text_to_num, columns = ["element_in_list", "filename"])

In [30]:
file_num_name_df["filenumber"] = file_num_name_df.apply(lambda x: int(re.findall("\d+", x["filename"])[0]), axis = 1)
file_num_name_df = file_num_name_df[["filenumber","filename","element_in_list"]]

In [31]:
DEV_LABELS = "datasets/dev-task-TC.labels"

In [32]:
all_spans_df = pd.read_csv(DEV_LABELS, sep = "\t", header = None, names = ["filenumber","label","span_start","span_end"])

In [33]:
all_spans_df = all_spans_df.merge(file_num_name_df, on = "filenumber")

In [34]:
all_spans_df.shape

(1063, 6)

> ### <span style="color:#8800ff"> There are +1,000 positive labeled spans in the dev data.

In [35]:
for row in all_spans_df.iterrows():
    try:
        row_temp = row[1]
        docs_as_docs[row_temp["element_in_list"]].char_span(row_temp["span_start"], row_temp["span_end"])._.DEV_LABEL = row_temp["label"]
    except Exception as e:
        pass

In [36]:
neg_spans_list = [] # create a list of nested tuples
errors = []
CONTEXT_WINDOW = 75

for entry in all_spans_df["element_in_list"].unique(): # iterate through unique entries in element_in_list
    sub_df = all_spans_df[all_spans_df["element_in_list"] == entry] # this is subset of main df containing only entries for the current doc
    these_intervals = [(x[1]["span_start"], x[1]["span_end"]) for x in sub_df.iterrows()] # get intervals
    this_tree = IntervalTree.from_tuples(these_intervals) # initialize tree with intervals
    
    this_doc = docs_as_docs[entry]
    START_POINT = CONTEXT_WINDOW # char position we can start from
    END_POINT = len(this_doc)-(CONTEXT_WINDOW) # char position we can work up until
    for token in range(START_POINT, END_POINT): # for the range of possible center positions of the CONTEXT_WINDOW
        if this_tree.overlap(token, token+CONTEXT_WINDOW):
            continue
        else:
            try:
                docs_as_docs[entry].char_span(token, token+CONTEXT_WINDOW)._.TRAIN_LABEL = "No_Propaganda"
                neg_spans_list.append((entry, (token,token+CONTEXT_WINDOW)))
            except Exception as e:
                errors.append(entry)

In [37]:
full_list = [(docs_as_docs[i[0]].char_span(i[1][0],i[1][1]).text,"No_Propaganda") for i in neg_spans_list]

In [38]:
for i in all_spans_df.iterrows():
    try:
        full_list.append((docs_as_docs[i[1]["element_in_list"]].char_span(i[1]["span_start"],i[1]["span_end"]).text, i[1]["label"]))
    except Exception as e:
        pass

In [39]:
dev_df = pd.DataFrame.from_records(full_list, columns = ['text','label'])

In [40]:
dev_df["label_binary"] = dev_df.apply(lambda x: "No_Propaganda" if x["label"] == "No_Propaganda" else "Propaganda", axis = 1)

In [41]:
dev_df.sample(3)

Unnamed: 0,text,label,label_binary
1397,report explaining the prosecution or declinati...,No_Propaganda,No_Propaganda
172,Agency.\nPaul told CNN’s “State of the Union” ...,No_Propaganda,No_Propaganda
1040,does all of this while he stands accused of ab...,No_Propaganda,No_Propaganda


In [42]:
dev_df["rand_letter"] = "s"
dev_data = dev_df[["label","label_binary","rand_letter","text"]]
dev_data.index = dev_data.index.rename("id")
dev_data = dev_data.sample(frac = 1).reset_index(drop = True) # randomly sorts the rows

In [43]:
dev_data.shape

(2850, 4)

> ### <span style="color:#8800ff"> We now have +2,800 labeled spans in our dev data. An increase of 168%.

In [44]:
dev_data.to_csv("datasets/dev_data.tsv", sep = "\t", header = True)

# <span style="color:#FF8800"> END