# To generate train & dev data sets for model evaluation


### To format Semeval data in BERT-friendly format
Three TSV (tab-separated value) files
1. train ~70% of data
1. dev ~10% of data
1. test ~ 20% of data

Featuring columns of ID, class label, throw-away letter, text for classification for #s 1 & 2. # 3 to only feature ID and text for classification (?)

In [1]:
import os
import pandas as pd
import spacy

## Generate train data

In [2]:
PATH = "/home/bryan/Documents/Code/si630/semeval/datasets/train-articles/"

In [3]:
filenames = []
for root, dirs, files in os.walk(PATH, topdown=True):
    filenames = files

In [4]:
nlp = spacy.load("en_core_web_sm", disable=["ner"]) #can also disable components at load, nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])

In [5]:
texts = [] #create a single string for each doc & add to list
text_to_num = []
counter = 0
for name in filenames:
    text_to_num.append((counter,name))
    counter += 1
    with open(PATH+name) as f:
        text = f.read()
        texts.append(text)

In [6]:
from spacy.tokens import Doc, Token, Span
Span.set_extension("TRAIN_LABEL", default = "No_Label")

In [7]:
%%time

docs_as_docs = []
for doc in nlp.pipe(texts):
    docs_as_docs.append(doc)

CPU times: user 22.1 s, sys: 6.69 s, total: 28.8 s
Wall time: 28.8 s


In [8]:
#create list of tuples of file name/number to element num & then convert to df
MAP_NUM_TO_NAME_df = pd.DataFrame(text_to_num, columns = ["element_in_list", "filename"])

In [9]:
MAP_NUM_TO_NAME_df.sample(3)

Unnamed: 0,element_in_list,filename
121,121,article761334950.txt
17,17,article700662577.txt
258,258,article765385479.txt


In [10]:
import re

In [11]:
MAP_NUM_TO_NAME_df["filenumber"] = MAP_NUM_TO_NAME_df.apply(lambda x: int(re.findall("\d+", x["filename"])[0]), axis=1)
MAP_NUM_TO_NAME_df = MAP_NUM_TO_NAME_df[["filenumber","filename","element_in_list"]]

In [12]:
TRAIN_LABELS = "/home/bryan/Documents/Code/si630/semeval/datasets/train-task2-TC.labels"

In [13]:
ALL_SPANS = pd.read_csv(TRAIN_LABELS, sep = "\t", header = None, names = ["filenumber","label","span_start","span_end"])

In [14]:
ALL_SPANS.sample(3)

Unnamed: 0,filenumber,label,span_start,span_end
5504,790667730,Loaded_Language,1967,1984
2308,757964238,"Exaggeration,Minimisation",2175,2193
2070,738207834,"Name_Calling,Labeling",12801,12836


In [15]:
ALL_SPANS_MAP = ALL_SPANS.merge(MAP_NUM_TO_NAME_df, on = "filenumber")

In [16]:
ALL_SPANS_MAP.sample(3)

Unnamed: 0,filenumber,label,span_start,span_end,filename,element_in_list
1270,724095467,"Exaggeration,Minimisation",2963,2970,article724095467.txt,6
1607,729410793,Appeal_to_Authority,4586,4678,article729410793.txt,323
1950,736231219,"Name_Calling,Labeling",1504,1544,article736231219.txt,16


In [17]:
#currently there are 26 errors, mostly due to missing/wrong spans
for row in ALL_SPANS_MAP.iterrows():
    try:
        row_temp = row[1]
        docs_as_docs[row_temp["element_in_list"]].char_span(row_temp["span_start"], row_temp["span_end"])._.TRAIN_LABEL = row_temp["label"]
    except Exception as e:
        pass

In [18]:
%%time

from intervaltree import Interval, IntervalTree

ALL_NEG_SPANS = []
errors = []
CONTEXT_WINDOW = 75 #size of one half of context window

# for a unique value in element_in_list
for i in ALL_SPANS_MAP["element_in_list"].unique(): #iterate through unique entries in element_in_list
    sub_df = ALL_SPANS_MAP[ALL_SPANS_MAP["element_in_list"]==i] #this is subset of main df containing only entries for the current doc
    these_intervals = [(x[1]["span_start"], x[1]["span_end"]) for x in sub_df.iterrows()] #get intervals
    this_tree = IntervalTree.from_tuples(these_intervals) #initialize tree with intervals
    
    this_doc = docs_as_docs[i]
    START_POINT = CONTEXT_WINDOW #char position we can start from # start_point = int(math.fabs(min(context_window))) 
    END_POINT = len(this_doc)-(CONTEXT_WINDOW) #char position we can work up until # end_point = len(docs_as_docs[13])-(max(max(context_window),0))
    for j in range(START_POINT, END_POINT): #for the range of possible center positions of the CONTEXT_WINDOW
        if this_tree.overlap(j, j+CONTEXT_WINDOW):
            continue
        else:
            try:
                docs_as_docs[i].char_span(j, j+CONTEXT_WINDOW)._.TRAIN_LABEL = ""
                ALL_NEG_SPANS.append((i, (j,j+CONTEXT_WINDOW)))
            except Exception as e:
                errors.append(i)

CPU times: user 5.12 s, sys: 3.1 ms, total: 5.13 s
Wall time: 5.27 s


In [19]:
full_list = [(docs_as_docs[i[0]].char_span(i[1][0],i[1][1]).text,"No_Propaganda") for i in ALL_NEG_SPANS]

In [20]:
for i in ALL_SPANS_MAP.iterrows():
    try:
        full_list.append((docs_as_docs[i[1]["element_in_list"]].char_span(i[1]["span_start"],i[1]["span_end"]).text, i[1]["label"]))
    except Exception as e:
        pass

In [21]:
train_df = pd.DataFrame.from_records(full_list, columns =['text','label'])

In [22]:
# create new col that is binary flag of "propaganda vs no propaganda"
train_df["label_binary"] = train_df.apply(lambda x: "No_Propaganda" if x["label"]=="No_Propaganda" else "Propaganda", axis=1)

In [23]:
train_df.sample(3)

Unnamed: 0,text,label,label_binary
8462,less than a week before the Senate Judiciary C...,No_Propaganda,No_Propaganda
15836,"Improbably, the Guardian has adopted Higgins a...",Doubt,Propaganda
9010,for plotting against him.\nThe behavior of sev...,No_Propaganda,No_Propaganda


In [24]:
train_df["rand_letter"] = "s"
train_data = train_df[["label","label_binary","rand_letter","text"]]
train_data.index = train_data.index.rename("id")
train_data = train_data.sample(frac=1).reset_index(drop=True) #randomly sorts the rows

In [25]:
train_data.to_csv("train_data.tsv", sep = "\t", header = True)

## Generate dev data

In [26]:
PATH = "/home/bryan/Documents/Code/si630/semeval/datasets/dev-articles/"

In [27]:
filenames = []
for root, dirs, files in os.walk(PATH, topdown=True):
    filenames = files

In [28]:
nlp = spacy.load("en_core_web_sm", disable=["ner"]) #can also disable components at load, nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])

In [29]:
texts = [] #create a single string for each doc & add to list
text_to_num = []
counter = 0
for name in filenames:
    text_to_num.append((counter,name))
    counter += 1
    with open(PATH+name) as f:
        text = f.read()
        texts.append(text)

In [30]:
from spacy.tokens import Doc, Token, Span
Span.set_extension("DEV_LABEL", default = "No_Label")

In [31]:
%%time

docs_as_docs = []
for doc in nlp.pipe(texts):
    docs_as_docs.append(doc)

CPU times: user 4.35 s, sys: 872 ms, total: 5.22 s
Wall time: 5.22 s


In [32]:
#create list of tuples of file name/number to element num & then convert to df
MAP_NUM_TO_NAME_df = pd.DataFrame(text_to_num, columns = ["element_in_list", "filename"])

In [33]:
MAP_NUM_TO_NAME_df.sample(3)

Unnamed: 0,element_in_list,filename
31,31,article782448403.txt
41,41,article778730964.txt
44,44,article832913316.txt


In [34]:
MAP_NUM_TO_NAME_df["filenumber"] = MAP_NUM_TO_NAME_df.apply(lambda x: int(re.findall("\d+", x["filename"])[0]), axis=1)
MAP_NUM_TO_NAME_df = MAP_NUM_TO_NAME_df[["filenumber","filename","element_in_list"]]

In [35]:
DEV_LABELS = "/home/bryan/Documents/Code/si630/semeval/datasets/dev-task-TC.labels"

In [36]:
ALL_SPANS = pd.read_csv(DEV_LABELS, sep = "\t", header = None, names = ["filenumber","label","span_start","span_end"])

In [37]:
ALL_SPANS.sample(3)

Unnamed: 0,filenumber,label,span_start,span_end
197,763280007,Flag-Waving,4169,4185
461,779309765,Causal_Oversimplification,2692,2876
986,999001280,Repetition,2135,2140


In [38]:
ALL_SPANS_MAP = ALL_SPANS.merge(MAP_NUM_TO_NAME_df, on = "filenumber")

In [39]:
ALL_SPANS_MAP.sample(3)

Unnamed: 0,filenumber,label,span_start,span_end,filename,element_in_list
339,776126299,Doubt,6082,6148,article776126299.txt,2
935,999000874,Loaded_Language,1628,1635,article999000874.txt,22
470,781672902,"Exaggeration,Minimisation",1433,1472,article781672902.txt,13


In [40]:
#currently there are 2 errors, due to missing/wrong spans
for row in ALL_SPANS_MAP.iterrows():
    try:
        row_temp = row[1]
        docs_as_docs[row_temp["element_in_list"]].char_span(row_temp["span_start"], row_temp["span_end"])._.DEV_LABEL = row_temp["label"]
    except Exception as e:
        pass

In [41]:
%%time

from intervaltree import Interval, IntervalTree

ALL_NEG_SPANS = []
errors = []
CONTEXT_WINDOW = 75 #size of one half of context window

# for a unique value in element_in_list
for i in ALL_SPANS_MAP["element_in_list"].unique(): #iterate through unique entries in element_in_list
    sub_df = ALL_SPANS_MAP[ALL_SPANS_MAP["element_in_list"]==i] #this is subset of main df containing only entries for the current doc
    these_intervals = [(x[1]["span_start"], x[1]["span_end"]) for x in sub_df.iterrows()] #get intervals
    this_tree = IntervalTree.from_tuples(these_intervals) #initialize tree with intervals
    
    this_doc = docs_as_docs[i]
    START_POINT = CONTEXT_WINDOW #char position we can start from # start_point = int(math.fabs(min(context_window))) 
    END_POINT = len(this_doc)-(CONTEXT_WINDOW) #char position we can work up until # end_point = len(docs_as_docs[13])-(max(max(context_window),0))
    for j in range(START_POINT, END_POINT): #for the range of possible center positions of the CONTEXT_WINDOW
        if this_tree.overlap(j, j+CONTEXT_WINDOW):
            continue
        else:
            try:
                docs_as_docs[i].char_span(j, j+CONTEXT_WINDOW)._.DEV_LABEL = ""
                ALL_NEG_SPANS.append((i, (j,j+CONTEXT_WINDOW)))
            except Exception as e:
                errors.append(i)

CPU times: user 744 ms, sys: 4.6 ms, total: 749 ms
Wall time: 743 ms


In [42]:
full_list = [(docs_as_docs[i[0]].char_span(i[1][0],i[1][1]).text,"No_Propaganda") for i in ALL_NEG_SPANS]

In [43]:
for i in ALL_SPANS_MAP.iterrows():
    try:
        full_list.append((docs_as_docs[i[1]["element_in_list"]].char_span(i[1]["span_start"],i[1]["span_end"]).text, i[1]["label"]))
    except Exception as e:
        pass

In [44]:
dev_df = pd.DataFrame.from_records(full_list, columns =['text','label'])

In [45]:
# create new col that is binary flag of "propaganda vs no propaganda"
dev_df["label_binary"] = dev_df.apply(lambda x: "No_Propaganda" if x["label"]=="No_Propaganda" else "Propaganda", axis=1)

In [46]:
dev_df.sample(3)

Unnamed: 0,text,label,label_binary
19,two Ohio police officers for allegedly buying ...,No_Propaganda,No_Propaganda
2550,the biggest campaign against freedom of speech...,"Exaggeration,Minimisation",Propaganda
322,?\nWhom do you consider to be the most corrupt...,No_Propaganda,No_Propaganda


In [47]:
dev_df["rand_letter"] = "s"
dev_data = dev_df[["label","label_binary","rand_letter","text"]]
dev_data.index = dev_data.index.rename("id")
dev_data = dev_data.sample(frac=1).reset_index(drop=True) #randomly sorts the rows

In [48]:
dev_data.to_csv("dev_data.tsv", sep = "\t", header = True)