## Train and Dev data

In [1]:
from google.colab import drive
# Mount Google Drive; every dataset/output path used below lives under /content/gdrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import spacy
# Load the spaCy pipeline once; later cells call `nlp(...)` on article text and iterate `doc.sents`.
nlp = spacy.load('en_core_web_sm')
import os
import sys
from tqdm import tqdm
from collections import defaultdict
import pandas as pd

In [3]:
def dataprepare(data):
    """Load article texts and their labelled spans for one labelled split.

    Parameters
    ----------
    data : str
        ``"train"`` or ``"dev"``. Selects the ``<data>-articles`` and
        ``<data>-labels-task-flc-tc`` directories under the datasets folder.

    Returns
    -------
    tuple
        ``(span_dict_val, val_text)`` where ``span_dict_val`` is a
        ``defaultdict(list)`` mapping article id -> list of
        ``(label, start, end)`` tuples (``start``/``end`` parsed as ``int``)
        and ``val_text`` is a ``defaultdict(str)`` mapping article id -> the
        full text of the article file.

    Raises
    ------
    ValueError
        If ``data`` is neither ``"train"`` nor ``"dev"`` (previously this
        surfaced as an UnboundLocalError on ``path``).
    """
    if data not in ("train", "dev"):
        raise ValueError("data must be 'train' or 'dev', got %r" % (data,))

    base = r"/content/gdrive/MyDrive/titans_milestone3/codebase/datasets"
    path = os.path.join(base, data + "-articles")
    path2 = os.path.join(base, data + "-labels-task-flc-tc")

    span_dict_val = defaultdict(list)
    val_text = defaultdict(str)

    for filename in tqdm(sorted(os.listdir(path))):
        # File names look like "article<ID>.<ext>": drop the extension, then
        # the 7-character "article" prefix.
        article_id = os.path.basename(filename).split(".")[0][7:]
        with open(os.path.join(path, filename), "r", encoding="utf-8") as file:
            val_text[article_id] = file.read()

        span_article = "article" + article_id + ".task-flc-tc.labels"
        with open(os.path.join(path2, span_article), "r", encoding="utf-8") as f:
            for row in f:
                if not row.strip():
                    # Tolerate blank / trailing empty lines in a label file.
                    continue
                # The first tab-separated column is discarded (presumably the
                # article id, which is already known from the file name).
                s = row.rstrip().split("\t")[1:]
                span_dict_val[article_id].append((s[0], int(s[1]), int(s[2])))
    return span_dict_val, val_text

In [4]:
# Read both labelled splits; each call returns (article id -> spans, article id -> text).
# These four module-level names are what dataset() reads later on.
train_span_dict,train_text= dataprepare("train")
dev_span_dict,dev_text= dataprepare("dev")

100%|██████████| 371/371 [01:41<00:00,  3.65it/s]
100%|██████████| 75/75 [00:36<00:00,  2.08it/s]


In [5]:
def dataset(datatype):
    """Pair every labelled span with the sentence it starts in and save a CSV.

    Reads the module-level ``train_text`` / ``train_span_dict`` (or the
    ``dev_*`` equivalents), runs ``nlp`` over each article and, for each
    sentence, emits one row per span whose start offset lies inside
    ``[sent.start_char, sent.end_char)``.

    Output columns: ``Id``, ``Sentence``, ``Span_text``, ``ratio`` (number of
    space-separated pieces in the sentence divided by the number in the
    span), ``Span`` (``(start, end)`` tuple), ``Span_start``, ``Span_end``
    and ``class``. The frame is written to
    ``processed_data/<datatype>_TC_dataset.csv``; nothing is returned.

    Parameters
    ----------
    datatype : str
        ``"train"`` or ``"dev"``.

    Raises
    ------
    ValueError
        If ``datatype`` is neither ``"train"`` nor ``"dev"`` (previously this
        surfaced as an UnboundLocalError).
    """
    # Validate first so a typo fails immediately with a clear message.
    if datatype not in ("train", "dev"):
        raise ValueError("datatype must be 'train' or 'dev', got %r" % (datatype,))

    import warnings  # local import keeps this cell self-contained

    if datatype == "train":
        val_text, span_dict_val = train_text, train_span_dict
    else:
        val_text, span_dict_val = dev_text, dev_span_dict

    columns = ['Id', 'Sentence', 'Span_text', 'ratio', 'Span', 'Span_start', 'Span_end', 'class']
    rows = []
    dropped = 0
    for key in tqdm(span_dict_val):
        doc = nlp(val_text[key])
        doc_text = doc.text  # fetch once per article instead of once per span
        spans = [(x[0], int(x[1]), int(x[2])) for x in span_dict_val[key]]
        matched = 0
        for sent1 in doc.sents:
            # Store plain strings rather than spaCy Span objects so the rows
            # do not keep the whole Doc alive.
            sent_text = sent1.text
            sent_words = len(sent_text.split(' '))
            for c, l, r in spans:
                if sent1.start_char <= l < sent1.end_char:
                    fragment = doc_text[l:r]
                    rows.append({
                        'Id': key,
                        'Sentence': sent_text,
                        'Span_text': fragment,
                        # split(' ') never returns an empty list, so the
                        # denominator is always >= 1.
                        'ratio': sent_words / len(fragment.split(' ')),
                        'Span': (l, r),
                        'Span_start': l,
                        'Span_end': r,
                        'class': c,
                    })
                    matched += 1
        # Sentences do not overlap, so a span matches at most one of them;
        # anything left over matched none and is absent from the output.
        dropped += len(spans) - matched

    if dropped:
        warnings.warn(
            "%d %s span(s) start outside every sentence and were not written"
            % (dropped, datatype)
        )

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(
        '/content/gdrive/MyDrive/titans_milestone3/codebase/Task-TC/processed_data/'
        + datatype + '_TC_dataset.csv',
        index=False,
    )

In [6]:
# Build and save the sentence/span tables for both labelled splits (writes train_TC_dataset.csv and dev_TC_dataset.csv).
dataset("train")
dataset("dev")

100%|██████████| 357/357 [01:20<00:00,  4.44it/s]
100%|██████████| 74/74 [00:12<00:00,  6.12it/s]


## Test data preparation

In [8]:
# Module-level containers consumed by the alignment cell below.
span_dict_val = defaultdict(list)   # article id -> [(start, end), ...]
val_text = defaultdict(str)         # article id -> full article text

path2 = r"/content/gdrive/MyDrive/titans_milestone3/codebase/datasets/test-task-tc-template.out"
path = r"/content/gdrive/MyDrive/titans_milestone3/codebase/datasets/test-articles"

# Parse the template once instead of re-reading the whole file for every
# article. Of each tab-separated row only columns 0, 2 and 3 are used
# (article id, start offset, end offset).
template_spans = defaultdict(list)
with open(path2, "r", encoding="utf-8") as f:
    for row in f:
        if not row.strip():
            continue  # tolerate blank / trailing empty lines
        s = row.rstrip().split("\t")
        template_spans[s[0]].append((int(s[2]), int(s[3])))

fileList = os.listdir(path)
for filename in tqdm(sorted(fileList)):
    # "article<ID>.<ext>" -> "<ID>"
    article_id = os.path.basename(filename).split(".")[0][7:]
    with open(os.path.join(path, filename), "r", encoding="utf-8") as file:
        file_text = file.read()
    val_text[article_id] = file_text
    # Only articles that have template rows get a key, matching the
    # behaviour of the per-article scan this replaces.
    if article_id in template_spans:
        span_dict_val[article_id].extend(template_spans[article_id])

100%|██████████| 90/90 [00:17<00:00,  5.01it/s]


In [9]:
import warnings

# Parallel lists, one entry per (sentence, span) match; the next cell turns
# them into a DataFrame.
sent = []
span_text = []
cl = []          # stays empty: no labels are collected for the test split
span = []
span_start = []
span_end = []
ids = []
len_ratio = []

dropped = 0
for key in tqdm(span_dict_val):
    doc = nlp(val_text[key])
    doc_text = doc.text  # fetch once per article instead of once per span
    matched = 0
    for sent1 in doc.sents:
        # Keep plain strings rather than spaCy Span objects so the lists do
        # not hold every Doc in memory.
        sent_text = sent1.text
        sent_words = len(sent_text.split(' '))
        for x in span_dict_val[key]:
            l = int(x[0])
            r = int(x[1])
            # A span is attached to the sentence that contains its first character.
            if sent1.start_char <= l < sent1.end_char:
                fragment = doc_text[l:r]
                ids.append(key)
                sent.append(sent_text)
                span_text.append(fragment)
                # split(' ') never returns an empty list, so no division by zero.
                len_ratio.append(sent_words / len(fragment.split(' ')))
                span.append((l, r))
                span_start.append(l)
                span_end.append(r)
                matched += 1
    # Sentences do not overlap, so each span matches at most once; the
    # remainder matched no sentence and is missing from the lists above.
    dropped += len(span_dict_val[key]) - matched

if dropped:
    warnings.warn(
        "%d test span(s) start outside every sentence and were not collected" % dropped
    )

100%|██████████| 87/87 [00:14<00:00,  5.80it/s]


In [10]:
import pandas as pd

# Zip the column names with the parallel lists collected in the previous
# cell, build the frame, and write it next to the train/dev CSVs.
column_names = ['Id', 'Sentence', 'Span_text', 'ratio', 'Span', 'Span_start', 'Span_end']
column_values = [ids, sent, span_text, len_ratio, span, span_start, span_end]
dict1 = dict(zip(column_names, column_values))

df = pd.DataFrame(data=dict1)
df.to_csv(
    '/content/gdrive/MyDrive/titans_milestone3/codebase/Task-TC/processed_data/test_TC_dataset.csv',
    index=False,
)


## Preprocessing

In [11]:
# Reload the train table from the CSV that dataset("train") wrote; the bare .head() previews the first rows.
df_train = pd.read_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Task-TC/processed_data/train_TC_dataset.csv')
df_train.head()

Unnamed: 0,Id,Sentence,Span_text,ratio,Span,Span_start,Span_end,class
0,111111111,Geneva - The World Health Organisation chief o...,appeared,32.0,"(149, 157)",149,157,Doubt
1,111111111,"""The next transmission could be more pronounce...",The next transmission could be more pronounced...,2.666667,"(265, 323)",265,323,Appeal_to_Authority
2,111111111,"But Tedros voiced alarm that ""plague in Madaga...","a very, very different",4.25,"(1069, 1091)",1069,1091,Repetition
3,111111111,He also pointed to the presence of the pneumon...,He also pointed to the presence of the pneumon...,1.0,"(1334, 1462)",1334,1462,Appeal_to_fear-prejudice
4,111111111,He praised the rapid response from WHO and Mad...,but warned that the danger was not over,3.125,"(1577, 1616)",1577,1616,Appeal_to_fear-prejudice


In [12]:
# Reload the dev table from the CSV that dataset("dev") wrote and preview it.
df_dev =  pd.read_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Task-TC/processed_data/dev_TC_dataset.csv')
df_dev.head()

Unnamed: 0,Id,Sentence,Span_text,ratio,Span,Span_start,Span_end,class
0,730093263,"Earlier, I blogged that the police had release...",white,25.0,"(123, 128)",123,128,"Whataboutism,Straw_Men,Red_Herring"
1,730093263,( Most people named Quentin Lamar Smith are bl...,black,12.0,"(352, 357)",352,357,"Whataboutism,Straw_Men,Red_Herring"
2,730093263,He called them “true American heroes.”,“true American heroes.”,2.0,"(1370, 1393)",1370,1393,Flag-Waving
3,730093263,"it is the same guy, and he is black.\n",black,9.0,"(2434, 2439)",2434,2439,"Whataboutism,Straw_Men,Red_Herring"
4,730093263,"This goes in our ""Black Murders Of White Cops""...","""Black Murders Of White Cops""",6.0,"(2458, 2487)",2458,2487,Loaded_Language


In [13]:
# Reload the test table written by the test-preparation cells above; it has no 'class' column.
df_test = pd.read_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Task-TC/processed_data/test_TC_dataset.csv')
df_test.head()

Unnamed: 0,Id,Sentence,Span_text,ratio,Span,Span_start,Span_end
0,813452859,: I guess her only chance is if Labour decides...,to dishonour democracy,7.666667,"(717, 739)",717,739
1,813452859,: I guess her only chance is if Labour decides...,I guess her only chance is if Labour decides t...,1.045455,"(657, 773)",657,773
2,813452859,There is a chance; as unfortunately there are ...,dead in the water,21.5,"(1293, 1310)",1293,1310
3,813452859,There is a chance; as unfortunately there are ...,bitterly against it,28.666667,"(1128, 1147)",1128,1147
4,813452859,There is a chance; as unfortunately there are ...,remoaner MPs,43.0,"(1256, 1268)",1256,1268


In [14]:
import re

def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    The irregular forms "won't" / "can't" are rewritten first (keeping the
    case of their first letter), then the generic suffixes ``n't``, ``'re``,
    ``'s``, ``'d``, ``'ll``, ``'t``, ``'ve`` and ``'m`` are expanded in that
    order. Both the ASCII apostrophe and the typographic one (U+2019) are
    recognised.

    Note that ``'s`` is always expanded to " is", so possessives
    ("John's") are rewritten as well.

    Parameters
    ----------
    phrase : str
        Text to expand.

    Returns
    -------
    str
        ``phrase`` with the contractions above expanded.
    """
    # Either apostrophe flavour; `re` resolves the \u escape itself.
    apostrophe = r"['\u2019]"

    # specific: capture the first letter so "Won't" -> "Will not" instead of
    # falling through to the generic n't rule and becoming "Wo not".
    phrase = re.sub(r"([Ww])on" + apostrophe + "t", r"\1ill not", phrase)
    phrase = re.sub(r"([Cc])an" + apostrophe + "t", r"\1an not", phrase)

    # general: order matters ("n't" must run before the bare "'t" rule).
    general_rules = (
        ("n" + apostrophe + "t", " not"),
        (apostrophe + "re", " are"),
        (apostrophe + "s", " is"),
        (apostrophe + "d", " would"),
        (apostrophe + "ll", " will"),
        (apostrophe + "t", " not"),
        (apostrophe + "ve", " have"),
        (apostrophe + "m", " am"),
    )
    for pattern, expansion in general_rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [15]:
# Combine contraction expansion (above) with the remaining clean-up steps in one function
from tqdm import tqdm

def preprocess_text(text):
    """Normalise every item of ``text`` and return the results as a list.

    Per item, in order: expand contractions with ``decontracted``; replace
    the literal two-character sequences backslash-r, backslash-quote and
    backslash-n with a space; replace every run of characters outside
    ``A-Za-z0-9`` with a single space; collapse whitespace; lower-case.
    Stop words are kept.

    Parameters
    ----------
    text : iterable
        Raw strings (e.g. ``df['Sentence'].values``). Missing values
        (``None`` / NaN, which pandas produces for empty CSV fields) are
        treated as the empty string; other non-string scalars are passed
        through ``str()``.

    Returns
    -------
    list of str
        One normalised string per input item, in input order.
    """
    preprocessed_sent = []
    # tqdm is for printing the status bar
    for sent in tqdm(text):
        if not isinstance(sent, str):
            # Emit '' for missing values rather than crashing in re.sub, so
            # the output stays aligned with the rows of the source frame.
            sent = '' if pd.isna(sent) else str(sent)
        sent = decontracted(sent)
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\"', ' ')
        sent = sent.replace('\\n', ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        # split()/join collapses whitespace and already trims both ends.
        preprocessed_sent.append(' '.join(sent.split()).lower())
    return preprocessed_sent

In [16]:
# Normalise both text columns of the train frame; each call returns a list aligned with df_train's rows.
preprocessed_sent_train = preprocess_text(df_train['Sentence'].values)
preprocessed_span_train = preprocess_text(df_train['Span_text'].values)

100%|██████████| 6128/6128 [00:00<00:00, 35294.65it/s]
100%|██████████| 6128/6128 [00:00<00:00, 51103.06it/s]


In [17]:
# Same normalisation for the dev frame.
preprocessed_sent_dev = preprocess_text(df_dev['Sentence'].values)
preprocessed_span_dev = preprocess_text(df_dev['Span_text'].values)

100%|██████████| 1063/1063 [00:00<00:00, 30203.88it/s]
100%|██████████| 1063/1063 [00:00<00:00, 52541.87it/s]


In [18]:
# Same normalisation for the test frame.
preprocessed_sent_test = preprocess_text(df_test['Sentence'].values)
preprocessed_span_test = preprocess_text(df_test['Span_text'].values)

100%|██████████| 1790/1790 [00:00<00:00, 28710.75it/s]
100%|██████████| 1790/1790 [00:00<00:00, 50529.02it/s]


In [19]:
# Attach the normalised text as two new columns (mutates df_train in place), then preview.
df_train['preprocessed_sent']=preprocessed_sent_train
df_train['preprocessed_span']=preprocessed_span_train
df_train.head()

Unnamed: 0,Id,Sentence,Span_text,ratio,Span,Span_start,Span_end,class,preprocessed_sent,preprocessed_span
0,111111111,Geneva - The World Health Organisation chief o...,appeared,32.0,"(149, 157)",149,157,Doubt,geneva the world health organisation chief on ...,appeared
1,111111111,"""The next transmission could be more pronounce...",The next transmission could be more pronounced...,2.666667,"(265, 323)",265,323,Appeal_to_Authority,the next transmission could be more pronounced...,the next transmission could be more pronounced...
2,111111111,"But Tedros voiced alarm that ""plague in Madaga...","a very, very different",4.25,"(1069, 1091)",1069,1091,Repetition,but tedros voiced alarm that plague in madagas...,a very very different
3,111111111,He also pointed to the presence of the pneumon...,He also pointed to the presence of the pneumon...,1.0,"(1334, 1462)",1334,1462,Appeal_to_fear-prejudice,he also pointed to the presence of the pneumon...,he also pointed to the presence of the pneumon...
4,111111111,He praised the rapid response from WHO and Mad...,but warned that the danger was not over,3.125,"(1577, 1616)",1577,1616,Appeal_to_fear-prejudice,he praised the rapid response from who and mad...,but warned that the danger was not over


In [20]:
# Attach the normalised text as two new columns (mutates df_dev in place), then preview.
df_dev['preprocessed_sent']=preprocessed_sent_dev
df_dev['preprocessed_span']=preprocessed_span_dev
df_dev.head()

Unnamed: 0,Id,Sentence,Span_text,ratio,Span,Span_start,Span_end,class,preprocessed_sent,preprocessed_span
0,730093263,"Earlier, I blogged that the police had release...",white,25.0,"(123, 128)",123,128,"Whataboutism,Straw_Men,Red_Herring",earlier i blogged that the police had released...,white
1,730093263,( Most people named Quentin Lamar Smith are bl...,black,12.0,"(352, 357)",352,357,"Whataboutism,Straw_Men,Red_Herring",most people named quentin lamar smith are blac...,black
2,730093263,He called them “true American heroes.”,“true American heroes.”,2.0,"(1370, 1393)",1370,1393,Flag-Waving,he called them true american heroes,true american heroes
3,730093263,"it is the same guy, and he is black.\n",black,9.0,"(2434, 2439)",2434,2439,"Whataboutism,Straw_Men,Red_Herring",it is the same guy and he is black,black
4,730093263,"This goes in our ""Black Murders Of White Cops""...","""Black Murders Of White Cops""",6.0,"(2458, 2487)",2458,2487,Loaded_Language,this goes in our black murders of white cops c...,black murders of white cops


In [21]:
# Attach the normalised text as two new columns (mutates df_test in place), then preview.
df_test['preprocessed_sent']=preprocessed_sent_test
df_test['preprocessed_span']=preprocessed_span_test
df_test.head()

Unnamed: 0,Id,Sentence,Span_text,ratio,Span,Span_start,Span_end,preprocessed_sent,preprocessed_span
0,813452859,: I guess her only chance is if Labour decides...,to dishonour democracy,7.666667,"(717, 739)",717,739,i guess her only chance is if labour decides t...,to dishonour democracy
1,813452859,: I guess her only chance is if Labour decides...,I guess her only chance is if Labour decides t...,1.045455,"(657, 773)",657,773,i guess her only chance is if labour decides t...,i guess her only chance is if labour decides t...
2,813452859,There is a chance; as unfortunately there are ...,dead in the water,21.5,"(1293, 1310)",1293,1310,there is a chance as unfortunately there are m...,dead in the water
3,813452859,There is a chance; as unfortunately there are ...,bitterly against it,28.666667,"(1128, 1147)",1128,1147,there is a chance as unfortunately there are m...,bitterly against it
4,813452859,There is a chance; as unfortunately there are ...,remoaner MPs,43.0,"(1256, 1268)",1256,1268,there is a chance as unfortunately there are m...,remoaner mps


In [22]:
# Write the three frames, now including the preprocessed_* columns, to separate
# *_preprocessed.csv files so the CSVs read above are not overwritten.
df_train.to_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Task-TC/processed_data/train_TC_dataset_preprocessed.csv',index=False)
df_dev.to_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Task-TC/processed_data/dev_TC_dataset_preprocessed.csv',index=False)
df_test.to_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Task-TC/processed_data/test_TC_dataset_preprocessed.csv',index=False)