## Data Preparation for the SI task

In [4]:
# Mount Google Drive (Colab only) so the /content/gdrive/... dataset and output
# paths used in the cells below resolve.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import sys
from collections import defaultdict
import spacy
import nltk
from nltk.corpus import stopwords
# spaCy English pipeline; used below to split articles into sentences and tokens.
nlp = spacy.load('en_core_web_sm')

In [5]:
def dataprepare(data):
    """Load article texts and gold span offsets for one labelled split.

    Args:
        data: Split to load, either ``"train"`` or ``"dev"``.

    Returns:
        A ``(span_dict_val, val_text)`` tuple where ``span_dict_val`` is a
        ``defaultdict(list)`` mapping article id -> list of ``(start, end)``
        integer pairs read from the article's ``.task-si.labels`` file, and
        ``val_text`` is a ``defaultdict(str)`` mapping article id -> full
        article text.

    Raises:
        ValueError: If ``data`` is neither ``"train"`` nor ``"dev"``.
    """
    # split -> (articles folder, labels folder).
    # Please point these at the matching folders inside your `datasets` directory.
    split_dirs = {
        "train": (
            r"/content/gdrive/MyDrive/titans_milestone3/codebase/datasets/train-articles",
            r"/content/gdrive/MyDrive/titans_milestone3/codebase/datasets/train-labels-task-si",
        ),
        "dev": (
            r"/content/gdrive/MyDrive/titans_milestone3/codebase/datasets/dev-articles",
            r"/content/gdrive/MyDrive/titans_milestone3/codebase/datasets/dev-labels-task-si",
        ),
    }
    if data not in split_dirs:
        # Fail early with a clear message instead of hitting an unbound `path` below.
        raise ValueError('data must be "train" or "dev", got {!r}'.format(data))
    path, path2 = split_dirs[data]

    span_dict_val = defaultdict(list)
    val_text = defaultdict(str)

    for filename in tqdm(sorted(os.listdir(path))):
        # File names look like "article<ID>.<ext>"; [7:] drops the "article" prefix.
        article_id = os.path.basename(filename).split(".")[0][7:]
        with open(os.path.join(path, filename), "r", encoding="utf-8") as article_file:
            val_text[article_id] = article_file.read()

        span_article = "article" + article_id + ".task-si.labels"
        with open(os.path.join(path2, span_article), "r", encoding="utf-8") as labels_file:
            for row in labels_file:
                if not row.strip():
                    # Tolerate blank / trailing empty lines in a labels file.
                    continue
                # Tab-separated row; the first field is skipped, the next two are start/end.
                fields = row.rstrip().split("\t")[1:]
                span_dict_val[article_id].append((int(fields[0]), int(fields[1])))

    return span_dict_val, val_text

In [6]:
# Load texts and gold span offsets for both labelled splits.
# These four module-level names are read by dataset() further down.
train_span_dict,train_text= dataprepare("train")
dev_span_dict,dev_text= dataprepare("dev")

100%|██████████| 371/371 [02:15<00:00,  2.73it/s]
100%|██████████| 75/75 [00:54<00:00,  1.37it/s]


In [8]:
# Fetch NLTK's English stop-word list. Tokens whose lowercase form is in
# `stop_words` are dropped when the sentences are encoded below.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
{'of', "didn't", "haven't", 'that', 'few', 'wouldn', "doesn't", 'am', 'each', 'this', 'because', 'than', 'our', 'in', 'were', "it's", 'up', 'been', 'nor', 'when', 'do', "mustn't", 's', 'aren', "won't", 'her', 'itself', "mightn't", 'under', 'through', 'once', 'has', 'which', 'not', 'further', 'own', 'how', 'mightn', 'now', 'out', 'herself', "hasn't", "don't", 'theirs', 'same', 'won', 'at', 'd', 'ours', 'shouldn', 'ourselves', 'have', 'doing', "shouldn't", 'did', 'will', 'o', 'from', 'can', 'after', 'it', 'my', 'they', 'on', 'and', 'y', "weren't", 'other', "you'd", "wouldn't", 'whom', 'during', 'couldn', 'was', 'some', 'm', 'he', 'for', 'does', "aren't", "you've", 'hers', 'very', "couldn't", 'about', 'mustn', 'me', 've', 'above', 'needn', 'we', 'any', 'who', 'more', "you're", 'just', 'should', 'down', 'both', 'no', 'them', 'weren', 'himself', 'is', 'those', 'wasn', 'didn', "isn'

## P/NP encoding

In [9]:
def dataset(datatype):
    """Build the sentence-level P/NP dataset for one split and write it to CSV.

    Each article is split into sentences with ``nlp``. Within a sentence only
    tokens that are purely alphabetic and not in ``stop_words`` are kept. A kept
    token is labelled ``'P'`` when its ``token.idx`` lies in ``[start, end)`` of
    any gold span of the article, otherwise ``'NP'``. Sentences with no kept
    token are skipped.

    The CSV has one row per sentence with columns ``Id``, ``Text``, ``Labels``,
    ``tok_idx`` (space-joined ``token.idx`` values), ``lentext`` and
    ``lenLabels``.

    Reads the module-level ``train_text`` / ``train_span_dict`` or
    ``dev_text`` / ``dev_span_dict``, plus ``nlp`` and ``stop_words``.

    Args:
        datatype: Split to process, either ``"train"`` or ``"dev"``.

    Returns:
        None. The result is the CSV file written to the processed_data folder.

    Raises:
        ValueError: If ``datatype`` is neither ``"train"`` nor ``"dev"``.
    """
    if datatype == "train":
        val_text, span_dict_val = train_text, train_span_dict
        out_csv = '/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/train_SI_dataset.csv'
    elif datatype == "dev":
        val_text, span_dict_val = dev_text, dev_span_dict
        out_csv = '/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/dev_SI_dataset.csv'
    else:
        # Fail early instead of hitting unbound locals further down.
        raise ValueError('datatype must be "train" or "dev", got {!r}'.format(datatype))

    id_article = []
    text = []
    text_labels = []
    len_text = []
    len_labels = []
    token_index = []

    for key, article_text in tqdm(val_text.items()):
        # .get() treats an article without gold spans as "no spans" and, unlike
        # indexing, never inserts a new key into the defaultdict.
        spans = span_dict_val.get(key, ())
        doc = nlp(article_text)
        for sent1 in doc.sents:
            sent = []
            sent_label = []
            t_idx = []
            for token in sent1:
                if not token.text.isalpha() or token.text.lower() in stop_words:
                    continue
                in_span = any(start <= token.idx < end for start, end in spans)
                sent_label.append('P' if in_span else 'NP')
                sent.append(token.text)
                t_idx.append(token.idx)

            if not sent:
                # Nothing survived filtering; do not emit an empty row.
                continue

            text.append(' '.join(sent))
            text_labels.append(' '.join(sent_label))
            token_index.append(' '.join(str(idx) for idx in t_idx))
            len_text.append(len(sent))
            len_labels.append(len(sent_label))
            id_article.append(key)

    dict1 = {'Id': id_article, 'Text': text, 'Labels': text_labels, 'tok_idx': token_index, 'lentext': len_text, 'lenLabels': len_labels}
    df = pd.DataFrame(dict1)
    df.to_csv(out_csv, index=False)

In [10]:
# Write the P/NP-encoded CSVs for both splits (dataset() returns nothing;
# its result is the file on disk).
dataset("train")
dataset("dev")

100%|██████████| 371/371 [00:53<00:00,  6.93it/s]
100%|██████████| 75/75 [00:08<00:00,  8.59it/s]


In [11]:
# Reload the train CSV written above and preview its first rows as a sanity check.
df_train = pd.read_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/train_SI_dataset.csv')
df_train.head()


Unnamed: 0,Id,Text,Labels,tok_idx,lentext,lenLabels
0,111111111,Next plague outbreak Madagascar could stronger,NP NP NP NP NP NP,0 5 12 24 35 45,6,6
1,111111111,Geneva World Health Organisation chief Wednesd...,NP NP NP NP NP NP NP NP NP NP P NP NP NP NP NP...,61 74 80 87 100 109 119 126 133 140 149 171 18...,20,20
2,111111111,next transmission could pronounced stronger Di...,P P P P P NP NP NP NP NP NP NP NP NP NP NP,269 274 287 301 315 330 339 347 354 362 374 37...,16,16
3,111111111,outbreak bubonic plague spread infected rats v...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP N...,443 460 468 485 495 504 509 513 518 529 539 54...,22,22
4,111111111,Madagascar suffered bubonic plague outbreaks a...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP,653 668 677 685 692 702 709 715 720 732 738 74...,15,15


In [12]:
# Reload the dev CSV and preview it. This `df_dev` is the frame that
# creatBIOE("dev") reads later in the notebook.
df_dev =  pd.read_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/dev_SI_dataset.csv')
df_dev.head()

Unnamed: 0,Id,Text,Labels,tok_idx,lentext,lenLabels
0,730081389,Police previously gone home Ohio patrol office...,NP NP NP NP NP NP NP NP,0 11 22 30 41 46 53 67,8,8
1,730081389,CLEVELAND,NP,75,1,1
2,730081389,Police invstigating domestic disputes previous...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP N...,87 94 107 116 129 140 152 165 169 177 182 186 ...,23,23
3,730081389,Westerville Officers Eric Joering Anthony More...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP,312 324 333 338 355 363 381 388 402 407 424 43...,16,16
4,730081389,suspect old Quentin Smith shot wounded officer...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP N...,490 507 511 519 530 539 554 567 576 581 587 59...,17,17


## BIOE ENCODING

In [13]:
# Re-read the train CSV so the BIOE step starts from the file on disk
# (df_dev is reused from the earlier cell rather than re-read here).
# NOTE(review): with read_csv's default NA parsing, a Text cell consisting solely
# of a token such as "None" or "NA" would presumably load as NaN — confirm whether
# keep_default_na=False is wanted here.
df_train = pd.read_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/train_SI_dataset.csv')
df_train.head()

Unnamed: 0,Id,Text,Labels,tok_idx,lentext,lenLabels
0,111111111,Next plague outbreak Madagascar could stronger,NP NP NP NP NP NP,0 5 12 24 35 45,6,6
1,111111111,Geneva World Health Organisation chief Wednesd...,NP NP NP NP NP NP NP NP NP NP P NP NP NP NP NP...,61 74 80 87 100 109 119 126 133 140 149 171 18...,20,20
2,111111111,next transmission could pronounced stronger Di...,P P P P P NP NP NP NP NP NP NP NP NP NP NP,269 274 287 301 315 330 339 347 354 362 374 37...,16,16
3,111111111,outbreak bubonic plague spread infected rats v...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP N...,443 460 468 485 495 504 509 513 518 529 539 54...,22,22
4,111111111,Madagascar suffered bubonic plague outbreaks a...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP,653 668 677 685 692 702 709 715 720 732 738 74...,15,15


In [14]:

def creatBIOE(data):
    """Convert the P/NP label strings of one split into BIOE tag strings.

    Labels are read from the ``Labels`` column of the module-level ``df_train``
    or ``df_dev``. Within each space-separated label sequence:

    * ``NP`` becomes ``O``;
    * the first ``P`` of a run becomes ``B``;
    * a later ``P`` of the run becomes ``E`` when it is the last label or is
      followed by ``NP``, otherwise ``I``.

    A run made of a single ``P`` therefore yields just ``B`` (no ``E``).
    Labels other than ``P``/``NP`` produce no tag.

    Args:
        data: Split to convert, either ``"train"`` or ``"dev"``.

    Returns:
        A ``(bioe, len_bioelabels)`` tuple: the list of space-joined BIOE tag
        strings (one per input row) and the list of tag counts per row.

    Raises:
        ValueError: If ``data`` is neither ``"train"`` nor ``"dev"``.
    """
    if data == "train":
        labels = df_train['Labels'].values
    elif data == "dev":
        labels = df_dev['Labels'].values
    else:
        # Fail early instead of hitting an unbound `labels` in the loop below.
        raise ValueError('data must be "train" or "dev", got {!r}'.format(data))

    len_bioelabels = []
    bioe = []
    for lab in tqdm(labels):
        bioe_1 = []
        seenP = False  # True while we are inside a run of consecutive 'P' labels
        labarr = lab.split()
        last = len(labarr) - 1
        for i, tag in enumerate(labarr):
            if tag == 'NP':
                bioe_1.append('O')
                seenP = False
            elif tag == 'P':
                if not seenP:
                    bioe_1.append('B')
                    seenP = True
                elif i == last or labarr[i + 1] == 'NP':
                    bioe_1.append('E')
                else:
                    bioe_1.append('I')
        bioe.append(" ".join(bioe_1))
        len_bioelabels.append(len(bioe_1))
    return bioe, len_bioelabels

In [15]:
# Derive the BIOE tag strings and their per-row tag counts for both splits.
train_bioe_label,train_len_bioe = creatBIOE("train")
dev_bioe_label,dev_len_bioe = creatBIOE("dev")

100%|██████████| 18567/18567 [00:00<00:00, 220925.69it/s]
100%|██████████| 3589/3589 [00:00<00:00, 247946.98it/s]


In [16]:
# Attach the BIOE tags and their per-row counts as new columns on the train frame.
df_train['BIOE_LABELS']=train_bioe_label
df_train['BIOE_LABELS_LEN'] = train_len_bioe

In [17]:
# Attach the BIOE tags and their per-row counts as new columns on the dev frame.
df_dev['BIOE_LABELS']=dev_bioe_label
df_dev['BIOE_LABELS_LEN'] = dev_len_bioe

In [18]:
# Preview the train frame with the added BIOE columns.
df_train.head()

Unnamed: 0,Id,Text,Labels,tok_idx,lentext,lenLabels,BIOE_LABELS,BIOE_LABELS_LEN
0,111111111,Next plague outbreak Madagascar could stronger,NP NP NP NP NP NP,0 5 12 24 35 45,6,6,O O O O O O,6
1,111111111,Geneva World Health Organisation chief Wednesd...,NP NP NP NP NP NP NP NP NP NP P NP NP NP NP NP...,61 74 80 87 100 109 119 126 133 140 149 171 18...,20,20,O O O O O O O O O O B O O O O O O O O O,20
2,111111111,next transmission could pronounced stronger Di...,P P P P P NP NP NP NP NP NP NP NP NP NP NP,269 274 287 301 315 330 339 347 354 362 374 37...,16,16,B I I I E O O O O O O O O O O O,16
3,111111111,outbreak bubonic plague spread infected rats v...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP N...,443 460 468 485 495 504 509 513 518 529 539 54...,22,22,O O O O O O O O O O O O O O O O O O O O O O,22
4,111111111,Madagascar suffered bubonic plague outbreaks a...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP,653 668 677 685 692 702 709 715 720 732 738 74...,15,15,O O O O O O O O O O O O O O O,15


In [19]:
# Preview the dev frame with the added BIOE columns.
df_dev.head()

Unnamed: 0,Id,Text,Labels,tok_idx,lentext,lenLabels,BIOE_LABELS,BIOE_LABELS_LEN
0,730081389,Police previously gone home Ohio patrol office...,NP NP NP NP NP NP NP NP,0 11 22 30 41 46 53 67,8,8,O O O O O O O O,8
1,730081389,CLEVELAND,NP,75,1,1,O,1
2,730081389,Police invstigating domestic disputes previous...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP N...,87 94 107 116 129 140 152 165 169 177 182 186 ...,23,23,O O O O O O O O O O O O O O O O O O O O O O O,23
3,730081389,Westerville Officers Eric Joering Anthony More...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP,312 324 333 338 355 363 381 388 402 407 424 43...,16,16,O O O O O O O O O O O O O O O O,16
4,730081389,suspect old Quentin Smith shot wounded officer...,NP NP NP NP NP NP NP NP NP NP NP NP NP NP NP N...,490 507 511 519 530 539 554 567 576 581 587 59...,17,17,O O O O O O O O O O O O O O O O O,17


In [20]:
# Persist both frames (P/NP labels plus BIOE columns) without the index column.
df_train.to_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/train_SI_labels_bioe_PNP.csv',index=False)
df_dev.to_csv('/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/dev_SI_labels_bioe_PNP.csv',index=False)

## TEST data preparation

In [21]:
# Load the test articles and the spans listed for them in the template file.
# After this cell, `val_text` maps article id -> article text and `span_dict_val`
# maps article id -> list of (start, end) pairs for articles present in both places.
span_dict_val=defaultdict(list)
val_text=defaultdict(str)
path2=r"/content/gdrive/MyDrive/titans_milestone3/codebase/datasets/test-task-tc-template.out" # please provide the path for test-task-tc-template.out inside datasets folder
path = r"/content/gdrive/MyDrive/titans_milestone3/codebase/datasets/test-articles"  # please provide the path for test-articles inside datasets folder

# Parse the template once up front instead of re-reading the whole file for
# every article. Rows are tab-separated: field 0 is the article id, fields 2
# and 3 are the span start/end offsets.
template_spans = defaultdict(list)
with open(path2, "r", encoding="utf-8") as f:
    for row in f:
        if not row.strip():
            # Tolerate blank / trailing empty lines.
            continue
        s = row.rstrip().split("\t")
        template_spans[s[0]].append((int(s[2]), int(s[3])))

fileList = os.listdir(path)
for filename in tqdm(sorted(fileList)):
    # File names look like "article<ID>.<ext>"; [7:] drops the "article" prefix.
    article_id = os.path.basename(filename).split(".")[0][7:]
    with open(os.path.join(path, filename), "r", encoding="utf-8") as file:
        file_text = file.read()
    val_text[article_id]=file_text
    # Membership test first so articles without template rows get no key at all.
    if article_id in template_spans:
        span_dict_val[article_id].extend(template_spans[article_id])

100%|██████████| 90/90 [00:26<00:00,  3.37it/s]


In [22]:
# Sentence-level encoding of the test articles (no labels available).
# Per sentence, keep only alphabetic, non-stop-word tokens and record their
# text plus their token.idx values; sentences with no kept token are skipped.
id_article, text, token_index = [], [], []
text_labels, len_text, len_labels = [], [], []

for key, article_text in tqdm(val_text.items()):
    for sentence in nlp(article_text).sents:
        kept_tokens = [
            tok for tok in sentence
            if tok.text.isalpha() and tok.text.lower() not in stop_words
        ]
        if not kept_tokens:
            continue
        text.append(' '.join(tok.text for tok in kept_tokens))
        id_article.append(key)
        token_index.append(' '.join(str(tok.idx) for tok in kept_tokens))

100%|██████████| 90/90 [00:10<00:00,  8.47it/s]


In [23]:
import pandas as pd

# Assemble the per-sentence test rows into a frame and write it out without the index column.
df = pd.DataFrame(
    {
        'Id': id_article,
        'Text': text,
        'tok_idx': token_index,
    }
)
df.to_csv(
    '/content/gdrive/MyDrive/titans_milestone3/codebase/Task-SI/processed_data/test_SI_data.csv',
    index=False,
)

In [24]:
# Preview the test frame that was just written to disk.
df.head()

Unnamed: 0,Id,Text,tok_idx
0,813452859,EU Profits Trading UK,0 3 16 29
1,813452859,London Loses Money Political Campaigner,38 45 51 59 69
2,813452859,Parliamentary vote British Prime Minister Ther...,90 104 112 120 126 135 143 149 156 161 171 176...
3,813452859,chance May deal make parliament fails could on...,332 344 350 360 376 397 408 419 427 437 446 45...
4,813452859,Sputnik spoke political campaigner Michael Swa...,470 478 489 499 510 518
