### Import Libraries

In [1]:
import xml.etree.ElementTree as ET
import os
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from tqdm import tqdm



### Loading Files

In [2]:
# List every entry of the AQUAINT corpus directory as a path.
# os.listdir returns names in arbitrary order, so they are sorted to keep the
# document order (and everything derived from it) stable between runs.
folder = "data/TBAQ-cleaned/AQUAINT/"
files = [os.path.join(folder, name) for name in sorted(os.listdir(folder))]

In [3]:
# List every entry of the TimeBank corpus directory as a path.
# os.listdir returns names in arbitrary order, so they are sorted to keep the
# document order (and everything derived from it) stable between runs.
folder2 = "data/TBAQ-cleaned/TimeBank/"
files2 = [os.path.join(folder2, name) for name in sorted(os.listdir(folder2))]

In [4]:
# Merge both corpora into one flat list of paths. Plain list concatenation
# keeps the elements as ordinary str objects; NumPy is not needed to join
# two Python lists.
all_files = list(files) + list(files2)

In [5]:
# Keep only TimeML documents. The full ".tml" extension is matched because a
# bare "tml" suffix would also accept unrelated names such as "index.html".
all_files = [file for file in all_files if file.endswith(".tml")]

### Parsing Text

In [6]:
# Build one (sentence, label) table from every TimeML document.
# A fragment is labelled True when its raw markup contains a TIMEX3 tag.

# Raw string so that "\A" reaches the regex engine as the start-of-string
# anchor instead of being an invalid string escape. The pattern strips XML
# tags, backticks, dashes, leading spaces and quote characters.
markup_pattern = re.compile(r"<[^>]*>|`+|-+|\A +|\"|'+")

# Per-document frames are collected here and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop copies all previous rows
# on every iteration.
frames = []

for file in tqdm(all_files):

    text = ET.parse(file).getroot().find("TEXT")
    str_text = ET.tostring(text).decode()

    # Split on line breaks, then on "." as a crude sentence boundary, and
    # discard the empty fragments that splitting leaves behind.
    list_ = [
        piece
        for line in str_text.splitlines()
        for piece in line.split(".")
        if piece != ""
    ]

    # Labels are computed before cleaning, because the regex below removes
    # the TIMEX3 tags they rely on.
    labels = ["TIMEX3" in l for l in list_]

    data = pd.DataFrame({"sentences": list_, "labels": labels})
    data["sentences"] = data["sentences"].apply(lambda x: markup_pattern.sub("", x))

    # Keep only fragments with more than 5 space-separated tokens.
    n_tokens = data["sentences"].apply(lambda x: len(x.split(" ")))
    data = data[n_tokens > 5]

    frames.append(data)

# Each document keeps its own row index here, exactly as before; the index is
# renumbered later in the notebook.
if frames:
    data_full = pd.concat(frames)
else:
    # No input documents: fall back to an empty table with the same columns.
    data_full = pd.DataFrame({"sentences": [], "labels": []})

100%|██████████| 256/256 [00:09<00:00, 27.37it/s]


In [7]:
# Convert the True/False TIMEX3 flags into integer class ids (1/0),
# then preview the first rows of the table.
data_full["labels"] = data_full["labels"].map(int)
data_full.head()

Unnamed: 0,sentences,labels
2,The Justice Department is reviewing whether al...,0
3,"That matter is under review, Deputy Attorney G...",1
4,"We will look at that, try to make some kind of...",0
6,"In addition, the department is still consideri...",1
8,Juan Miguel Gonzalez objected to the media acc...,0


### Cleaning data

#### Dropping fake sentences (fragments that do not start with an uppercase letter)

In [8]:
# Drop fragments whose first character is not an uppercase letter.
# x[:1] (instead of x[0]) yields "" for an empty string, so such rows are
# simply filtered out rather than raising IndexError. The mask is kept as a
# standalone Series, so no helper column has to be added and then dropped in
# place on a filtered frame.
starts_upper = data_full["sentences"].apply(lambda x: x[:1].isupper()).astype(bool)
data_full = data_full[starts_upper]

In [9]:
# Renumber the rows 0..n-1 and discard the old index left over from filtering.
data_full = data_full.reset_index(drop=True)

### Adding data

In [10]:
# Load the two extra text files into single-column ("sentences") frames.
# NOTE(review): presumably each line of these files holds one sentence - confirm.
guardian, financial = (
    pd.read_table(path, header=None, names=["sentences"])
    for path in ("data/Guardian_time.txt", "data/Financial_time.txt")
)

  """Entry point for launching an IPython kernel.
  


In [11]:
# Stack both extra sources under a fresh 0..n-1 index and label every row
# as positive (1).
new_data = pd.concat([guardian, financial], ignore_index=True)
new_data = new_data.assign(labels=[1] * len(new_data))

In [12]:
# Append the extra positive sentences to the corpus table, renumbering the
# rows in the same step, then preview the result.
data_full = pd.concat([data_full, new_data], ignore_index=True)
data_full.head()

Unnamed: 0,sentences,labels
0,The Justice Department is reviewing whether al...,0
1,"That matter is under review, Deputy Attorney G...",1
2,"We will look at that, try to make some kind of...",0
3,"In addition, the department is still consideri...",1
4,Juan Miguel Gonzalez objected to the media acc...,0


### Build balanced dataset

In [13]:
# All positive rows are kept; the negative rows are down-sampled.
data_time = data_full[data_full.labels == 1]
negatives = data_full[data_full.labels == 0]

# A fixed seed makes the drawn subset reproducible across runs. The sample
# size is capped at the number of available negatives, because sampling more
# rows than exist (without replacement) raises a ValueError.
data_no_time = negatives.sample(n=min(2000, len(negatives)), random_state=42)

In [14]:
# Stack the positive rows on top of the sampled negative rows; the original
# row indices are kept.
data = pd.concat((data_time, data_no_time), axis=0)

In [15]:
# Share of positive rows: the mean of a 0/1 column equals sum / count.
data["labels"].mean()

0.41758881770529993

### Create training, testing and development datasets

In [16]:
# Separate the model inputs (sentence text) from the targets (0/1 labels).
X, y = data["sentences"], data["labels"]

In [17]:
# Split into train / dev / test. random_state is fixed so that the same rows
# land in the same split on every run; without it the files written below
# change on each execution.
# First hold out 20% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# ... then hold out 20% of the remaining rows (16% of the total) as the dev set.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [20]:
# Write each split as two files: <split>.data (sentences) and <split>.labels
# (targets), with no index column and no header row.
output_dir = "data/time_data_clean"

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

splits = {
    "train": (X_train, y_train),
    "test": (X_test, y_test),
    "dev": (X_dev, y_dev),
}
for split_name, (split_sentences, split_labels) in splits.items():
    split_sentences.to_csv(
        output_dir + "/" + split_name + ".data", index=False, header=False
    )
    split_labels.to_csv(
        output_dir + "/" + split_name + ".labels", index=False, header=False
    )