In [28]:
import torch
import numpy as np
import random
from torch.utils.data import DataLoader
import os
import urllib
import zipfile
import lxml.etree
import re
from collections import Counter

In [29]:
if not os.path.isfile('ted_en-20160408.xml'):
    urllib.request.urlretrieve("https://github.com/oxford-cs-deepnlp-2017/practical-1/blob/master/ted_en-20160408.xml?raw=true", filename="ted_en-20160408.xml")

In [30]:
doc = lxml.etree.parse('ted_en-20160408.xml')
input_text = doc.xpath('//content/text()')
label = doc.xpath('//head/keywords/text()')
del doc
len(input_text)

2085

In [31]:
# Preprocess sentences to exclude all characters except alphabets and numbers
texts = [re.sub(r'\([^)]*\)', '',text) for text in input_text]
texts = [re.sub('r([^a-zA-Z0-9\s])',' ',text) for text in texts] #Included '.'
texts = [re.sub('[^a-zA-Z0-9\']',' ',text) for text in texts] #To replace '.' with ' '
texts = [re.sub('[^a-zA-Z0-9 ]','',text) for text in texts]
texts = [text.lower() for text in texts] #uppercase->lowercase

In [32]:
text_labels = zip(texts,label)
texts = [text_label for text_label in text_labels if len(text_label[0]) > 500]
print('number of text greater than 500 words are:',len(texts))

number of text greater than 500 words are: 2076


In [33]:
texts,labels = zip(*texts)

In [34]:
words = [words for text in texts for words in text.split()]
words_count = Counter(words)
words_most_common =[word for word,count in words_count.most_common(100)]
words_least_common = [word for word,count in words_count.most_common() if count==1]

In [35]:
to_remove = words_most_common + words_least_common
words_to_remove = set(to_remove)
tokens = [word for word in words if word not in words_to_remove] #will be used during T-SNE
print('size of Token:',len(tokens)) 

size of Token: 1948385


In [36]:
texts = [[word for word in text.split() if word not in words_to_remove]for text in texts]

In [52]:
# Encode labels as ['ooo', 'Too', 'oEo', 'ooD', 'TEo', 'ToD', 'oED', 'TED']
label_coded = ['ooo']*len(labels)

for i,keyword in enumerate(labels):
    key = keyword.split(', ')
    label = list(label_coded[i])
    if 'technology' in key:
        label[0] = 'T'
    if 'entertainment' in key:
        label[1] = 'E'
    if 'design' in key:
        label[2] = 'D'
    else:
        pass
    label_coded[i] =''.join(label) 

In [38]:
count_labels=Counter(label_coded)
label_count = [word_count for word_count in count_labels.most_common()]
label_count

[('ooo', 1130),
 ('Too', 389),
 ('oEo', 169),
 ('ooD', 158),
 ('ToD', 137),
 ('TEo', 36),
 ('TED', 33),
 ('oED', 24)]

In [39]:
one_hotted = np.zeros(shape=(len(labels),8),dtype='int16')
label_lookup = ['ooo', 'Too', 'oEo', 'ooD', 'TEo', 'ToD', 'oED', 'TED']

In [40]:
label_lookup = ['ooo', 'Too', 'oEo', 'ooD', 'TEo', 'ToD', 'oED', 'TED']
for i,label in enumerate(label_coded):
    one_hotted[i][label_lookup.index(label)] = 1
print(one_hotted[:10])    

[[1 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0]
 [0 0 0 0 0 1 0 0]
 [1 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0]]


In [41]:
tokens.append('<UNK>')
tokens.append('<PAD>')

In [42]:
vocab = list(set(tokens))

In [43]:
print('size of vocabulary:',len(vocab))
id2word = dict(enumerate(vocab))
word2id = dict((val,key) for (key,val) in id2word.items())

size of vocabulary: 37328


In [44]:
# Stripping Text to fall within length of 500; incase if it is shorter then padd with '<UNK>'
length = 500 #sentence length
stripped_text = []#np.zeros((len(texts),length)
for i,text in enumerate(texts):
    inputs = []
    if len(text) >= 500:
        inputs.extend(text[:500])
    else:
        extra_length = 500-len(text)
        extra = ['<PAD>']*extra_length
        word_with_extra = text + extra
        inputs.extend(word_with_extra)
    stripped_text.append(inputs) 

In [45]:
stripped_length = len(stripped_text)
print(stripped_length)

2076


In [46]:
for i,code in enumerate(label_coded):
    one_hotted[i][label_lookup.index(code)] = 1

In [47]:
inputs = []
text_ids = []
for text in stripped_text:
    for word in text:
        i = word2id[word]
        inputs.append(i)
    text_ids.append(inputs)
    inputs = []

In [48]:
text_ids[0][100] , id2word[text_ids[0][100]], stripped_text[0][100], word2id['<UNK>']

(30582, 'even', 'even', 18598)

In [49]:
data = list(zip(text_ids,one_hotted))
tr_size = round(0.8*len(data))
vl_size = round(0.1*len(data))
te_size = tr_size + vl_size
n_classes = one_hotted.shape[1]
train_Xy , val_Xy , test_Xy = [],[],[]
for i in np.arange(n_classes):
    j = np.zeros(n_classes)
    j[i] = 1
    temp = [text_ohe for text_ohe in data if text_ohe[1][i]==j[i]]
    temp_len = len(temp)
    tr_split = round(temp_len*0.8)
    val_split = round(temp_len*0.9)
    train_Xy.extend(temp[:tr_split])
    val_Xy.extend(temp[tr_split:val_split])
    test_Xy.extend(temp[val_split:])
random.shuffle(train_Xy)
random.shuffle(val_Xy)
random.shuffle(test_Xy)

In [50]:
train_ids = torch.from_numpy(np.array([i[0] for i in train_Xy]))
train_labels = [i[1] for i in train_Xy]
val_ids = torch.from_numpy(np.array([i[0] for i in val_Xy]))
val_labels = [i[1] for i in val_Xy]
test_ids = torch.from_numpy(np.array([i[0] for i in test_Xy]))
test_labels = [i[1] for i in test_Xy]
train_labels = torch.tensor([label.tolist().index(1) for label in train_labels], dtype=torch.long)
val_labels = torch.tensor([label.tolist().index(1) for label in val_labels], dtype=torch.long)
test_labels = torch.tensor([label.tolist().index(1) for label in test_labels], dtype=torch.long)