In [1]:
import torch
import numpy as np
import random
from torch.utils.data import DataLoader
import os
import urllib
import zipfile
import lxml.etree
import re
from collections import Counter

In [2]:
if not os.path.isfile('ted_en-20160408.xml'):
    urllib.request.urlretrieve("https://github.com/oxford-cs-deepnlp-2017/practical-1/blob/master/ted_en-20160408.xml?raw=true", filename="ted_en-20160408.xml")

In [3]:
doc = lxml.etree.parse('ted_en-20160408.xml')
input_text = doc.xpath('//content/text()')
label = doc.xpath('//head/keywords/text()')
del doc
len(input_text)

2085

In [4]:
# Preprocess sentences to exclude all characters except alphabets and numbers
texts = [re.sub(r'\([^)]*\)', '',text) for text in input_text]
texts = [re.sub('r([^a-zA-Z0-9\s])',' ',text) for text in texts] #Included '.'
texts = [re.sub('[^a-zA-Z0-9\']',' ',text) for text in texts] #To replace '.' with ' '
texts = [re.sub('[^a-zA-Z0-9 ]','',text) for text in texts]
texts = [text.lower() for text in texts] #uppercase->lowercase

In [5]:
texts[2069][:160]

'   thank you   thank you very much  like the speaker before me    i am a ted virgin  i guess  im also the first time here  and      i dont know what to say   im'

In [6]:
sentences_ted = []
for sent_str in texts:
    tokens = sent_str.split()
    sentences_ted.append(tokens)

In [7]:
sentences_ted[0]

['here',
 'are',
 'two',
 'reasons',
 'companies',
 'fail',
 'they',
 'only',
 'do',
 'more',
 'of',
 'the',
 'same',
 'or',
 'they',
 'only',
 'do',
 'whats',
 'new',
 'to',
 'me',
 'the',
 'real',
 'real',
 'solution',
 'to',
 'quality',
 'growth',
 'is',
 'figuring',
 'out',
 'the',
 'balance',
 'between',
 'two',
 'activities',
 'exploration',
 'and',
 'exploitation',
 'both',
 'are',
 'necessary',
 'but',
 'it',
 'can',
 'be',
 'too',
 'much',
 'of',
 'a',
 'good',
 'thing',
 'consider',
 'facit',
 'im',
 'actually',
 'old',
 'enough',
 'to',
 'remember',
 'them',
 'facit',
 'was',
 'a',
 'fantastic',
 'company',
 'they',
 'were',
 'born',
 'deep',
 'in',
 'the',
 'swedish',
 'forest',
 'and',
 'they',
 'made',
 'the',
 'best',
 'mechanical',
 'calculators',
 'in',
 'the',
 'world',
 'everybody',
 'used',
 'them',
 'and',
 'what',
 'did',
 'facit',
 'do',
 'when',
 'the',
 'electronic',
 'calculator',
 'came',
 'along',
 'they',
 'continued',
 'doing',
 'exactly',
 'the',
 'same',

In [8]:
texts = [text for text in sentences_ted if len(text) > 500]
print('number of text greater than 500 words are:',len(texts))


number of text greater than 500 words are: 1993


In [9]:
words = [words for text in texts for words in text]
word_counter = Counter(words)
words_most_common =[word for word,count in word_counter.most_common(100)]
words_least_common = [word for word,count in word_counter.most_common() if count==1]

In [10]:
words_most_common[0]

'the'

In [11]:
target_for_removal = set(words_most_common + words_least_common)
tokens = [word for word in words if word not in target_for_removal]
print("Vocabulary Token length:",len(tokens))

Vocabulary Token length: 1934420


In [12]:
texts = [[word for word in text if word not in target_for_removal]for text in texts]

In [13]:
texts[0]

['reasons',
 'companies',
 'fail',
 'only',
 'same',
 'only',
 'whats',
 'new',
 'real',
 'real',
 'solution',
 'quality',
 'growth',
 'figuring',
 'balance',
 'between',
 'activities',
 'exploration',
 'exploitation',
 'both',
 'necessary',
 'too',
 'much',
 'good',
 'thing',
 'consider',
 'facit',
 'old',
 'enough',
 'remember',
 'facit',
 'fantastic',
 'company',
 'born',
 'deep',
 'swedish',
 'forest',
 'made',
 'best',
 'mechanical',
 'calculators',
 'everybody',
 'used',
 'did',
 'facit',
 'electronic',
 'calculator',
 'came',
 'along',
 'continued',
 'doing',
 'exactly',
 'same',
 'six',
 'months',
 'went',
 'maximum',
 'revenue',
 'gone',
 'gone',
 'irony',
 'facit',
 'story',
 'hearing',
 'facit',
 'engineers',
 'bought',
 'cheap',
 'small',
 'electronic',
 'calculators',
 'japan',
 'used',
 'double',
 'check',
 'calculators',
 'facit',
 'did',
 'too',
 'much',
 'exploitation',
 'exploration',
 'wild',
 'too',
 'few',
 'back',
 'worked',
 'closely',
 'alongside',
 'european',


In [14]:
len(texts)

1993

In [15]:
label_coded = ['ooo']*len(label)
for i,keyword in enumerate(label):
    key = keyword.split(', ')
    labels = list(label_coded[i])
    if 'technology' in key:
        labels[0] = 'T'
    if 'entertainment' in key:
        labels[1] = 'E'
    if 'design' in key:
        labels[2] = 'D'
    else:
        pass
    label_coded[i] =''.join(labels) 

In [16]:
count_labels=Counter(label_coded)
label_count = [word_count for word_count in count_labels.most_common()]
label_count

[('ooo', 1134),
 ('Too', 389),
 ('oEo', 173),
 ('ooD', 158),
 ('ToD', 137),
 ('TEo', 37),
 ('TED', 33),
 ('oED', 24)]

In [17]:
one_hotted = np.zeros(shape=(len(label),8),dtype='int16')
label_lookup = ['ooo', 'Too', 'oEo', 'ooD', 'TEo', 'ToD', 'oED', 'TED']

In [18]:
for i,code in enumerate(label_coded):
    one_hotted[i][label_lookup.index(code)] = 1