In [1]:
import numpy as np
import os
from random import shuffle
import re

In [2]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

### Part 0: Download the TED dataset

In [3]:
import urllib.request
import zipfile
import lxml.etree

In [4]:
# Download the dataset if it's not already there: this may take a minute as it is 75MB
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")

In [5]:
# we're interested in the content and keywords text , so let's extract them from the XML:
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
    contents_text = [] 
    keywords_text = [] 
    for content,keywords in zip(doc.iter('content'),doc.iter('keywords')):
        keywords_text.append(keywords.text)
        contents_text.append(content.text)
         
del doc
print('Size of keywords: ' , len(keywords_text))
print('Size of contents: ' , len(contents_text))

Size of keywords:  2085
Size of contents:  2085


### Part 1: Preprocessing
In this part, we attempt to clean up the raw contents a bit, so that we get only sentences.Let's start by removing all parenthesized strings using a regex:

In [6]:
contents_text_noparens = [] 
for content in contents_text:
    contents_text_noparens.append(re.sub(r'\([^)]*\)', '', content))

In [7]:
print(contents_text[0][850:900])
print('\nContents without parenthesized strings:')
print(contents_text_noparens[0][850:900])

ir calculators.
(Laughter)
Facit did too much expl

Contents without parenthesized strings:
ir calculators.

Facit did too much exploitation. 


Now, let's attempt to remove speakers' names that occur at the beginning of a line, by deleting pieces of the form "`<up to 20 characters>:`". Of course, this is an imperfect heuristic. 

In [8]:
contents_ted = [] 
for content in contents_text_noparens:
    content_sent = [] 
    for line in content.split('\n'):
        m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
        content_sent.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)
    contents_ted.append(" ".join([x for x in content_sent]))

We're ready to tokenize each of them into words and make them all small case.

In [9]:
from nltk.tokenize import word_tokenize
import string

In [10]:
tokenized_content = []
for content in contents_ted:
    # split into words
    tokens = word_tokenize(content)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    tokenized_content.append([x for x in stripped if x != ''])

In [11]:
print(tokenized_content[0][0:10])

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more']


#### Labelling

In [85]:
keywords_text[31]

'talks, astronomy, cosmos, curiosity, exploration, nature, physics, science, space, technology, universe'

In [88]:
labels = []
labels_one_hot = [] 
for k in keywords_text:
    k = k.lower()
    if 'technology' in k and 'entertainment' in k and 'design' in k: 
        labels.append('TED')
        labels_one_hot.append(np.array([1, 0, 0, 0, 0, 0, 0, 0]))
    elif 'entertainment' in k and 'design' in k:
        labels.append('oED')
        labels_one_hot.append(np.array([0, 1, 0, 0, 0, 0, 0, 0]))
    elif 'technology' in k and 'design' in k:
        labels.append('ToD')
        labels_one_hot.append(np.array([0, 0, 1, 0, 0, 0, 0, 0]))
    elif 'technology' in k and 'entertainment' in k:
        labels.append('TEo')
        labels_one_hot.append(np.array([0, 0, 0, 1, 0, 0, 0, 0]))
    elif 'technology' in k: 
        labels.append('Too')
        labels_one_hot.append(np.array([0, 0, 0, 0, 1, 0, 0, 0]))
    elif 'entertainment' in k: 
        labels.append('oEo')
        labels_one_hot.append(np.array([0, 0, 0, 0, 0, 1, 0, 0]))
    elif 'design' in k : 
        labels.append('ooD')
        labels_one_hot.append(np.array([0, 0, 0, 0, 0, 0, 1, 0]))
    else: 
        labels.append('ooo')    
        labels_one_hot.append(np.array([0, 0, 0, 0, 0, 0, 0, 1]))

In [89]:
print(labels[31])
print(labels_one_hot[31])

Too
[0 0 0 0 1 0 0 0]


#### Split data

In [12]:
train = tokenized_content[0:1585]
validation = tokenized_content[1585:1835]
test = tokenized_content[1835:len(tokenized_content)]
print('Size of train data: ', len(train))
print('Size of validation data: ', len(validation))
print('Size of test data: ', len(test))

Size of train data:  1585
Size of validation data:  250
Size of test data:  250


#### Word Frequencies

In [13]:
from itertools import chain
from collections import Counter

In [38]:
counts_ted_top1000_dic =  dict(Counter(chain.from_iterable(train)))
vocab_size = len(counts_ted_top1000_dic)

In [15]:
counts_ted_top1000_dic = {k: v for k, v in sorted(counts_ted_top1000_dic.items(), key=lambda item: item[1], reverse=True)}

In [16]:
counts_ted_top1000 = []
i = 0 
for k in counts_ted_top1000_dic.keys(): 
    if i == 1000:
        break
    counts_ted_top1000.append(counts_ted_top1000_dic[k])
    i += 1 
len(counts_ted_top1000)

1000

### Part 2: Train Word2Vec

In [17]:
from gensim.models import Word2Vec

In [24]:
model_ted = Word2Vec(train, size=50, window=5, min_count=1, workers=10, iter=100)

In [25]:
model_ted.wv.most_similar("cat")

[('dog', 0.7214406728744507),
 ('horse', 0.6664350628852844),
 ('babysitter', 0.6273020505905151),
 ('rosebush', 0.6202597618103027),
 ('photo', 0.6202403903007507),
 ('dolphin', 0.6169617176055908),
 ('urethra', 0.6142557859420776),
 ('belly', 0.6122345924377441),
 ('lion', 0.6024689674377441),
 ('nyan', 0.5973218679428101)]

In [30]:
model_ted["man"]

  """Entry point for launching an IPython kernel.


array([-2.1511629 ,  0.47705597,  3.3663535 , -0.92788744, -2.731441  ,
       -3.4608452 , -3.9246788 ,  1.3752449 ,  1.4074894 ,  0.97740906,
        2.7248678 , -0.40133044,  1.2729995 ,  3.9497259 , -3.2209232 ,
        0.20522638,  0.02104493, -0.7628617 , -1.8167784 ,  2.2501826 ,
        1.4072636 , -0.7820428 , -3.4791026 ,  5.167877  ,  6.4685907 ,
        0.60829467,  0.80817497, -0.5068998 ,  2.8806748 ,  2.1530259 ,
       -2.3161252 , -0.41240472,  4.1655307 , -1.091243  ,  0.86448574,
       -0.7295571 ,  3.311775  , -0.6505842 , -1.578327  ,  5.741625  ,
        1.5627375 ,  0.42035607,  0.47050866, -1.5403761 ,  3.3178728 ,
       -3.3865097 ,  1.1462854 ,  1.0933002 , -2.3152647 , -0.6321932 ],
      dtype=float32)

#### t-SNE visualization

In [26]:
words_top_ted = []
i = 0 
for k in counts_ted_top1000_dic.keys(): 
    if i == 1000:
        break
    words_top_ted.append(k)
    i += 1 

In [27]:
# This assumes words_top_ted is a list of strings, the top 1000 words
words_top_vec_ted = model_ted[words_top_ted]

  


In [28]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(words_top_vec_ted)

In [29]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_top_ted))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

In [44]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [49]:
encoded_contents = [one_hot(c, vocab_size) for c in contents_ted]
print(tokenized_content[0][0:10])
print(encoded_contents[0][0:10])

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more']
[5785, 43949, 18981, 13351, 30197, 45750, 11892, 44604, 28866, 7111]


The sequences have different lengths and Keras prefers inputs to be vectorized and all inputs to have the same length. We will pad all input sequences to have the length of the input with the  bigger length. Again, we can do this with a built in Keras function, in this case the pad_sequences() function.

In [53]:
max_length = len(max(contents_ted ,key = len))
print(max_length)

34680


In [56]:
# pad documents to a max length of max words
padded_contents = pad_sequences(encoded_contents, maxlen=max_length, padding='post')
print(padded_contents[0])

[ 5785 43949 18981 ...     0     0     0]


### Part 3: Define Model
We are now ready to define our Embedding layer as part of our neural network model.
The Embedding has a vocabulary of 53184 and an input length of 34680. We will choose an embedding space of 50 dimensions.

In [58]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 34680, 50)         2659200   
_________________________________________________________________
flatten_1 (Flatten)          (None, 1734000)           0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 1734001   
Total params: 4,393,201
Trainable params: 4,393,201
Non-trainable params: 0
_________________________________________________________________
None
