In [3]:
import numpy as np
import os
from random import shuffle
import re

In [4]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

### Part 0: Download the TED dataset

In [5]:
import urllib.request
import zipfile
import lxml.etree

In [6]:
# Download the dataset if it's not already there: this may take a minute as it is 75MB
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")

In [7]:
# we're interested in the content and keywords text , so let's extract them from the XML:
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
    contents_text = [] 
    keywords_text = [] 
    for content,keywords in zip(doc.iter('content'),doc.iter('keywords')):
        keywords_text.append(keywords.text)
        contents_text.append(content.text)
         
del doc
print('Size of keywords: ' , len(keywords_text))
print('Size of contents: ' , len(contents_text))

Size of keywords:  2085
Size of contents:  2085


### Part 1: Preprocessing
In this part, we attempt to clean up the raw contents a bit, so that we get only sentences.Let's start by removing all parenthesized strings using a regex:

#### Labelling

In [8]:
keywords_text[31]

'talks, astronomy, cosmos, curiosity, exploration, nature, physics, science, space, technology, universe'

In [107]:
labels_encoded = []
for k in keywords_text:
    k = k.lower()
    if 'technology' in k and 'entertainment' in k and 'design' in k: 
        labels_encoded.append('TED')
    elif 'entertainment' in k and 'design' in k:
        labels_encoded.append('oED')
    elif 'technology' in k and 'design' in k:
        labels_encoded.append('ToD')
    elif 'technology' in k and 'entertainment' in k:
        labels_encoded.append('TEo')
    elif 'technology' in k: 
        labels_encoded.append('Too')
    elif 'entertainment' in k: 
        labels_encoded.append('oEo')
    elif 'design' in k : 
        labels_encoded.append('ooD')
    else: 
        labels_encoded.append('ooo')    

In [108]:
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
labels_one_hot = encoder.fit_transform(labels)
print(labels_one_hot)

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]]


In [109]:
# Count number of each labels
count_labels=Counter(labels_encoded)
label_count = [word_count for word_count in count_labels.most_common()]
label_count

[('ooo', 1128),
 ('Too', 381),
 ('oEo', 173),
 ('ooD', 164),
 ('ToD', 145),
 ('TEo', 36),
 ('TED', 34),
 ('oED', 24)]

In [11]:
contents_text_noparens = [] 
for content in contents_text:
    contents_text_noparens.append(re.sub(r'\([^)]*\)', '', content))

In [12]:
print(contents_text[0][850:900])
print('\nContents without parenthesized strings:')
print(contents_text_noparens[0][850:900])

ir calculators.
(Laughter)
Facit did too much expl

Contents without parenthesized strings:
ir calculators.

Facit did too much exploitation. 


Now, let's attempt to remove speakers' names that occur at the beginning of a line, by deleting pieces of the form "`<up to 20 characters>:`". Of course, this is an imperfect heuristic. 

In [13]:
contents_ted = [] 
for content in contents_text_noparens:
    content_sent = [] 
    for line in content.split('\n'):
        m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
        content_sent.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)
    contents_ted.append(" ".join([x for x in content_sent]))

We're ready to tokenize each of them into words and make them all small case.

In [14]:
contents_ted = [x.lower() for x in contents_ted]    

#### Split data

In [15]:
x_train, y_train = contents_ted[0:1585], labels_one_hot[0:1585]
x_val, y_val = contents_ted[1585:1835], labels_one_hot[1585:1835]
x_test, y_test = contents_ted[1835:len(contents_ted)], labels_one_hot[1835:len(labels_one_hot)]
print('Size of train data: ', len(x_train),len(y_train))
print('Size of validation data: ', len(x_val))
print('Size of test data: ', len(x_test))

Size of train data:  1585 1585
Size of validation data:  250
Size of test data:  250


In [16]:
from keras.preprocessing.text import text_to_word_sequence
x_train_tokenized = [text_to_word_sequence(x) for x in x_train]

Using TensorFlow backend.


In [17]:
print(x_train_tokenized[0][0:10])

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more']


#### Word Frequencies

In [18]:
from itertools import chain
from collections import Counter

In [19]:
counts_ted_top1000_dic =  dict(Counter(chain.from_iterable(x_train_tokenized)))
vocab_size = len(counts_ted_top1000_dic)
print(vocab_size)

49332


In [20]:
counts_ted_top1000_dic = {k: v for k, v in sorted(counts_ted_top1000_dic.items(), key=lambda item: item[1], reverse=True)}

In [21]:
counts_ted_top1000 = []
i = 0 
for k in counts_ted_top1000_dic.keys(): 
    if i == 1000:
        break
    counts_ted_top1000.append(counts_ted_top1000_dic[k])
    i += 1 
len(counts_ted_top1000)

1000

### Part 2: Train Word2Vec

In [22]:
from gensim.models import Word2Vec

In [23]:
model_ted = Word2Vec(x_train_tokenized, size=50, window=5, min_count=1, workers=10, iter=100)

In [24]:
model_ted.wv.most_similar("king")

[('persia', 0.7193150520324707),
 ('cobra', 0.7127224206924438),
 ('luther', 0.700444221496582),
 ('lydia', 0.6905084252357483),
 ('babur', 0.6805806159973145),
 ('leopold', 0.6607900857925415),
 ('jeremy', 0.6524101495742798),
 ('johannes', 0.6500877141952515),
 ('cobras', 0.6387594938278198),
 ('martin', 0.6386305093765259)]

In [25]:
model_ted["man"]

  """Entry point for launching an IPython kernel.


array([-3.3412035 ,  1.2935245 ,  3.0905519 , -0.12618665,  1.7795151 ,
        0.31722188,  2.5709684 ,  4.395839  ,  0.25685427, -0.28278896,
        1.1214838 , -1.5416692 ,  0.9363304 , -6.628903  , -1.0026269 ,
       -2.6319    , -2.0408547 , -3.660595  ,  1.8219569 ,  3.2100043 ,
        1.1944795 ,  3.0835738 ,  4.29036   , -0.92035884, -0.53932244,
        0.48056132,  2.261056  , -6.558177  , -0.5069615 ,  0.77864605,
       -3.2934647 , -2.3911524 ,  2.9690154 ,  4.8550186 ,  3.1458728 ,
        0.4306431 ,  0.8823594 ,  3.7421134 , -3.2186782 , -2.145983  ,
       -1.8278564 ,  0.09976048, -1.0409398 ,  4.476471  ,  2.1944745 ,
       -0.38855085,  0.16045779,  1.2596943 ,  2.2787738 , -0.07975758],
      dtype=float32)

#### t-SNE visualization

In [26]:
words_top_ted = []
i = 0 
for k in counts_ted_top1000_dic.keys(): 
    if i == 1000:
        break
    words_top_ted.append(k)
    i += 1 

In [27]:
# This assumes words_top_ted is a list of strings, the top 1000 words
words_top_vec_ted = model_ted[words_top_ted]

  


In [28]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(words_top_vec_ted)

In [29]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_top_ted))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

In [33]:
from keras.utils import to_categorical
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [34]:
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(x_train)
vocab_size = len(t.word_index) + 1
print(vocab_size)
# integer encode the documents
encoded_x_train = t.texts_to_sequences(x_train)
encoded_x_val = t.texts_to_sequences(x_val)
encoded_x_test = t.texts_to_sequences(x_test)
print(encoded_x_train[0][0:10])
print(encoded_x_val[0][0:10])

49333
[82, 17, 99, 847, 529, 1880, 20, 115, 33, 55]
[74, 945, 1143, 3, 27, 527, 1, 638, 3, 141]


The sequences have different lengths and Keras prefers inputs to be vectorized and all inputs to have the same length. We will pad all input sequences to have the length of the input with the  bigger length. Again, we can do this with a built in Keras function, in this case the pad_sequences() function.

In [35]:
max_length = len(max(contents_ted ,key = len))
print(max_length)

34680


In [36]:
# pad documents to a max length of max words
padded_x_train = np.array([x.tolist() for x in pad_sequences(encoded_x_train, maxlen=max_length, padding='post')]) 
padded_x_val = np.array([x.tolist() for x in pad_sequences(encoded_x_val, maxlen=max_length, padding='post')]) 
padded_x_test = np.array([x.tolist() for x in pad_sequences(encoded_x_test, maxlen=max_length, padding='post')]) 

In [37]:
print(padded_x_train.shape)
print(padded_x_val.shape)
print(padded_x_test.shape)

(1585, 34680)
(250, 34680)
(250, 34680)


In [38]:
embedding_matrix = np.zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector = model_ted[word]
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

  This is separate from the ipykernel package so we can avoid doing imports until


In [146]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.layers import LSTM

We chose the 50-dimensional version, therefore the Embedding layer must be defined with output_dim set to 100. Finally, we do not want to update the learned word weights in this model, therefore we will set the trainable attribute for the model to be False.

In [155]:
e = Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=500, trainable=False)

### Part 3: Define Model
We are now ready to define our Embedding layer as part of our neural network model.
The Embedding has a vocabulary of 53184 and an input length of 34680. We will choose an embedding space of 50 dimensions.

In [165]:
# define the model
model = Sequential()
model.add(e)
model.add(LSTM(10))
model.add(Dense(8, activation='softmax'))

opt = Adam(learning_rate=0.001)
# compile the model
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 500, 50)           2466650   
_________________________________________________________________
lstm_7 (LSTM)                (None, 10)                2440      
_________________________________________________________________
dense_34 (Dense)             (None, 8)                 88        
Total params: 2,469,178
Trainable params: 2,528
Non-trainable params: 2,466,650
_________________________________________________________________
None


In [166]:
pruned_x_train = np.array([x[0:500] for x in padded_x_train ])
pruned_x_val = np.array([x[0:500] for x in padded_x_val ])
pruned_x_test = np.array([x[0:500] for x in padded_x_test ])
pruned_x_train.shape
pruned_x_val.shape

(250, 500)

In [167]:
# fit the model
model.fit(pruned_x_train, y_train,validation_data=(pruned_x_val, y_val),epochs=100, verbose=2, batch_size=50,validation_freq=5)

Train on 1585 samples, validate on 250 samples
Epoch 1/100
 - 5s - loss: 1.9201 - accuracy: 0.3703
Epoch 2/100
 - 5s - loss: 1.6226 - accuracy: 0.6114
Epoch 3/100
 - 5s - loss: 1.4086 - accuracy: 0.6114
Epoch 4/100
 - 5s - loss: 1.2914 - accuracy: 0.6114
Epoch 5/100
 - 5s - loss: 1.2498 - accuracy: 0.6114 - val_loss: 1.9331 - val_accuracy: 0.3440
Epoch 6/100
 - 5s - loss: 1.2305 - accuracy: 0.6114
Epoch 7/100
 - 5s - loss: 1.2210 - accuracy: 0.6120
Epoch 8/100
 - 5s - loss: 1.2332 - accuracy: 0.6120
Epoch 9/100
 - 5s - loss: 1.2115 - accuracy: 0.6132
Epoch 10/100
 - 5s - loss: 1.1968 - accuracy: 0.6145 - val_loss: 1.8771 - val_accuracy: 0.3480
Epoch 11/100
 - 5s - loss: 1.1911 - accuracy: 0.6158
Epoch 12/100
 - 5s - loss: 1.1802 - accuracy: 0.6151
Epoch 13/100
 - 5s - loss: 1.1664 - accuracy: 0.6164
Epoch 14/100
 - 5s - loss: 1.1633 - accuracy: 0.6183
Epoch 15/100
 - 5s - loss: 1.1429 - accuracy: 0.6233 - val_loss: 1.9149 - val_accuracy: 0.3480
Epoch 16/100
 - 5s - loss: 1.1339 - accur

<keras.callbacks.callbacks.History at 0x2a11bcc0a08>

In [168]:
# evaluate the model
loss, accuracy = model.evaluate(pruned_x_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 36.000001


### Part 4: Use Glove

In [45]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.50d.txt' , encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [47]:
# create a weight matrix for words in training data
embedding_matrix = np.zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector