In [1]:
import numpy as np
import os
from random import shuffle
import re

In [2]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
# Direct Bokeh output to the notebook so that show() renders plots inline.
output_notebook()

### Part 0: Download the TED dataset

In [3]:
import urllib.request
import zipfile
import lxml.etree

In [4]:
# Download the dataset if it's not already there: this may take a minute as it is 75MB.
# The archive is fetched under a temporary name and renamed only after the
# transfer completes, so an interrupted download cannot leave a truncated zip
# that the existence check would mistake for a complete one on the next run.
if not os.path.isfile('ted_en-20160408.zip'):
    partial_path = 'ted_en-20160408.zip.part'
    try:
        urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename=partial_path)
        os.replace(partial_path, 'ted_en-20160408.zip')
    finally:
        # After a successful rename the partial file no longer exists; this
        # only removes leftovers when the download or rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [5]:
# We're interested in the content and keywords text, so let's extract them from the XML.
# Both the archive and the XML member are opened as context managers so the
# member's file handle is closed as soon as the tree has been parsed.
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    with z.open('ted_en-20160408.xml', 'r') as xml_file:
        doc = lxml.etree.parse(xml_file)

contents_text = []
keywords_text = []
# Pair the i-th <content> element with the i-th <keywords> element.
# NOTE(review): this assumes the two element sequences line up one-to-one
# (one <content> and one <keywords> per talk) -- confirm against the XML schema.
for content, keywords in zip(doc.iter('content'), doc.iter('keywords')):
    keywords_text.append(keywords.text)
    contents_text.append(content.text)

# The parsed tree is not needed once the text has been copied out.
del doc
print('Size of keywords: ' , len(keywords_text))
print('Size of contents: ' , len(contents_text))

Size of keywords:  2085
Size of contents:  2085


### Part 1: Preprocessing
In this part, we attempt to clean up the raw contents a bit, so that we get only sentences. Let's start by removing all parenthesized strings using a regex:

In [6]:
# Delete every "(...)" span, such as "(Laughter)", from each transcript.
contents_text_noparens = [
    re.sub(r'\([^)]*\)', '', transcript)
    for transcript in contents_text
]

In [7]:
# Show the same character window of the first transcript before and after
# the parenthesized strings were removed.
preview_window = slice(850, 900)
print(contents_text[0][preview_window])
print('\nContents without parenthesized strings:')
print(contents_text_noparens[0][preview_window])

ir calculators.
(Laughter)
Facit did too much expl

Contents without parenthesized strings:
ir calculators.

Facit did too much exploitation. 


Now, let's attempt to remove speakers' names that occur at the beginning of a line, by deleting pieces of the form "`<up to 20 characters>:`". Of course, this is an imperfect heuristic. 

In [8]:
# For every line, drop an optional leading "<up to 20 non-colon characters>:"
# prefix, split what remains on '.', discard empty pieces, and glue all pieces
# of a transcript back together with single spaces.
speaker_prefix = re.compile(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$')

contents_ted = []
for transcript in contents_text_noparens:
    fragments = []
    for raw_line in transcript.split('\n'):
        remainder = speaker_prefix.match(raw_line).group('postcolon')
        fragments += [piece for piece in remainder.split('.') if piece]
    contents_ted.append(" ".join(fragments))

We're ready to tokenize each of them into words and convert them all to lowercase.

In [9]:
from nltk.tokenize import word_tokenize
import string

In [10]:
# Turn each cleaned transcript into a list of lowercase word tokens with all
# punctuation characters removed.

# The translation table that deletes punctuation is the same for every
# document, so build it once instead of on every loop iteration.
table = str.maketrans('', '', string.punctuation)

tokenized_content = []
for content in contents_ted:
    # split into words and convert to lower case
    tokens = [w.lower() for w in word_tokenize(content)]
    # remove punctuation from each word
    stripped = [w.translate(table) for w in tokens]
    # tokens that consisted only of punctuation are now '' and are dropped
    tokenized_content.append([x for x in stripped if x != ''])

In [11]:
# Peek at the first ten tokens of the first transcript.
first_transcript_tokens = tokenized_content[0]
print(first_transcript_tokens[:10])

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more']


#### Split data

In [12]:
# Positional (unshuffled) split: the first 1585 transcripts are for training,
# the next 250 for validation, and everything after that for testing.
TRAIN_END, VALIDATION_END = 1585, 1835
train = tokenized_content[:TRAIN_END]
validation = tokenized_content[TRAIN_END:VALIDATION_END]
test = tokenized_content[VALIDATION_END:]

for caption, subset in (
    ('Size of train data: ', train),
    ('Size of validation data: ', validation),
    ('Size of test data: ', test),
):
    print(caption, len(subset))

Size of train data:  1585
Size of validation data:  250
Size of test data:  250


#### Word Frequencies

In [13]:
from itertools import chain
from collections import Counter

In [14]:
# Count how often each token occurs across all training transcripts.
# Tokens are fed in transcript order, so keys appear in first-occurrence order.
token_counter = Counter()
for transcript_tokens in train:
    token_counter.update(transcript_tokens)
counts_ted_top1000_dic = dict(token_counter)

In [15]:
# Rebuild the dict with its entries ordered from most to least frequent;
# the sort is stable, so equally frequent words keep their previous order.
ranked_items = sorted(
    counts_ted_top1000_dic.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
counts_ted_top1000_dic = dict(ranked_items)

In [16]:
# Take the first (at most) 1000 count values in the dict's iteration order.
# Assumes the dict has already been ordered by descending frequency.
counts_ted_top1000 = list(counts_ted_top1000_dic.values())[:1000]
len(counts_ted_top1000)

1000

### Part 2: Train Word2Vec

In [17]:
from gensim.models import Word2Vec

In [18]:
# Word2Vec hyperparameters, gathered in one place so they are easy to tune.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim >= 4.0
# renamed them to `vector_size` and `epochs` -- confirm the installed version.
word2vec_params = dict(
    size=200,
    window=5,
    min_count=1,
    workers=10,
    iter=10,
)
model_ted = Word2Vec(train, **word2vec_params)

In [19]:
# Sanity-check the embeddings: list the nearest neighbours of a probe word.
probe_word = "cat"
model_ted.wv.most_similar(probe_word)

[('dog', 0.8297110199928284),
 ('horse', 0.7753918766975403),
 ('tshirt', 0.6997356414794922),
 ('monkey', 0.6809483766555786),
 ('knife', 0.6808750033378601),
 ('cow', 0.6681587100028992),
 ('photo', 0.6677738428115845),
 ('cake', 0.6644402742385864),
 ('baby', 0.654541015625),
 ('rat', 0.6540597677230835)]

#### t-SNE visualization

In [20]:
# Take the first (at most) 1000 words in the dict's iteration order.
# Assumes the dict has already been ordered by descending frequency.
words_top_ted = list(counts_ted_top1000_dic)[:1000]

In [21]:
# This assumes words_top_ted is a list of strings, the top 1000 words.
# Look the vectors up through `model_ted.wv` (the keyed vectors), the same
# accessor used for most_similar above. Indexing the model object directly is
# deprecated in gensim 3.x and no longer supported in gensim 4.
words_top_vec_ted = model_ted.wv[words_top_ted]

  


In [22]:
from sklearn.manifold import TSNE
# Reduce the word vectors to two dimensions for plotting; the fixed
# random_state is passed so the reducer is seeded explicitly.
tsne = TSNE(
    n_components=2,
    random_state=0,
)
words_top_ted_tsne = tsne.fit_transform(words_top_vec_ted)

In [23]:
# Scatter the 2-D t-SNE coordinates and label every point with its word.
p = figure(
    title="word2vec T-SNE for most common words",
    toolbar_location="above",
    tools="pan,wheel_zoom,reset,save",
)

# One shared data source feeds both the markers and their text labels.
tsne_columns = {
    "x1": words_top_ted_tsne[:, 0],
    "x2": words_top_ted_tsne[:, 1],
    "names": words_top_ted,
}
source = ColumnDataSource(data=tsne_columns)

p.scatter(x="x1", y="x2", size=8, source=source)

# Labels sit slightly above each marker (y_offset) and are centred on it.
labels = LabelSet(
    x="x1",
    y="x2",
    text="names",
    y_offset=6,
    text_font_size="8pt",
    text_color="#555555",
    text_align='center',
    source=source,
)
p.add_layout(labels)

show(p)