In [1]:
import numpy as np
import os
from random import shuffle
import re

In [2]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [6]:
import urllib
import zipfile
import lxml.etree

In [7]:
# Download the dataset if it's not already there: this may take a minute as it is 75MB
if not os.path.isfile('ted_en-20160408.zip'):
    # `urlretrieve` lives in `urllib` on Python 2 but moved to `urllib.request` on Python 3.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve
    # Fetch into a temporary name and rename only once the transfer has finished, so an
    # interrupted download cannot leave a truncated archive that the existence check
    # above would later mistake for a complete one.
    urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip.part")
    os.rename("ted_en-20160408.zip.part", "ted_en-20160408.zip")

In [8]:
# For now, we're only interested in the subtitle text, so let's extract that from the XML:
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    # Open the archive member in its own `with` so its stream is closed as soon as
    # parsing finishes instead of being left for the garbage collector.
    with z.open('ted_en-20160408.xml', 'r') as xml_file:
        doc = lxml.etree.parse(xml_file)
# Collect every text node found under a <content> element, joined with newlines.
input_text = '\n'.join(doc.xpath('//content/text()'))
# The parsed tree is not used again once the text has been extracted.
del doc

In [9]:
# Locate a known transcript excerpt in the raw text and display its surroundings:
# 20 characters before the match and 150 characters from its start.
marker = "Hyowon Gweon: See this?"
i = input_text.find(marker)
input_text[i - 20:i + 150]

u' baby does.\n(Video) Hyowon Gweon: See this? (Ball squeaks) Did you see that? (Ball squeaks) Cool. See this one? (Ball squeaks) Wow.\nLaura Schulz: Told you. (Laughs)\n(Vide'

In [10]:
# Strip parenthesised spans: an opening paren, any run of characters other than ")",
# and the closing paren are replaced by the empty string.
parenthetical = re.compile(r'\([^)]*\)')
input_text_noparens = parenthetical.sub('', input_text)

In [11]:
# Display the same excerpt again, this time from the text with parenthesised spans
# removed: 20 characters before the match and 150 characters from its start.
marker = "Hyowon Gweon: See this?"
i = input_text_noparens.find(marker)
input_text_noparens[i - 20:i + 150]

u"hat the baby does.\n Hyowon Gweon: See this?  Did you see that?  Cool. See this one?  Wow.\nLaura Schulz: Told you. \n HG: See this one?  Hey Clara, this one's for you. You "

In [12]:
# An optional prefix of at most 20 non-colon characters followed by ':' (captured as
# `precolon`), then the remainder of the line (captured as `postcolon`).
speaker_line = re.compile(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$')

sentences_strings_ted = []
for line in input_text_noparens.split('\n'):
    spoken = speaker_line.match(line).group('postcolon')
    # Naive sentence split on full stops. Empty fragments are dropped;
    # whitespace-only fragments are truthy and therefore kept.
    for sent in spoken.split('.'):
        if sent:
            sentences_strings_ted.append(sent)

# Uncomment if you need to save some RAM: these strings are about 50MB.
# del input_text, input_text_noparens

# Let's view the first few:
sentences_strings_ted[:5]

[u"Here are two reasons companies fail: they only do more of the same, or they only do what's new",
 u'To me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation',
 u' Both are necessary, but it can be too much of a good thing',
 u'Consider Facit',
 u" I'm actually old enough to remember them"]

In [16]:
from stop_words import get_stop_words

stop_words = get_stop_words('english')
# Membership is tested once per word of every sentence, so look words up in a set
# instead of scanning the list each time.
stop_word_set = set(stop_words)
# NOTE(review): matching is exact-string and case-sensitive, so differently-cased
# forms of a stop word are not removed -- confirm this is intended.
sentences_strings_stop_ted = [
    ' '.join(word for word in sentence.split() if word not in stop_word_set)
    for sentence in sentences_strings_ted
]

In [17]:
# Display the first five sentences after stop-word filtering.
sentences_strings_stop_ted[:5]

[u'Here two reasons companies fail: same, new',
 u'To real, real solution quality growth figuring balance two activities: exploration exploitation',
 u'Both necessary, can much good thing',
 u'Consider Facit',
 u"I'm actually old enough remember"]

In [18]:
# Lower-case each sentence, replace every run of characters other than a-z / 0-9
# with a single space, then split on whitespace to obtain the token list.
non_alnum = re.compile(r"[^a-z0-9]+")
sentences_ted = [
    non_alnum.sub(" ", sent_str.lower()).split()
    for sent_str in sentences_strings_stop_ted
]

In [19]:
# Number of tokenised sentences.
len(sentences_ted)

266694

In [22]:
# Spot-check the token list at index 3.
sentences_ted[3]

[u'consider', u'facit']

In [23]:
# Spot-check the token list of the first sentence.
sentences_ted[0]

[u'here', u'two', u'reasons', u'companies', u'fail', u'same', u'new']

In [24]:
from gensim.models import Word2Vec

In [25]:
# Train word vectors on the tokenised sentences; the training options are grouped
# in one place so they are easy to adjust.
w2v_params = dict(size=100, window=5, min_count=10, workers=4)
model_ted = Word2Vec(sentences_ted, **w2v_params)

In [26]:
# Ask the trained model which words it ranks as most similar to "man".
model_ted.most_similar("man")

[(u'woman', 0.8943348526954651),
 (u'girl', 0.7999122738838196),
 (u'son', 0.794209361076355),
 (u'boy', 0.7898585796356201),
 (u'lady', 0.7837948799133301),
 (u'mary', 0.7776634097099304),
 (u'daughter', 0.7673740983009338),
 (u'sister', 0.7465887665748596),
 (u'brother', 0.7460297346115112),
 (u'husband', 0.7396356463432312)]

In [27]:
# Ask the trained model which words it ranks as most similar to "computer".
model_ted.most_similar("computer")

[(u'machine', 0.8168897032737732),
 (u'software', 0.7924525737762451),
 (u'computers', 0.7476165294647217),
 (u'robot', 0.7457460165023804),
 (u'3d', 0.7327775955200195),
 (u'algorithm', 0.7237495183944702),
 (u'device', 0.7149367928504944),
 (u'code', 0.7061489224433899),
 (u'interface', 0.6981454491615295),
 (u'lab', 0.678130567073822)]

In [28]:
from nltk import *

In [29]:
# Flatten the per-sentence token lists into one flat list of tokens.
# A plain list is used instead of np.concatenate: NumPy would copy every token into a
# fixed-width string array sized for the longest token, and token lists that are empty
# would first become empty numeric arrays that then have to be coerced to strings.
words = [token for sentence in sentences_ted for token in sentence]

In [30]:
# Build a frequency distribution over all tokens.
# FreqDist is presumably brought into scope by the wildcard nltk import.
fdist = FreqDist(words)

In [31]:
# Keep the 1000 most frequent tokens; `most_common` yields (token, count) pairs.
# The loop variable is deliberately not named `words`, so the flattened token sequence
# bound to that name is not overwritten and the frequency cell can be re-run safely.
common = fdist.most_common(1000)
words_top_ted = [str(token) for token, _count in common]

In [32]:
# Display the tail of the list, i.e. the entries from position 950 onwards.
words_top_ted[950:]

['meaning',
 'decision',
 'fire',
 'taught',
 'expect',
 '80',
 'village',
 'mine',
 'wife',
 'freedom',
 'crisis',
 'sent',
 'tells',
 'robots',
 'meant',
 'sleep',
 'mom',
 'visual',
 'thousand',
 'late',
 'however',
 'trees',
 'serious',
 'lose',
 'smart',
 'killed',
 'conditions',
 'wish',
 'ended',
 'pattern',
 '18',
 'somehow',
 'rich',
 'brains',
 'square',
 'cold',
 'impossible',
 'compassion',
 'successful',
 'fundamental',
 'spread',
 'bunch',
 'increase',
 'thanks',
 'protect',
 'service',
 'truly',
 'pieces',
 'nuclear',
 'intelligence']

In [33]:
# Look up the trained vector of every selected word in one call.
# Presumably yields one row per word -- TODO confirm the resulting shape.
words_top_vec_ted = model_ted[words_top_ted]

In [34]:
from sklearn.manifold import TSNE

# Project the word vectors down to two components for plotting; the random state is
# pinned to a fixed value.
tsne = TSNE(
    n_components=2,
    random_state=0,
)
words_top_ted_tsne = tsne.fit_transform(words_top_vec_ted)

In [35]:

# Pair each 2-D t-SNE coordinate with the word it belongs to.
tsne_x = words_top_ted_tsne[:, 0]
tsne_y = words_top_ted_tsne[:, 1]
source = ColumnDataSource(data={"x1": tsne_x, "x2": tsne_y, "names": words_top_ted})

p = figure(
    tools="pan,wheel_zoom,reset,save",
    toolbar_location="above",
    title="word2vec T-SNE for most common words",
)
p.scatter(x="x1", y="x2", size=8, source=source)

# Text labels drawn from the same data source, centre-aligned with a vertical offset.
labels = LabelSet(
    x="x1", y="x2", text="names", y_offset=6,
    text_font_size="8pt", text_color="#555555",
    source=source, text_align='center',
)
p.add_layout(labels)

show(p)