In [63]:
import torch
import numpy as np
import random
from torch.utils.data import DataLoader
import os
import urllib
import zipfile
import lxml.etree
import re
from collections import Counter

In [64]:
if not os.path.isfile('ted_en-20160408.xml'):
    urllib.request.urlretrieve("https://github.com/oxford-cs-deepnlp-2017/practical-1/blob/master/ted_en-20160408.xml?raw=true", filename="ted_en-20160408.xml")

In [65]:
doc = lxml.etree.parse('ted_en-20160408.xml')
input_text = doc.xpath('//content/text()')
label = doc.xpath('//head/keywords/text()')
del doc
len(input_text)

2085

In [66]:
# Preprocess sentences to exclude all characters except alphabets and numbers
texts = [re.sub(r'\([^)]*\)', '',text) for text in input_text]
texts = [re.sub('r([^a-zA-Z0-9\s])',' ',text) for text in texts] #Included '.'
texts = [re.sub('[^a-zA-Z0-9\']',' ',text) for text in texts] #To replace '.' with ' '
texts = [re.sub('[^a-zA-Z0-9 ]','',text) for text in texts]
texts = [text.lower() for text in texts] #uppercase->lowercase

In [67]:
texts[2069][:160]

'   thank you   thank you very much  like the speaker before me    i am a ted virgin  i guess  im also the first time here  and      i dont know what to say   im'

In [68]:
text_labels = zip(texts,label)
texts = [text_label for text_label in text_labels if len(text_label[0]) > 500]
print('number of text greater than 500 words are:',len(texts))

number of text greater than 500 words are: 2076


In [69]:
texts,labels = zip(*texts)

In [70]:
texts[0]

'here are two reasons companies fail  they only do more of the same  or they only do whats new  to me the real  real solution to quality growth is figuring out the balance between two activities  exploration and exploitation  both are necessary  but it can be too much of a good thing  consider facit  im actually old enough to remember them  facit was a fantastic company  they were born deep in the swedish forest  and they made the best mechanical calculators in the world  everybody used them  and what did facit do when the electronic calculator came along  they continued doing exactly the same  in six months  they went from maximum revenue     and they were gone  gone  to me  the irony about the facit story is hearing about the facit engineers  who had bought cheap  small electronic calculators in japan that they used to double check their calculators   facit did too much exploitation  but exploration can go wild  too  a few years back  i worked closely alongside a european biotech com

In [71]:
words = [words for text in texts for words in text.split()]
words_count = Counter(words)
words_most_common =[word for word,count in words_count.most_common(100)]
words_least_common = [word for word,count in words_count.most_common() if count==1]

In [72]:
to_remove = words_most_common + words_least_common
words_to_remove = set(to_remove)
tokens = [word for word in words if word not in words_to_remove] #will be used during T-SNE
print('size of Token:',len(tokens)) 

size of Token: 1948385


In [73]:
texts = [[word for word in text.split() if word not in words_to_remove]for text in texts]

In [76]:
# Encode labels as ['ooo', 'Too', 'oEo', 'ooD', 'TEo', 'ToD', 'oED', 'TED']
label_coded = ['ooo']*len(labels)
for i,keyword in enumerate(labels):
    key = keyword.split(', ')
    label = list(label_coded[i])
    if 'technology' in key:
        label[0] = 'T'
    if 'entertainment' in key:
        label[1] = 'E'
    if 'design' in key:
        label[2] = 'D'
    else:
        pass
    label_coded[i] =''.join(label) 

In [77]:
count_labels=Counter(label_coded)
label_count = [word_count for word_count in count_labels.most_common()]
label_count

[('ooo', 1130),
 ('Too', 389),
 ('oEo', 169),
 ('ooD', 158),
 ('ToD', 137),
 ('TEo', 36),
 ('TED', 33),
 ('oED', 24)]

In [78]:
ohe = np.zeros(shape=(len(labels),8),dtype='int16')
label_lookup = ['ooo', 'Too', 'oEo', 'ooD', 'TEo', 'ToD', 'oED', 'TED']

In [56]:
# Stripping Text to fall within length of 500; incase if it is shorter then padd with '<UNK>'
length = 500 #sentence length
stripped_text = []#np.zeros((len(texts),length)
for i,text in enumerate(texts):
    inputs = []
    if len(text) >= 500:
        inputs.extend(text[:500])
    else:
        extra_length = 500-len(text)
        extra = ['<PAD>']*extra_length
        word_with_extra = text + extra
        inputs.extend(word_with_extra)
    stripped_text.append(inputs) 

In [57]:
stripped_length = len(stripped_text)

In [58]:
for i,code in enumerate(label_coded):
    one_hotted[i][label_lookup.index(code)] = 1

In [59]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.optim as optim
from torchvision import datasets, transforms

In [60]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using mps device


In [61]:
class TEDModel(nn.Module):
    def __init__(self,vocab_size,embedding_dim=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,embedding_dim)
        self.hidden_layer= nn.Sequential(
            nn.Linear(embedding_dim, 32, bias=True),
            nn.Tanh(),
            nn.Linear(32,8,bias=True),
            nn.Softmax(dim=8)
        )


    def forward(self, x):
        x = self.embedding(x)
        out = self.hidden_layer(x)



In [62]:
losses = []
loss_function = nn.NLLLoss()
model = TEDModel(len(stripped_text))
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in stripped_text:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

# To get the embedding of a particular word, e.g. "beauty"
print(model.embeddings.weight[word_to_ix["beauty"]])

ValueError: too many values to unpack (expected 2)