In [2]:
import numpy as np
import os
from random import shuffle
import re

In [3]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

## Downloading dataset

In [4]:
import urllib.request
import zipfile
import lxml.etree

In [5]:
# Download the dataset if it's not already there: this may take a minute as it is 75MB
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")

# Loading Data

In [6]:
# For now, we're only interested in the subtitle text, so let's extract that from the XML:
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    doc1 = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
    doc2 = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))

input_text = '\n\nGG\n\n'.join(doc1.xpath('//content/text()')) 
input_label= '\n'.join(doc2.xpath('//keywords/text()'))
del doc1
del doc2

# Pre Processing

In [7]:
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)
sentences_strings_ted = []
keywords_list=[]



clean_input=[]
'''
The following loop seperates each talk into a string and stores it in an array.
'''
for input_para in input_text_noparens.split('\n\nGG\n\n'):
    clean_input.append(input_para)
    
sentences_strings_ted = []

'''
Cleaning the data
'''
for line in clean_input:
    
    m = re.sub(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$',"", line) #Removing names
    
    m=m.replace("'",'') #Removing semi-colons
    
    tokens = re.sub(r"[^a-z0-9]+", " ", m.lower()) #Removing non-alphanumeric characters
    
    sentences_strings_ted.append(tokens.split()) #Tokenizing the input using spaces i.e splitting sentences into words.

'''
Creating labels as follows:
1) If the talk is based on Technology- 100 in binary i.e 4 
2) If the talk is based on Technology and Design- 101 in binary i.e 5
'''
for keyword_list in input_label.split('\n'):
    temp=[]
    outP=[]
    countT=0
    countE=0
    countD=0
    for word in keyword_list.split(', '):
        if word in ['technology','entertainment','design']:
            temp.append(word)
    if (len(temp)!=0):
        if 'technology' in temp:
            countT=1
        if 'entertainment' in temp:
            countE=1
        if 'design' in temp:
            countD=1
    
    outStr=str(countT)+str(countE)+str(countD)
    label=int(outStr,2)
    keywords_list.append(label)
    
input_list=list(zip(sentences_strings_ted,keywords_list)) #Joining each input with respective label

'''
Splitting dataset for training,testing and validation
'''
training_input=input_list[:1585]
validation_input=input_list[1585:1836]
test_input=input_list[1836:]

# Creating Text Embedding

In [8]:
from gensim.models import Word2Vec

'''
Creating Text Embedding:

1) Text embedding is the process of representing a word as a vector.
2) This vector can be passed into a neural network as input.
3) I am using Word2Vec to do this task for me. Basically, it will represent each of the words in the input as a vector.

More on Word2Vec:

https://iksinc.wordpress.com/tag/continuous-bag-of-words-cbow/

'''

model_ted = Word2Vec(sentences_strings_ted, min_count=10)# ...
print(len(model_ted.wv.vocab))

14448


In [9]:
'''
To see how well our text embedding has worked.
Printing the most similar words for every word.
'''

model_ted.most_similar("man")

  """Entry point for launching an IPython kernel.


[('guy', 0.8314803838729858),
 ('woman', 0.8081455230712891),
 ('boy', 0.7866109609603882),
 ('lady', 0.7831286191940308),
 ('girl', 0.7269756197929382),
 ('gentleman', 0.726377010345459),
 ('poet', 0.7124746441841125),
 ('john', 0.6770526766777039),
 ('david', 0.6767141819000244),
 ('kid', 0.6735748052597046)]

In [10]:
model_ted.most_similar("computer")

  """Entry point for launching an IPython kernel.


[('robot', 0.7306444048881531),
 ('machine', 0.7252529859542847),
 ('device', 0.6988846659660339),
 ('software', 0.6986162066459656),
 ('3d', 0.6592627763748169),
 ('mechanical', 0.6391801238059998),
 ('chip', 0.6363259553909302),
 ('printer', 0.633363664150238),
 ('video', 0.6132990121841431),
 ('code', 0.6080454587936401)]

In [11]:
curSum=np.zeros((1,100))
print (curSum)


[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]


In [12]:
'''
This function will accept an array of words (An entire TED talk) as input. It will find the vector representations of
every word and sum them up. It will divide the final vector by number of words and return it.

This is called Bag of Means method. It is very simple but is very effective at representing
phrases or sentences of different lengths using a vector of fixed length.

    
'''

def convert_to_vec(input_array):
    global model_ted
    curSum=np.zeros((1,100))
    
    for word in input_array:
        curSum=np.add(curSum,model_ted.wv[word])
    curSum=(1.0/(len(input_array)))*curSum
    return (curSum)
#print (convert_to_vec(['machine','computer']))

# Dataset Class

In [21]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

'''
This is the dataset class for TED_Dataset.

Those unfamiliar with writing a dataset for PyTorch may refer to this excellent tutorial:

http://pytorch.org/tutorials/beginner/data_loading_tutorial.html

'''


class TED_Dataset(Dataset):

    training_input=[]
    validation_input=[]
    test_input=[]
    
    def __init__(self,training_input,validation_input,test_input,train,validate,test):
        
        self.training_input=training_input
        self.validation_input=validation_input
        self.test_input=test_input
        
        self.train=train
        self.validate=validate
        self.test=test
        
        
    def __len__(self):
        if self.train==True:
            return 1585
        elif self.validate==True:
            return 250
        elif self.test==True:
            return 240
        
        
    def __getitem__(self,idx):
        global training_input
        global validation_input
        global test_input
        
        if self.train==True:
            item=training_input[idx]
        elif self.validate==True:
            item=validation_input[idx]
        elif self.test==True:
            item=test_input[idx]
        global model_ted
        curSum=np.zeros((1,100))
        
        input_array=item[0]
        label=item[1]
        
        for word in input_array:
            if word not in model_ted.wv.vocab:
                continue
            curSum=np.add(curSum,model_ted.wv[word])
        if (len(input_array)!=0):
            curSum=(1.0/(len(input_array)))*curSum
        else:
            #print ("HERE")
            curSum=np.zeros((1,100))
        sample={'input':curSum,'label':label}
        return (sample)


# Initialize Dataset

In [22]:
#Initializing Datasets

train_dataset=TED_Dataset(train=True,test=False,validate=False,training_input=training_input,validation_input=validation_input,test_input=test_input)
test_dataset=TED_Dataset(train=False,test=True,validate=False,training_input=training_input,validation_input=validation_input,test_input=test_input)
validate_dataset=TED_Dataset(train=False,test=False,validate=True,training_input=training_input,validation_input=validation_input,test_input=test_input)


# Making Dataset Iterable

In [23]:
#Fixing basic parameters to work with

batch_size = 50
n_iters = 15000
num_epochs = n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

# Neural Network Model 

In [24]:
'''
The network is defined in the class below.

It is a single-layered feedforward neural network using Tanh as activation function.

'''

class FeedforwardNeuralNetModel(nn.Module):
    
    
    def __init__(self, input_dim, hidden_dim, output_dim):
        
        super(FeedforwardNeuralNetModel, self).__init__()
        # Linear function 1
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        
        # Non-linearity 1
        self.tanh1 = nn.Tanh ()
        
        # Linear function 2 (readout): 100 --> 8
        self.fc2 = nn.Linear(hidden_dim, output_dim)  
    
    def forward(self, x):
        
        # Linear function 1
        out = self.fc1(x)
        
        # Non-linearity 1
        out = self.tanh1(out)
        
        # Linear function 2
        out = self.fc2(out)
        return out

# Instantiate Model Class

In [25]:
input_dim = 100
hidden_dim = 100
output_dim = 8

model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)

# Instantiate Loss Class

In [26]:
criterion = nn.CrossEntropyLoss()

#  Instantiate Optimizer Class

In [27]:
learning_rate = 0.1

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training Model

In [30]:
iter = 0
for epoch in range(num_epochs):
    for i,datapoint in enumerate(train_loader):
        
        input_ar=datapoint['input']
        labels=datapoint['label']
        
        
        input_ar = Variable(input_ar.view(-1, 100))
        labels = Variable(labels)
        
        input_ar=input_ar.float()
        
        #print (input_ar)
        #print (labels)
        
        
        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()
        
        # Forward pass to get output/logits
        #print (iter)
        outputs = model(input_ar)
        
        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)
        
        # Getting gradients w.r.t. parameters
        loss.backward()
        
        # Updating parameters
        optimizer.step()
        
        iter += 1
        
        '''
        Testing is done after every 500 training iterations
        '''
    
    
        if iter % 500 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for ind,test_data in enumerate(test_loader):
                
                test_in=test_data['input']
                test_labels=test_data['label']
                
               
                test_in = Variable(test_in.view(-1, 100))
                
                test_in=test_in.float()
                
                # Forward pass only to get logits/output
                outputs = model(test_in)
                
                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)
                
                # Total number of labels
                total += test_labels.size(0)
                
              
                # Total correct predictions
                correct += (predicted.cpu() == test_labels.cpu()).sum()
            
            accuracy = 100 * correct / total
            
            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.data[0], accuracy))
        
        

Iteration: 500. Loss: 1.5915523767471313. Accuracy: 38.75
Iteration: 1000. Loss: 1.5469393730163574. Accuracy: 38.333333333333336
Iteration: 1500. Loss: 1.2432589530944824. Accuracy: 38.333333333333336
