# Natural Language Processing with PyTorch

In [16]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import warnings
warnings.simplefilter('ignore')

# 1) Torch's tensor libraries 

## Creating tensors 
torch.Tensor()

In [9]:
# 1D vector
vec_data = [1., 2., 3.]
vec = torch.Tensor(vec_data)
print(vec)

#create matrix
mat_data = [[1.,2.,3.], [4.,5.,6.]]
M = torch.Tensor(mat_data)
print(M)

#creates a 3D tensor of size 2x2
T_data = [[[1.,2.],[3.,4.]],[[5.,6.],[7.,8.]]]
T = torch.Tensor(T_data)
print(T)

tensor([1., 2., 3.])
tensor([[1., 2., 3.],
        [4., 5., 6.]])
tensor([[[1., 2.],
         [3., 4.]],

        [[5., 6.],
         [7., 8.]]])


In [10]:
#Index into V get a scalar
print(vec[0])

#Index in M get a vector
print(M[0])

#Index in T and get matirx
print(T[0])

tensor(1.)
tensor([1., 2., 3.])
tensor([[1., 2.],
        [3., 4.]])


In [12]:
x = torch.randn((3,4,5))
x

tensor([[[-0.0986,  0.1418,  1.8484,  0.7456, -0.4697],
         [ 0.7196,  0.6807,  0.2613, -0.0581, -1.9292],
         [ 1.0769,  0.3044, -0.9771,  0.6452, -0.8265],
         [ 0.5110,  0.6283, -0.3448, -1.5382,  1.3169]],

        [[-0.3230,  1.1768,  0.4494, -0.4867, -0.6272],
         [ 1.3033, -0.3181, -1.0643,  0.4012,  0.6666],
         [-0.6953, -0.6589, -0.2782,  0.3641, -1.0407],
         [ 0.0546,  0.8572, -1.9430,  1.1409,  0.1295]],

        [[ 0.3445,  0.2413,  0.1938,  0.2094,  1.5771],
         [ 0.1517, -0.5234,  1.4689, -0.2187, -0.1143],
         [ 0.0629,  0.4502,  1.3818, -0.0603, -0.7738],
         [ 2.5325, -0.7391,  0.5957, -0.7689, -0.0164]]])

## Operations with Tensors

In [13]:
x = torch.Tensor([1.,2.,3])
y = torch.Tensor([4.,5.,6.])
z = x+y
print(z)

tensor([5., 7., 9.])


In [23]:
# Default concatenation is along rows[Same num col]
x_1 = torch.randn(2,5)
y_1 = torch.randn(3,5)
z_1 = torch.cat([x_1,y_1])
print(z_1)

#Concate columns[same number of rows]
x_2 = torch.randn(2,3)
y_2 = torch.randn(2,5)
z_2 = torch.cat([x_2,y_2],1)
print(z_2)

tensor([[-1.2666,  0.2738,  0.9295, -0.5410, -0.4918],
        [ 0.0789,  0.6938,  0.6598, -0.0810,  0.1165],
        [ 1.7411, -0.5096,  0.5782, -0.1834, -0.6779],
        [ 1.0669,  0.7617,  0.5461,  0.0355,  0.3889],
        [ 1.5023,  1.1451,  0.6678,  0.3963, -0.6543]])
tensor([[ 0.5460, -1.0964,  1.0026, -0.1827,  1.0717,  0.6771,  0.9234,  0.3206],
        [-0.7633,  0.1964,  1.3181, -0.8131, -0.7522,  0.8126, -1.4560,  1.1771]])


In [28]:
x = torch.randn(2,3,4)
print(x)

print(x.view(2,12))# Reshape to 2x12
print(x.view(2,-1))

tensor([[[-0.0923,  0.7348, -1.3668, -0.7047],
         [ 0.3518, -1.1965,  1.2728, -0.6265],
         [-0.2134, -0.0736, -2.1381, -0.4731]],

        [[-0.1775, -1.2510,  0.1474, -0.5376],
         [-0.6031,  0.2670, -1.8217, -1.4385],
         [ 0.0368, -0.0427, -0.8904, -1.9731]]])
tensor([[-0.0923,  0.7348, -1.3668, -0.7047,  0.3518, -1.1965,  1.2728, -0.6265,
         -0.2134, -0.0736, -2.1381, -0.4731],
        [-0.1775, -1.2510,  0.1474, -0.5376, -0.6031,  0.2670, -1.8217, -1.4385,
          0.0368, -0.0427, -0.8904, -1.9731]])
tensor([[-0.0923,  0.7348, -1.3668, -0.7047,  0.3518, -1.1965,  1.2728, -0.6265,
         -0.2134, -0.0736, -2.1381, -0.4731],
        [-0.1775, -1.2510,  0.1474, -0.5376, -0.6031,  0.2670, -1.8217, -1.4385,
          0.0368, -0.0427, -0.8904, -1.9731]])


# 2) Computation Graphs and Automatic Differentiation

In [29]:
# Variables wrap tensor objects
x = autograd.Variable( torch.Tensor([1.,2.,3.]), requires_grad=True)
# You can access the data with .data attribute
print(x.data)

y = autograd.Variable( torch.Tensor([4.,5.,6.]), requires_grad=True)
z = x + y
print(z.data)

# Z knows some extra data 
print(z.grad_fn)

tensor([1., 2., 3.])
tensor([5., 7., 9.])
<AddBackward0 object at 0x7f49d7623668>


In [30]:
s = z.sum()
print(s)
print(s.grad_fn)

tensor(21., grad_fn=<SumBackward0>)
<SumBackward0 object at 0x7f48e2026d30>


In [31]:
s.backward()
print(x.grad)

tensor([1., 1., 1.])


In [48]:
x = torch.rand((2,2))
y = torch.rand((2,2))
z = x + y # Normal tensor types and backprop would not be possible

var_x = autograd.Variable( x,requires_grad=True )
var_y = autograd.Variable( y,requires_grad=True )
var_z = var_x + var_y
print(var_z.grad_fn) # Has enough information to compute the gradients

var_z_data = var_z.data
new_var_z = autograd.Variable(var_z_data)
print(new_var_z.grad_fn) # Does not ahve information to compute gradients because it only store the data

<AddBackward0 object at 0x7f48d6408128>
None


The above method breaks the Variable chain 

# 3) DL building blocks: Affine maps, non-linearities and objectives

## Affine maps
f(x) = Ax + b ; A --> matrix , b and x --> vectors
The parameters to be learnt are A and b

In [51]:
lin = nn.Linear(5, 3) # Maps from R^5 to R^3, parameters A and b
data = autograd.Variable( torch.randn(2,5) )
print(data)
print(lin(data))

tensor([[ 0.5716,  0.2606, -0.1326,  0.3243,  1.1428],
        [ 0.7893,  1.4740, -0.1124,  0.2300,  1.4949]])
tensor([[ 0.1204, -0.3842, -0.2769],
        [ 0.3618, -0.2795, -0.6070]], grad_fn=<AddmmBackward>)


## Non-linearities

Consider we have two affine maps: f(x)=Ax+b and g(x)=Cx+d. Then f(g(x)) = f(Cx+d) = A(Cx+d)+b = ACx + (Ad+b) which is another affine map. This does not add anything new while computing the model hence we need non-linearities. That most common non-linearitites are tanh(x), ReLU(x), and σ(x) this is because their gradients are easy ot compute.

In [57]:
data = autograd.Variable(torch.randn(2,2))
print(data)
print(F.relu(data))

tensor([[-0.4266, -1.1660],
        [-1.4285,  1.7592]])
tensor([[0.0000, 0.0000],
        [0.0000, 1.7592]])


## Softmax and Probabilities

This is a special non-linearity that is used at the end of the network because it returns a probability distribution. The ith component of the Softmax(x) is   

In [17]:
data = autograd.Variable(torch.randn(5))
print(data)
print(F.softmax(data))
print(F.softmax(data).sum())
print(F.log_softmax(data))

tensor([-1.5175, -0.6878, -0.3383,  1.1900,  0.5298])
tensor([0.0341, 0.0783, 0.1110, 0.5120, 0.2646])
tensor(1.)
tensor([-3.3771, -2.5473, -2.1979, -0.6695, -1.3297])


## Objective Function
The Objective Function is the function being trained to minimize. It chooses an instance to run through the network and then updates parameters with the derivative of the loss function. Negative-log probabity is very common for multi-class classification.

# 4) Optimization and Training
Since the loss is an autograd.Variable it has enough information to compute the gradient with respect to all parameters used to compute it. If L(θ) is the loss function and n is a positive learning rate then : <br>
$$ \theta^{(t+1)} = \theta^{(t)} - \eta \nabla_\theta L(\theta) $$ <br>
[torch.optim has different optimizer packages]




# 5) Creating Network Components in Pytorch
Now we will use affine maps and non-linearities to build a network. We will compute the loss fucntion using the built in negative log likelihood and update using backpropagation.<br>

All network components should inheret from nn.Module and override the forward() method. This makes it keep track of its trainable parameters and you can swap between GPU(.cuda()) and CPU(.cpu()) <br>

Now we will write a logistic regression model that takes a sparse bag-of-words and outputs a probaility distribution over two labels "English" and "Spanish"   

## Example : Logistic Regression BOW classifier

The modle maps sparse BOW to log probabilities over lables and assign each word in the vocab to an index. Example "hello" and "world" are indices 0 and 1. The sentence "hello hello world" is <br>
                                       $$ [2,1] ==> [count(hello),count(world)]$$
If the BOW is a vector 'x' then the ouput of the network is: <br>
                                        $$ logSoftmax(Ax+b) $$

In [27]:
data = [ ("me gusta comer en la cafeteria".split(),"SPANISH"),
         ("Give it to me".split(),"ENGLISH"),
         ("No creo que sea una buena idea".split(),"SPANISH"),
         ("No it is not a good idea to get lost at sea".split(),"ENGLISH") ]

test_data = [("Yo creo que si".split(),"SPANISH"),
             ("it is lost on me".split(),"ENGLISH")]

#word_to_ix maps the words in vocab to unique integers which will be its index in the BOW vectors

word_to_ix={}
for sent, _ in data + test_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word]=len(word_to_ix)
print(word_to_ix)

VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 2

{'to': 8, 'get': 20, 'not': 17, 'buena': 14, 'gusta': 1, 'en': 3, 'is': 16, 'si': 24, 'me': 0, 'No': 9, 'sea': 12, 'comer': 2, 'at': 22, 'good': 19, 'una': 13, 'it': 7, 'cafeteria': 5, 'creo': 10, 'on': 25, 'idea': 15, 'Give': 6, 'la': 4, 'a': 18, 'Yo': 23, 'que': 11, 'lost': 21}


In [29]:
class BoWClassifier(nn.Module): #inheriting from nn.Module
    
    def __init__(self, num_labels, vocab_size):
        # calls init func of nn.Module
        super(BoWClassifier, self).__init__()
        
        # Define parameters required A,b and Torch provides nn.Linear() for affine map
        self.linear = nn.Linear(vocab_size, num_labels)
        
    def forward(self, bow_vec):
        # Pass the input through linear layer then softmax
        # Many other non-linearitities are present in torch.nn.functional
        return F.log_softmax(self.linear(bow_vec))

In [47]:
def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence : 
        vec[word_to_ix[word]] += 1
    return vec.view(1,-1)

def make_target(label, label_to_ix):
    return torch.LongTensor([label_to_ix[label]])

In [48]:
model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)

# The model knows its parameters the first ouput is A and then b
# BoWClassifier will store the nn.Linear's parameters

for param in model.parameters():
    print(param)

Parameter containing:
tensor([[-0.1485, -0.0696,  0.0650, -0.0275,  0.0792,  0.0345,  0.0937,  0.0263,
         -0.1935, -0.0933,  0.1928, -0.1467,  0.1259, -0.1769, -0.1576,  0.1131,
         -0.1096, -0.1941, -0.1163, -0.0220,  0.1786,  0.0360, -0.0929,  0.0506,
          0.0272,  0.1691],
        [ 0.1182, -0.1236,  0.0987, -0.0111,  0.0800, -0.1806,  0.0618, -0.0493,
          0.1459, -0.0584, -0.0824, -0.1255,  0.1581, -0.0390,  0.1376,  0.1786,
         -0.1501, -0.1632, -0.0833, -0.0681, -0.0364, -0.0883,  0.1163, -0.0930,
          0.1896, -0.0311]], requires_grad=True)
Parameter containing:
tensor([0.0307, 0.0311], requires_grad=True)


In [49]:
sample = data[0]
bow_vectors = make_bow_vector(sample[0],word_to_ix)
log_probs = model(autograd.Variable(bow_vectors))
print(log_probs)

tensor([[-0.7180, -0.6689]], grad_fn=<LogSoftmaxBackward>)


In [50]:
label_to_ix = {"SPANISH":0, "ENGLISH":1}

We pass instances through the log probabilities compute loss function then gradient of loss function and update the parameters with gradient step. The nn.NLLLoss is the negative log likelihood loss. The input for it is vector of log probabilities and and the target labels. This does not compute the log probabilities but nn.CrossEntropyLoss() is the same as NLLLoss except it includes the log softmax. 

In [51]:
for instance, labels in test_data:
    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
    log_probs = model(bow_vec)
    print(log_probs)
print( next(model.parameters())[:,word_to_ix['creo']])

tensor([[-0.5827, -0.8174]], grad_fn=<LogSoftmaxBackward>)
tensor([[-0.6103, -0.7835]], grad_fn=<LogSoftmaxBackward>)
tensor([ 0.1928, -0.0824], grad_fn=<SelectBackward>)


In [53]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(),lr=0.1)

for epoch in range(100):
    for instance, label in data:
        #Step1: Pytorch accumulates gradients, clear them before each instance
        model.zero_grad()
        
        #Step2: Make our BOW vector and also we must wrap the target variable as an integer. 0->SPANISH
        bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
        target = autograd.Variable(make_target(label,label_to_ix))
        
        #Step3: Run the forward pass
        log_probs = model(bow_vec)
        
        #Step4: Compute the loss,gradients and update parameters
   
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

In [57]:
for instance, label in test_data:
    bow_vec = autograd.Variable(make_bow_vector(instance,word_to_ix))
    log_probs = model(bow_vec)
    print(log_probs)
print(next(model.parameters())[:,word_to_ix['creo']]) # Index corresponding to spanish goes up

tensor([[-0.0944, -2.4070]], grad_fn=<LogSoftmaxBackward>)
tensor([[-2.2920, -0.1065]], grad_fn=<LogSoftmaxBackward>)
tensor([ 0.6421, -0.5317], grad_fn=<SelectBackward>)


# 6) Word Embeddings : Encoding Lexical Semantics
Word embeddings are dense vectors of real numbers, one word per vocabulary. If we use one-hot encoding the vector might be large and sparse and thus would not provid erelevant information about the words. Thus word embeddings represent the semantics of the word, efficiently ecoding semantic information for the task. Word embedings are stored as $|V| \times D$ in pytorch, where $D$ is the dimensionlity of the embeddings such that the word with index $i$ has it's information in the $i$th row matrix . torch.nn.Embedding uses embeddings which takes two arguments vocabulary size and dimensionality of embeddings. To index the table use torch.LongTensor since the indexes are integers not floats.  

In [58]:
word_to_ix = {'hello':0,'world':1}
embeds = nn.Embedding(2,5) # 2 words in a 5D embedding 
lookup_tensor = torch.LongTensor(word_to_ix["hello"])
hello_embed = embeds(autograd.Variable(lookup_tensor))
print(hello_embed)

tensor([], size=(0, 5), grad_fn=<EmbeddingBackward>)
