# Changing the dimensionality of GloVe word embeddings

In [11]:
from tqdm.auto import tqdm
import numpy as np
import torch as tt
from torch import nn

## Load the embeddings

In [8]:
# most of the code in the cell is from Stackoverflow
GLOVE_FILE = '../data/glove.6B/glove.6B.50d.txt'

# First pass: count the vectors and remember the last line so the hidden dim
# can be read off it. The file is opened as UTF-8 explicitly instead of with
# the platform default encoding (assumes the file is UTF-8 encoded, as the
# published GloVe text files are).
n_vec = 0
line = ''
with open(GLOVE_FILE, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        n_vec += 1
if n_vec == 0:
    # Fail with a clear message instead of a NameError further down.
    raise ValueError('no vectors found in ' + GLOVE_FILE)
# Every line is "<token> <v1> ... <vd>", so the dimensionality is the number
# of space-separated fields minus the token itself.
embedding_size = len(line.rstrip('\n').split(' ')) - 1

# vecs[i] holds the vector of the i-th line; embeds maps token -> row index.
vecs = np.zeros((n_vec, embedding_size), dtype=np.float32)
embeds = {}

# Second pass: fill the matrix and the token -> row lookup.
with open(GLOVE_FILE, 'r', encoding='utf-8') as f:
    for i, line in tqdm(enumerate(f), total=n_vec):
        # Split each line once instead of once per use.
        word, *values = line.rstrip('\n').split(' ')
        vecs[i] = np.array([float(n) for n in values], dtype=np.float32)
        embeds[word] = i

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




## Run the autoencoder

In [33]:
# Hyperparameters for the autoencoder and its training run.
target_size = 10        # width of the bottleneck, i.e. the new embedding dimensionality
batch_size = 100_000    # number of rows of `vecs` per optimisation step
epochs = 1_000          # number of full passes over `vecs`
learning_rate = 1e-3    # learning rate handed to the optimizer

In [17]:
class autoencoder(nn.Module):
    """Linear autoencoder: one linear encoder layer followed by one linear decoder layer.

    Args:
        input_size: width of the vectors fed to the model. Defaults to the
            notebook-level ``embedding_size`` when omitted.
        hidden_size: width of the encoded representation. Defaults to the
            notebook-level ``target_size`` when omitted.
    """

    def __init__(self, input_size=None, hidden_size=None):
        super().__init__()
        # Fall back to the notebook globals so the existing `autoencoder()`
        # call keeps working; explicit sizes make the class usable on its own.
        if input_size is None:
            input_size = embedding_size
        if hidden_size is None:
            hidden_size = target_size
        self.encoder = nn.Linear(input_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, input_size)

    def forward(self, x):
        """Encode ``x`` and decode it again, returning the reconstruction."""
        return self.decoder(self.encoder(x))

In [34]:
# Fixed seed so the weight initialisation (and therefore the logged losses)
# is the same after Restart Kernel -> Run All.
tt.manual_seed(0)

model = autoencoder()
criterion = nn.MSELoss()
optimizer = tt.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

# Wrap the numpy matrix once. from_numpy shares memory with `vecs`, so the
# slices below are views rather than a fresh copy of every batch every epoch.
data = tt.from_numpy(vecs)

for epoch in range(epochs):
    # Sum of the per-batch mean squared errors over this epoch.
    total_loss = 0
    for start in range(0, len(data), batch_size):
        # Slicing clamps at the end of the data, so the final (possibly
        # shorter) batch needs no special case.
        batch = data[start:start + batch_size]
        output = model(batch)
        # The autoencoder is trained to reproduce its own input.
        loss = criterion(output, batch)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        print('epoch [{}/{}], loss:{:.4f}'.format(epoch + 1, epochs, total_loss))

epoch [1/1000], loss:2.0097
epoch [11/1000], loss:1.6421
epoch [21/1000], loss:1.4106
epoch [31/1000], loss:1.2638
epoch [41/1000], loss:1.1791
epoch [51/1000], loss:1.1312
epoch [61/1000], loss:1.1027
epoch [71/1000], loss:1.0845
epoch [81/1000], loss:1.0717
epoch [91/1000], loss:1.0619
epoch [101/1000], loss:1.0540
epoch [111/1000], loss:1.0473
epoch [121/1000], loss:1.0415
epoch [131/1000], loss:1.0364
epoch [141/1000], loss:1.0319
epoch [151/1000], loss:1.0278
epoch [161/1000], loss:1.0243
epoch [171/1000], loss:1.0212
epoch [181/1000], loss:1.0185
epoch [191/1000], loss:1.0162
epoch [201/1000], loss:1.0142
epoch [211/1000], loss:1.0125
epoch [221/1000], loss:1.0110
epoch [231/1000], loss:1.0097
epoch [241/1000], loss:1.0086
epoch [251/1000], loss:1.0076
epoch [261/1000], loss:1.0067
epoch [271/1000], loss:1.0060
epoch [281/1000], loss:1.0053
epoch [291/1000], loss:1.0047
epoch [301/1000], loss:1.0041
epoch [311/1000], loss:1.0036
epoch [321/1000], loss:1.0032
epoch [331/1000], los

## Check outputs

In [35]:
# For each of the first ten vectors print its squared norm, the squared norm
# of its reconstruction and the gap between the two, followed by the first
# ten components of the original and of the reconstruction.
for original in vecs[:10]:
    reconstructed = model(tt.tensor(original)).detach().numpy()
    norm_in = np.sum(original**2)
    norm_out = np.sum(reconstructed ** 2)
    print(norm_in, norm_out, norm_in - norm_out)
    print(original[:10])
    print(reconstructed[:10])

24.679304 21.647974 3.03133
[ 0.418       0.24968    -0.41242     0.1217      0.34527    -0.044457
 -0.49688    -0.17862    -0.00066023 -0.6566    ]
[ 0.5069263   0.16810846 -0.6155796   0.35290584  0.31652027  0.09166866
 -0.46754444 -0.7058037  -0.22616246 -0.4175256 ]
20.051197 15.998968 4.052229
[ 0.013441  0.23682  -0.16899   0.40951   0.63812   0.47709  -0.42852
 -0.55641  -0.364    -0.23938 ]
[ 0.25697047  0.20482409 -0.20199686  0.52915436  0.532064   -0.04176922
 -0.7155416  -0.63826287 -0.49456692 -0.32819343]
19.77369 16.451874 3.3218155
[ 0.15164  0.30177 -0.16763  0.17684  0.31719  0.33973 -0.43478 -0.31086
 -0.44999 -0.29486]
[ 0.35975456  0.25510624 -0.2952394   0.3254284   0.39722192  0.0540567
 -0.6364787  -0.75441164 -0.37787542 -0.31099516]
24.562908 19.59288 4.970028
[ 0.70853  0.57088 -0.4716   0.18048  0.54449  0.72603  0.18157 -0.52393
  0.10381 -0.17566]
[ 0.5962781  -0.00294097 -0.6001053   0.35469854  0.39668727  0.25445452
 -0.4427682  -0.5820744  -0.30185026

## Save the embeddings

In [37]:
# most of the code in the cell is from Stackoverflow
NEW_GLOVE_FILE = '../data/glove.6B/glove.6B.' + str(target_size) + 'd.txt'
# Row index -> token, i.e. the inverse of `embeds`.
embed_inverse = {index: word for word, index in embeds.items()}

# Encode the whole matrix in one forward pass instead of calling the encoder
# once per row; detach() drops the autograd graph before converting to numpy.
new_embeddings = model.encoder(tt.from_numpy(vecs)).detach().numpy()

# Write UTF-8 explicitly: tokens may contain non-ASCII characters, which the
# platform default encoding is not guaranteed to handle.
with open(NEW_GLOVE_FILE, 'w', encoding='utf-8') as file:
    for i in tqdm(range(len(vecs))):
        # Same "<token> <v1> ... <vd>" layout as the input file.
        values = " ".join(str(x) for x in new_embeddings[i])
        file.write(embed_inverse[i] + " " + values + "\n")

HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))


