In [1]:
import pandas as pd
import numpy as np
import os

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import MNIST
from torchvision import transforms
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy

In [2]:
srt = ["source", "reference", "translation"]
language_pairs = [
    "cs-en",
    "de-en",
    "en-fi",
    "en-zh",
    "ru-en",
    "zh-en",
]
scores = {pair: pd.read_csv(f"corpus/{pair}/scores.csv") for pair in language_pairs}

In [3]:
pair = "cs-en"
embedding_ref = torch.from_numpy(np.load(f"corpus/{pair}/laser.reference_embeds.npy"))
embedding_src = torch.from_numpy(np.load(f"corpus/{pair}/laser.source_embeds.npy"))
embedding_trn = torch.from_numpy(np.load(f"corpus/{pair}/laser.translation_embeds.npy"))
score = torch.tensor(scores[pair]["z-score"]).unsqueeze(1).float()


In [4]:
score.shape

torch.Size([11585, 1])

In [4]:
# Exposing how to stack each embedding
TEST_VALUE = 1024
a = torch.tensor([[0 for i in range(TEST_VALUE)] for _ in range(11585)])
b = torch.tensor([[1 for i in range(TEST_VALUE)] for _ in range(11585)])
c = torch.tensor([[2 for i in range(TEST_VALUE)] for _ in range(11585)])
# each row is an embedding
tmp = torch.stack((a, b, c), 1).float()

In [34]:
embedding = torch.stack((embedding_ref, embedding_src, embedding_trn), 1).unsqueeze(1)
embedding.shape

torch.Size([11585, 1, 3, 1024])

In [5]:
class Model(pl.LightningModule):

    def __init__(self):
        super(Model, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, (3,4), stride=(3,8), padding=(0, 0))
        self.pool1 = nn.MaxPool2d((3, 3), stride=(3, 1), padding=(1,1))
        self.linear1 = nn.Linear(8192, 1, bias=True)
        self.dropout2 = nn.Dropout(0.2)
        self.flatten = nn.Flatten()
    
    def forward(self, x):
        ## Define forward behavior
        x = self.conv1(x)
        # print(x.shape)
        x = F.relu(x)
        x = self.pool1(x)
        # print(x.shape)
        
        # Flatten layer
        # print(x.shape)
        x = self.flatten(x) 
        # print(x.shape)
        x = self.dropout2(x)
        x = self.linear1(x)
        # print(x.shape)
        x = torch.tanh(x)
        # print(x.shape)
        return x

    def training_step(self, batch, batch_nb):
        x, y = batch
        # logits = self(x)
        # print(x.shape, y.shape)
        loss = F.mse_loss(self(x), y)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.03)
        # return Ranger21(self.parameters(), lr=0.02)

In [10]:
model = Model()
x = torch.randn(11585,1,3,1024)
predicts = model(x)

In [6]:
def Pearson(predicts, score):
    return np.corrcoef(predicts.detach().numpy().flatten(), score.numpy().flatten())[0][1]

In [11]:
Pearson(predicts, score)

0.0068179225874049594

In [7]:
from torch.utils.data import Dataset, TensorDataset


class WordsDataset(Dataset):
    def __init__(self, pair, transform=None):
        embedding_ref = torch.from_numpy(
            np.load(f"corpus/{pair}/laser.reference_embeds.npy")
        )
        embedding_src = torch.from_numpy(
            np.load(f"corpus/{pair}/laser.source_embeds.npy")
        )
        embedding_trn = torch.from_numpy(
            np.load(f"corpus/{pair}/laser.translation_embeds.npy")
        )
        self.embedding = (
            torch.stack((embedding_src, embedding_ref, embedding_trn), 1)
            .unsqueeze(1)
            .float()
        )
        self.score = torch.tensor(scores[pair]["z-score"]).float().unsqueeze(1)
        self.transform = transform

    def __len__(self):
        return len(self.score)

    def __getitem__(self, idx):
        emb = self.embedding[idx]
        classification = self.score[idx]
        if self.transform:
            embedding = self.transform(emb)
        sample = [emb, classification]
        return sample


In [16]:
model = Model()

train_ds = WordsDataset(pair)
print(f"Embedding shape: {train_ds.embedding.shape}")
print(f"Score shape: {train_ds.score.shape}")

train_loader = DataLoader(train_ds, batch_size=32, num_workers=12)

# Initialize a trainer
trainer = pl.Trainer(gpus=1, max_epochs=3, progress_bar_refresh_rate=10)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


Embedding shape: torch.Size([11585, 1, 3, 1024])
Score shape: torch.Size([11585, 1])


In [17]:
# Train the model ⚡
trainer.fit(model, train_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type      | Params
---------------------------------------
0 | conv1    | Conv2d    | 832   
1 | pool1    | MaxPool2d | 0     
2 | linear1  | Linear    | 8.2 K 
3 | dropout2 | Dropout   | 0     
4 | flatten  | Flatten   | 0     
---------------------------------------
9.0 K     Trainable params
0         Non-trainable params
9.0 K     Total params
0.036     Total estimated model params size (MB)


Epoch 2: 100%|██████████| 363/363 [00:02<00:00, 153.03it/s, loss=1.67, v_num=85]


In [18]:
model(x)

tensor([[-1.],
        [-1.],
        [-1.],
        ...,
        [-1.],
        [-1.],
        [-1.]], grad_fn=<TanhBackward>)