In [None]:
import os
import pickle
import torch
from torch import nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, random_split
from torchvision import transforms
import pytorch_lightning as pl
from tqdm.notebook import tqdm
import numpy as np

In [None]:
import os

# Set CUDA_VISIBLE_DEVICES to "0,1" for this process.
# NOTE(review): torch is already imported by the time this runs; presumably the
# setting still takes effect because no CUDA call has been made yet — confirm.
os.environ.update(CUDA_VISIBLE_DEVICES="0,1")

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Documents grouped by split name.
dataset = _load_pickle('conll_graph_all.pickle')
print(', '.join([split + f' : {len(dataset[split])}' for split in dataset]))

# One list of values per feature key; the position in the list becomes the id.
vocabulary = _load_pickle('vocabulary_all.pickle')
print(', '.join([key + f' : {len(vocabulary[key])}' for key in vocabulary]))
voc2id = {key: {l: i for i, l in enumerate(vocabulary[key])} for key in vocabulary}
print('Ismail ID: ',voc2id['word']['ismail'], )

# Label -> integer id, in the order the labels are stored in the pickle.
labels = _load_pickle('labels.pickle')
label2id = {l: i for i, l in enumerate(labels)}
print(label2id)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Multi-hot feature vectors and label ids for one split of `dataset`.

    Every document is encoded as the concatenation of seven vectors, in this
    order: word, left context, right context, pos, chunk, extra, classes.
    Each vector has one slot per vocabulary entry and holds 1.0 at the ids of
    the values present in the document. The two context vectors are indexed
    with the 'word' vocabulary.

    Args:
        dataset: mapping from split name to a sequence of documents. A document
            is indexed with 'word', 'pos', 'chunk', 'extra', 'classes',
            'left_context', 'right_context' and 'label'.
        split: key of `dataset` to encode.
        voc2id: mapping feature key -> {value: id}. The default is the
            module-level `voc2id` as it exists when the class is defined.
        label2id: mapping label -> integer id. The default is the module-level
            `label2id` as it exists when the class is defined.
        context: maximum number of tokens taken from the end of
            'left_context' and from the start of 'right_context', or 'all' to
            use both contexts in full for every document.

    Raises:
        ValueError: if `context` is a negative number.
        KeyError: if a document contains a value or label missing from the
            corresponding mapping.
    """

    # Order in which the per-key vectors are concatenated around the contexts.
    _HEAD_KEYS = ('word',)
    _TAIL_KEYS = ('pos', 'chunk', 'extra', 'classes')

    def __init__(self, dataset, split, voc2id=voc2id, label2id=label2id, context='all'):
        if context != 'all' and context < 0:
            raise ValueError(f"context must be 'all' or a non-negative integer, got {context!r}")

        X = []
        Y = []

        for doc in tqdm(dataset[split], desc=split.upper()):
            left_words = doc['left_context']
            right_words = doc['right_context']
            # `context` is deliberately never reassigned: the window is decided
            # per document, so 'all' means "everything" for each of them.
            if context != 'all':
                # A plain [-context:] slice would return the whole sequence for
                # context == 0, so that case is handled explicitly.
                left_words = left_words[-context:] if context else left_words[:0]
                right_words = right_words[:context]

            parts = [self._multi_hot(doc[key], voc2id[key]) for key in self._HEAD_KEYS]
            parts.append(self._multi_hot(left_words, voc2id['word']))
            parts.append(self._multi_hot(right_words, voc2id['word']))
            parts.extend(self._multi_hot(doc[key], voc2id[key]) for key in self._TAIL_KEYS)

            # from_numpy reuses the freshly built array instead of copying it.
            X.append(torch.from_numpy(np.concatenate(parts)))
            Y.append(label2id[doc['label']])

        self.X = X
        self.Y = Y
        self.X_len = len(X)
        self.voc2id = voc2id
        self.labels = sorted(label2id.keys())
        self.label2id = label2id

    @staticmethod
    def _multi_hot(values, value2id):
        """Return a float64 vector of len(value2id) with 1.0 at the id of each value."""
        vector = np.zeros(len(value2id))
        for value in values:
            vector[value2id[value]] = 1.
        return vector

    def __len__(self):
        """Number of encoded documents."""
        return self.X_len

    def __getitem__(self, index):
        """Return (features, label) for `index` as a copy of the stored vector and a scalar tensor."""
        # clone().detach() copies like torch.tensor(tensor) would, without the
        # copy-construct UserWarning that call emits on every item.
        x = self.X[index].clone().detach()
        y = torch.tensor(self.Y[index])

        return x, y

    def voc(self, key):
        """Return the {value: id} mapping stored for feature `key`."""
        return self.voc2id[key]

In [None]:
# Encode the three splits with at most 3 context tokens per side.
train_set, dev_set, test_set = [
    Dataset(dataset, split_name, context=3)
    for split_name in ('train', 'validation', 'test')
]

In [None]:
# Number of training examples left after setting 50,000 aside.
len(train_set) - 50_000

In [None]:
# Length of one encoded feature vector, read from the first training example.
input_dim = train_set[0][0].size(0)
print(input_dim)

In [None]:
# Set aside a fixed number of training examples. The training size is derived
# from the dataset so the lengths always sum to len(train_set), which
# random_split requires.
val_size = 50000
train, val = random_split(
    train_set,
    [len(train_set) - val_size, val_size],
    generator=torch.Generator().manual_seed(42),  # fixed seed: same split on every run
)

In [None]:
# All three splits chained into one dataset, in train, validation, test order.
train_data = torch.utils.data.ConcatDataset([train_set, dev_set, test_set])

In [None]:
class LitAutoEncoder(pl.LightningModule):
    """Fully connected autoencoder trained with an MSE reconstruction loss.

    The encoder is Linear(input_dim, embedding_dim) -> ReLU ->
    Linear(embedding_dim, embedding_dim); the decoder is
    Linear(embedding_dim, embedding_dim) -> ReLU -> Linear(embedding_dim, input_dim).

    Args:
        input_dim: length of the input vector, which is also the length of the
            reconstruction.
        embedding_dim: width of the hidden layers and of the embedding returned
            by `forward`.
    """

    def __init__(self, input_dim, embedding_dim=300):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(input_dim, embedding_dim),
                                     nn.ReLU(),
                                     nn.Linear(embedding_dim, embedding_dim))
        self.decoder = nn.Sequential(nn.Linear(embedding_dim, embedding_dim),
                                     nn.ReLU(),
                                     nn.Linear(embedding_dim, input_dim))

    def forward(self, x):
        """Return the encoder output for `x`; the decoder is not applied."""
        # Cast like the training path does, so non-float32 inputs are accepted.
        return self.encoder(x.float())

    def _reconstruction_loss(self, batch):
        """Encode and decode the features of `batch` and return the MSE against the input."""
        x, _ = batch  # the second element of the batch is not used by the loss
        x = x.float()
        x_hat = self.decoder(self.encoder(x))
        return F.mse_loss(x_hat, x)

    def training_step(self, batch, batch_idx):
        """Compute, log ('train_loss') and return the reconstruction loss of one batch."""
        loss = self._reconstruction_loss(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log ('val_loss') and return the reconstruction loss of one validation batch."""
        # Defined so that a validation dataloader given to the trainer is evaluated.
        loss = self._reconstruction_loss(batch)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [None]:
# Width of the autoencoder's hidden/embedding layers; also interpolated into the
# output folder name used by the later cells.
embedding_dim = 2_000

In [None]:
autoencoder = LitAutoEncoder(input_dim=input_dim, embedding_dim=embedding_dim)

trainer = pl.Trainer(max_epochs=15, gpus=1)

train_loader = DataLoader(train_data, batch_size=16)
val_loader = DataLoader(val, batch_size=1)

# Pass the dedicated validation loader (not the training loader) as the
# validation data.
# NOTE(review): `val` is drawn from train_set, which is also part of train_data,
# so this is not a held-out set.
trainer.fit(autoencoder, train_loader, val_loader)

In [None]:
# List the embedding output folder for the current embedding_dim.
# NOTE(review): this runs before the cell that creates the folder, so on a fresh
# machine `ls` presumably reports a missing directory — confirm the cell is needed.
! ls /data/graphner_embeddings/ae_emb_npy_{embedding_dim}_15epochs/

In [None]:
! mkdir /data/graphner_embeddings/ae_emb_npy_{embedding_dim}_15epochs/
! mkdir /data/graphner_embeddings/ae_emb_npy_{embedding_dim}_15epochs/train
! mkdir /data/graphner_embeddings/ae_emb_npy_{embedding_dim}_15epochs/validation
! mkdir /data/graphner_embeddings/ae_emb_npy_{embedding_dim}_15epochs/test

In [None]:
# List the embedding output folder for the current embedding_dim.
! ls /data/graphner_embeddings/ae_emb_npy_{embedding_dim}_15epochs/

In [None]:
%%time

for split, dataset in {'train': train_set, 'validation': dev_set, 'test': test_set}.items():
    for i, example in tqdm(enumerate(dataset), total=len(dataset)):
        embedding = autoencoder(example[0].float()).detach().numpy()
        label = example[1].item()
        # print(embedding.shape, label)
        pickle.dump((embedding, label), open(f'/data/graphner_embeddings/ae_emb_npy_{embedding_dim}_15epochs/{split}/{i}.pickle', 'wb'))

In [None]:
# Long listing (with sizes) of the autoencoder_embeddings_100_2epochs folder.
# NOTE(review): no cell in view writes to this folder; presumably it is left
# over from an earlier run — confirm.
! ls -lsh /data/graphner_embeddings/autoencoder_embeddings_100_2epochs/

In [None]:
# List the top-level contents of the embeddings root folder.
! ls /data/graphner_embeddings

In [None]:
! mkdir /data/graphner_embeddings/train

In [None]:
! mkdir /data/graphner_embeddings/evaluation

In [None]:
! mkdir /data/graphner_embeddings/test