In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import torch
import tqdm


In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# device = 'cpu'

In [4]:
# `os` is not imported by any earlier cell, so import it here; without this
# the cell raises NameError on a fresh kernel (Restart & Run All).
import os

# Make CUDA kernel launches synchronous so that a device-side failure is
# reported at the call that triggered it instead of at a later, unrelated call.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [5]:
from movie import MovieReviewsTextDataset

# Build the training dataset from the files under ./data/aclImdb/train using
# the project's dataset class. The cell output shows two progress bars
# ("pos Files" and "neg Files"), each over 12500 items.
train_dataset = MovieReviewsTextDataset(path='./data/aclImdb/train')
# The test split is currently not loaded.
# valid_dataset = MovieReviewsTextDataset(path='./data/aclImdb/test')

pos Files:   0%|          | 0/12500 [00:00<?, ?it/s]

neg Files:   0%|          | 0/12500 [00:00<?, ?it/s]

In [6]:
from movie import Lang

# Vocabulary named 'imdb_en', populated from the training texts only.
VOCAB = Lang('imdb_en')
# VOCAB.__dict__

# Feed every text held by the training dataset through Lang.addSentence.
for review_text in train_dataset.texts:
    VOCAB.addSentence(review_text)

In [7]:
from torch.utils.data import DataLoader


# Number of dataset items per mini-batch.
BATCH_SIZE = 128
# NOTE(review): shuffle=False keeps the dataset's own ordering on every pass.
# The dataset-loading cell reports "pos Files" before "neg Files"; if the items
# are stored in that order, batches are never mixed — confirm this is intended.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [8]:
# for batch_idx, batch in enumerate(train_loader):
#     print(batch)
#     break

In [9]:
from models.seq2seq.Encoder import Encoder
from models.seq2seq.Decoder import Decoder
from models.seq2seq.Seq2Seq import Seq2Seq
import torch.optim as optim
import torch.nn as nn


# Input and output vocabularies are the same: both span every word in VOCAB.
input_size = VOCAB.n_words
output_size = VOCAB.n_words

# Encoder hyperparameters.
encoder_emb_size = 32
encoder_hidden_size = 64
encoder_dropout = 0.2

# Decoder hyperparameters.
decoder_emb_size = 32
decoder_hidden_size = 64
decoder_dropout = 0.2

# Optimizer step size and the recurrent cell type handed to both sub-models.
learning_rate = 1e-3
model_type = "LSTM"

# NOTE(review): EPOCHS is not referenced by any other visible cell; the
# training cell makes a single pass over train_loader.
EPOCHS = 10
# Vocabulary index of the '<pad>' token; passed to the loss as ignore_index below.
PAD_IDX = VOCAB.word2index['<pad>']



encoder = Encoder(input_size, encoder_emb_size, encoder_hidden_size, decoder_hidden_size, dropout = encoder_dropout, model_type = model_type)
# NOTE(review): encoder_hidden_size is passed twice here and decoder_hidden_size
# is never passed. If Decoder's third positional parameter is the decoder hidden
# size, this only works because both sizes are 64 — confirm against
# models/seq2seq/Decoder.py.
decoder = Decoder(decoder_emb_size, encoder_hidden_size, encoder_hidden_size, output_size, dropout = decoder_dropout, model_type = model_type)
seq2seq_model = Seq2Seq(encoder, decoder, device)

optimizer = optim.Adam(seq2seq_model.parameters(), lr = learning_rate)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
# Targets equal to PAD_IDX are excluded from the cross-entropy loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [10]:

# `import tqdm` on its own does not load the `tqdm.notebook` submodule, so
# import it explicitly rather than relying on another module having done so.
import tqdm.notebook

# Sum of the per-batch losses, kept as a plain Python float (see below).
total_loss = 0.

encoder.train()
decoder.train()
seq2seq_model.train()

progress_bar = tqdm.notebook.tqdm(train_loader, ascii=True)
for batch_idx, batch in enumerate(progress_bar):

    # Transpose so that iterating the array yields one review per row
    # (assumes the loader collates batch['text'] position-major — TODO confirm).
    words = np.array(batch['text']).T
    # Map every word to its vocabulary index.
    token_ids = np.array([[VOCAB.word2index[word] for word in review] for review in words])

    # Force int64: NumPy's default integer width is platform-dependent, while
    # CrossEntropyLoss needs class-index targets of dtype torch.long.
    source = torch.from_numpy(token_ids).long().to(device)
    # The target is the input itself, flattened to one token id per position.
    target = source.reshape(-1)

    optimizer.zero_grad()
    reconstructed = seq2seq_model(source)
    # Collapse all leading dimensions so each row holds the scores for one position.
    reconstructed = reconstructed.reshape(-1, reconstructed.shape[-1])

    loss = criterion(reconstructed, target)
    loss.backward()
    optimizer.step()

    # .item() gives a detached Python float; adding the tensor itself would keep
    # every batch's autograd history reachable through total_loss.
    batch_loss = loss.item()
    total_loss += batch_loss
    progress_bar.set_description_str("Batch: %d, Loss: %.4f" % ((batch_idx + 1), batch_loss))

# Mean loss per batch over the whole pass.
print(total_loss / len(train_loader))

  0%|          | 0/196 [00:00<?, ?it/s]

RuntimeError: CUDA error: unknown error

In [None]:
import tqdm

def translate(model, dataloader):
    """Run `model` on the first batch of `dataloader` and return its output.

    NOTE(review): this function reads `english_vocabulary`, `activation` and
    `h1`, none of which is defined in any visible cell of this notebook, so it
    cannot currently run to completion here. It also iterates
    `dataloader.batches`; torch's DataLoader does not appear to expose such an
    attribute — confirm which loader type is intended.

    Args:
        model: Callable applied to the encoded batch tensor.
        dataloader: Object exposing a `.batches` iterable; each batch is a
            sequence of examples that can be indexed with 'text'.

    Returns:
        A `(translation, activation_output)` tuple for the first batch only
        (the `return` sits inside the loop), or None when there are no batches.
    """
    # model.eval()
    with torch.no_grad():
        # Get the progress bar 
        print("Start")
        # `tqdm` is bound to the module (`import tqdm`), so the progress-bar
        # class must be reached as `tqdm.tqdm`; calling the module itself
        # raises TypeError.
        progress_bar = tqdm.tqdm(dataloader.batches, ascii = True)
        for batch_idx, data in enumerate(progress_bar):
            print(batch_idx)
            source = np.array([example['text'] for example in data])
            # WORK!
            #labels = np.array([example['labels'] for example in data])
            print(source[0:9])
            # Look up each token of each row in the (externally defined) vocabulary.
            source_encoded = np.array([list(map(lambda x: english_vocabulary.vocab.stoi[x], source[i])) for i in range(source.shape[0])])
            #print(f"Source: {source}")
            print(f"Source: {source.shape}")
            print(f"Source: {source_encoded.shape}")
            #target = data.trg.transpose(1,0)
            #print(source_encoded)
            source_torch = torch.from_numpy(source_encoded).to(device)
            print(source_torch)
            translation = model(source_torch)
            print(f"Activation: {activation}")
            # `activation` and `h1` are expected to exist outside this function
            # (presumably a forward-hook capture dict and its handle — verify).
            activation_output = activation['norm_mh2']
            h1.remove()
            return translation, activation_output

# NOTE(review): None is passed as the model, yet translate() invokes
# `model(source_torch)` — this call cannot succeed as written.
translate(None, train_loader)

In [None]:
# `pickle` is not imported by any earlier cell; import it here so the cell
# runs on a fresh kernel instead of raising NameError.
import pickle

# SECURITY: unpickling executes arbitrary code embedded in the file. Only load
# this file if it comes from a trusted source.
with open('data/baseline_data_for_cluster.p', 'rb') as infile:
    baseline = pickle.load(infile)

baseline

In [None]:
import os



pos_dir = './data/aclImdb/train/pos'
neg_dir = './data/aclImdb/train/neg'

# One row per review file: [split, id, score, label, review text].
# The file name minus its 4-character extension is split on '_' to fill the
# 'id' and 'score' columns (i.e. names are assumed to look like '<id>_<score>.txt').
train_rows = []
for label, review_dir in (('pos', pos_dir), ('neg', neg_dir)):
    for fname in os.listdir(review_dir):
        # Read inside `with` so every handle is closed promptly, and read the
        # whole file as one string so an empty or multi-line file still yields
        # exactly one 'review' value instead of a ragged row. The encoding is
        # explicit so the result does not depend on the platform default
        # (assumes the review files are UTF-8 — TODO confirm).
        with open(f'{review_dir}/{fname}', encoding='utf-8') as review_file:
            review_text = review_file.read()
        train_rows.append(['train'] + fname[:-4].split('_') + [label, review_text])

train_df = pd.DataFrame(
    train_rows,
    columns=['type', 'id', 'score', 'label', 'review']
) \
.astype({
    'id': int,
    'score': int,
    'label': str,
    'review': str
})



train_df.head()


In [None]:
# Display the (rows, columns) shape of the training frame.
train_df.shape

In [None]:


pos_dir = './data/aclImdb/test/pos'
neg_dir = './data/aclImdb/test/neg'

# One row per review file: [split, id, score, label, review text].
# The file name minus its 4-character extension is split on '_' to fill the
# 'id' and 'score' columns (i.e. names are assumed to look like '<id>_<score>.txt').
test_rows = []
for label, review_dir in (('pos', pos_dir), ('neg', neg_dir)):
    for fname in os.listdir(review_dir):
        # Read inside `with` so every handle is closed promptly, and read the
        # whole file as one string so an empty or multi-line file still yields
        # exactly one 'review' value instead of a ragged row. The encoding is
        # explicit so the result does not depend on the platform default
        # (assumes the review files are UTF-8 — TODO confirm).
        with open(f'{review_dir}/{fname}', encoding='utf-8') as review_file:
            review_text = review_file.read()
        test_rows.append(['test'] + fname[:-4].split('_') + [label, review_text])

test_df = pd.DataFrame(
    test_rows,
    columns=['type', 'id', 'score', 'label', 'review']
) \
.astype({
    'id': int,
    'score': int,
    'label': str,
    'review': str
})



test_df.head()

In [None]:
# Persist both splits as CSV files, leaving out the DataFrame index column.
train_df.to_csv(path_or_buf='./data/imdb_train.csv', index=False)
test_df.to_csv(path_or_buf='./data/imdb_test.csv', index=False)