In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
import tqdm.notebook

In [3]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [4]:
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel launches
# synchronously, so a failing kernel is reported at the call that caused it.
os.environ.update(CUDA_LAUNCH_BLOCKING='1')

In [5]:
from movie import MovieReviewsTextDataset

# Build the training and validation datasets from the aclImdb train/test
# folders, in that order, with the same max_len for both.
train_dataset, valid_dataset = (
    MovieReviewsTextDataset(path=split_path, max_len=25)
    for split_path in ('./data/aclImdb/train', './data/aclImdb/test')
)

pos Files:   0%|          | 0/12500 [00:00<?, ?it/s]

neg Files:   0%|          | 0/12500 [00:00<?, ?it/s]

pos Files:   0%|          | 0/12500 [00:00<?, ?it/s]

neg Files:   0%|          | 0/12500 [00:00<?, ?it/s]

In [6]:
from movie import Lang

# One vocabulary shared by both splits. The validation texts are registered
# too because train()/validate() index VOCAB.word2index directly, so any
# token missing from the vocabulary would raise a KeyError there.
VOCAB = Lang('imdb_en')

for split in (train_dataset, valid_dataset):
    for t in split.texts:
        VOCAB.addSentence(t)

In [7]:
from torch.utils.data import DataLoader


BATCH_SIZE = 128

# Both loaders walk their dataset in stored order (shuffle=False).
# NOTE(review): training batches are therefore identical every epoch —
# confirm that leaving the training loader unshuffled is intentional.
train_loader, valid_loader = (
    DataLoader(split, batch_size=BATCH_SIZE, shuffle=False)
    for split in (train_dataset, valid_dataset)
)

In [8]:
# for batch_idx, batch in enumerate(train_loader):
#     print(batch)
#     break

In [9]:


# The model reads and emits tokens from the same vocabulary.
input_size = output_size = VOCAB.n_words

# Encoder / decoder layer sizes and dropout rates.
encoder_emb_size, encoder_hidden_size, encoder_dropout = 32, 64, 0.2
decoder_emb_size, decoder_hidden_size, decoder_dropout = 32, 64, 0.2

# Optimisation settings and the recurrent cell type handed to Encoder/Decoder.
learning_rate = 1e-3
EPOCHS = 10
model_type = "LSTM"

# Index of the padding token; used as ignore_index when the loss is built.
PAD_IDX = VOCAB.word2index['<pad>']






In [10]:
# encoder_emb_size = 128
# encoder_hidden_size = 64
# encoder_dropout = 0.1

# decoder_emb_size = 32
# decoder_hidden_size = 64
# decoder_dropout = 0.1

# learning_rate = 0.0004
# model_type = "LSTM"

# EPOCHS = 10

In [11]:
from models.seq2seq.Encoder import Encoder
from models.seq2seq.Decoder import Decoder
from models.seq2seq.Seq2Seq import Seq2Seq


# Assemble the encoder/decoder pair from the hyperparameters defined above.
encoder = Encoder(
    input_size,
    encoder_emb_size,
    encoder_hidden_size,
    decoder_hidden_size,
    dropout=encoder_dropout,
    model_type=model_type,
)
# NOTE(review): encoder_hidden_size is passed for both the 2nd and 3rd
# positional arguments. If the 3rd is meant to be the decoder's hidden size,
# this only works because both sizes are currently 64 — confirm against the
# Decoder signature.
decoder = Decoder(
    decoder_emb_size,
    encoder_hidden_size,
    encoder_hidden_size,
    output_size,
    dropout=decoder_dropout,
    model_type=model_type,
)
seq2seq_model = Seq2Seq(encoder, decoder, device)

In [12]:
# from autoencoder import AutoEncoder

# autoencoder = AutoEncoder(VOCAB.n_words, 32, 27, 3, PAD_IDX)
# autoencoder.to(device)

In [13]:
# Cross-entropy over the vocabulary; target positions equal to PAD_IDX are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Adam over every model parameter, with a plateau scheduler (default settings)
# attached to it; the scheduler is stepped once per epoch in the training loop.
optimizer = optim.Adam(seq2seq_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer)

In [14]:
def train(model, loader, criterion):
    """Run one training epoch of the sequence autoencoder.

    Every batch's ``'text'`` entry is mapped to vocabulary indices through the
    notebook-level ``VOCAB`` and used both as the model input and as the
    reconstruction target. Weights are updated with the notebook-level
    ``optimizer`` (read from the enclosing scope, not passed in), and tensors
    are placed on the notebook-level ``device``.

    Args:
        model: Module called as ``model(source)``. Its output is flattened to
            ``(-1, last_dim)`` before being scored.
        loader: Iterable of batches indexable by ``'text'``; must support
            ``len()``.
        criterion: Callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.

    Returns:
        Tuple ``(total_loss, total_loss / len(loader))``: the summed batch
        losses and their per-batch mean. Both are detached tensors, so callers
        can still use ``.item()`` on them.

    Raises:
        KeyError: If a token is not present in ``VOCAB.word2index``.
        ZeroDivisionError: If ``loader`` yields no batches.
    """
    total_loss = 0.

    model.train()

    progress_bar = tqdm.notebook.tqdm(loader, ascii=True)
    for batch_idx, batch in enumerate(progress_bar):
        # Assumes batch['text'] arrives position-major, so the transpose gives
        # one row of tokens per review — TODO confirm against the dataset.
        tokens = np.array(batch['text']).T
        indices = np.array([[VOCAB.word2index[word] for word in review] for review in tokens])

        # Autoencoder: the input sequence is also the reconstruction target.
        source = torch.tensor(indices).to(device)
        target = torch.tensor(indices).to(device).view(-1)

        optimizer.zero_grad()
        reconstructed = model(source)
        reconstructed = reconstructed.view(-1, reconstructed.shape[-1])

        loss = criterion(reconstructed, target)
        loss.backward()
        optimizer.step()

        # Accumulate a detached copy: adding the raw loss would keep every
        # batch's autograd history reachable through the running total (and
        # through the values returned to the caller).
        total_loss += loss.detach()
        progress_bar.set_description_str("Batch: %d, Loss: %.4f" % ((batch_idx + 1), loss.item()))

    return total_loss, total_loss / len(loader)

In [15]:
def validate(model, loader, criterion):
    """Score the model on ``loader`` without updating any weights.

    Mirrors the training pass: each batch's ``'text'`` entry is converted to
    vocabulary indices via the notebook-level ``VOCAB`` and serves as both
    input and reconstruction target. The model is put in eval mode and the
    whole pass runs under ``torch.no_grad()``.

    Args:
        model: Module called as ``model(source)``.
        loader: Iterable of batches indexable by ``'text'``; must support
            ``len()``.
        criterion: Callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.

    Returns:
        Tuple ``(summed_loss, summed_loss / len(loader))``.
    """
    model.eval()
    running_loss = 0.

    with torch.no_grad():
        bar = tqdm.notebook.tqdm(loader, ascii=True)
        for step, batch in enumerate(bar, start=1):
            # One row of tokens per review after the transpose
            # (assumes a position-major batch layout — TODO confirm).
            tokens = np.array(batch['text']).T
            indices = np.array([[VOCAB.word2index[token] for token in review] for review in tokens])

            source = torch.tensor(indices, requires_grad=False).to(device)
            target = torch.tensor(indices, requires_grad=False).to(device)

            logits = model(source)
            flat_logits = logits.view(-1, logits.shape[-1])
            flat_target = target.view(-1)

            batch_loss = criterion(flat_logits, flat_target)
            running_loss += batch_loss

            bar.set_description_str("Batch: %d, Loss: %.4f" % (step, batch_loss.item()))

    return running_loss, running_loss / len(loader)

In [16]:
# Per-epoch history: epoch index -> average train / validation loss.
metrics = dict()

for epoch in range(EPOCHS):

    train_loss, avg_train_loss = train(seq2seq_model, train_loader, criterion)
    # NOTE(review): the plateau scheduler is stepped on the summed *training*
    # loss. If the learning rate should react to held-out performance instead,
    # step it on the validation loss — confirm the intent.
    scheduler.step(train_loss)

    val_loss, avg_val_loss = validate(seq2seq_model, valid_loader, criterion)

    print(f'Train loss: {avg_train_loss.item()} ||| Validation loss: {avg_val_loss.item()}')

    # Keep detached CPU copies in the history so the dict never pins an
    # epoch's autograd graph or device memory; the values remain tensors,
    # so `.item()` still works on them.
    metrics[epoch] = {
        'avg_train_loss': avg_train_loss.detach().cpu(),
        'avg_val_loss': avg_val_loss.detach().cpu(),
    }

  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

Train loss: 7.600804805755615 ||| Validation loss: 6.437191963195801


  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

Train loss: 6.367506980895996 ||| Validation loss: 6.426760196685791


  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

Train loss: 6.341692924499512 ||| Validation loss: 6.422240257263184


  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

Train loss: 6.323130130767822 ||| Validation loss: 6.418086528778076


  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

Train loss: 6.311318874359131 ||| Validation loss: 6.4168500900268555


  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

Train loss: 6.282530784606934 ||| Validation loss: 6.380202770233154


  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

Train loss: 6.253733158111572 ||| Validation loss: 6.384605884552002


  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

Train loss: 6.2383856773376465 ||| Validation loss: 6.356140613555908


  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

Train loss: 6.2215189933776855 ||| Validation loss: 6.4284563064575195


  0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/196 [00:00<?, ?it/s]

Train loss: 6.231274127960205 ||| Validation loss: 6.388311862945557


In [None]:
# Recorded losses (not from the run shown above): Train loss: 7.483979225158691 ||| Validation loss: 6.462296485900879

In [None]:
# Load the previously saved baseline object for inspection.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
with open('data/baseline_data_for_cluster.p', mode='rb') as baseline_file:
    baseline = pickle.load(baseline_file)

baseline

In [None]:
import os



pos_dir = './data/aclImdb/train/pos'
neg_dir = './data/aclImdb/train/neg'

# Build one row per review file: [split, id, score, label, review text].
# File names are expected to look like '<id>_<score>.txt'.
train_rows = []
for label, review_dir in (('pos', pos_dir), ('neg', neg_dir)):
    # Sorted so the row order does not depend on the filesystem's listing order.
    for f in sorted(os.listdir(review_dir)):
        if not f.endswith('.txt'):
            continue  # ignore stray non-review entries in the folder
        # Unpacking fails loudly if a name does not split into exactly id and score.
        review_id, score = os.path.splitext(f)[0].split('_')
        # Read the whole file as a single string (a review that spans several
        # lines must still fill exactly one column) and close the handle promptly.
        with open(os.path.join(review_dir, f), encoding='utf-8') as review_file:
            review = review_file.read()
        train_rows.append(['train', review_id, score, label, review])

train_df = pd.DataFrame(
    train_rows,
    columns=['type', 'id', 'score', 'label', 'review']
) \
.astype({
    'id': int,
    'score': int,
    'label': str,
    'review': str
})



train_df.head()


In [None]:
# Quick sanity check: (number of reviews, number of columns) of the training frame.
train_df.shape

In [None]:


pos_dir = './data/aclImdb/test/pos'
neg_dir = './data/aclImdb/test/neg'

# Build one row per review file: [split, id, score, label, review text].
# File names are expected to look like '<id>_<score>.txt'.
test_rows = []
for label, review_dir in (('pos', pos_dir), ('neg', neg_dir)):
    # Sorted so the row order does not depend on the filesystem's listing order.
    for f in sorted(os.listdir(review_dir)):
        if not f.endswith('.txt'):
            continue  # ignore stray non-review entries in the folder
        # Unpacking fails loudly if a name does not split into exactly id and score.
        review_id, score = os.path.splitext(f)[0].split('_')
        # Read the whole file as a single string (a review that spans several
        # lines must still fill exactly one column) and close the handle promptly.
        with open(os.path.join(review_dir, f), encoding='utf-8') as review_file:
            review = review_file.read()
        test_rows.append(['test', review_id, score, label, review])

test_df = pd.DataFrame(
    test_rows,
    columns=['type', 'id', 'score', 'label', 'review']
) \
.astype({
    'id': int,
    'score': int,
    'label': str,
    'review': str
})



test_df.head()

In [None]:
# Persist both splits as CSV files, leaving out the DataFrame index column.
for frame, csv_path in (
    (train_df, './data/imdb_train.csv'),
    (test_df, './data/imdb_test.csv'),
):
    frame.to_csv(csv_path, index=False)