In [None]:
import os
import pickle
import numpy as np
import torch
from torch.utils.data import DataLoader
from core.train_utils import train_bert, test_bert
from core.dataset_utils import TextDatasetBert, TextDatasetLSTM
from core.model_utils import get_cate_keywords, embedding_from_pretrain, get_embeddings
from core.LSTM_model import ReviewClassifier
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict, Counter

#### Load preprocessed review data

In [4]:
# Load the preprocessed documents/labels and the pre-computed BERT encodings.
# NOTE(security): pickle can execute arbitrary code while loading; only open
# files that were produced by this project's own preprocessing step.
# Each file is opened in a `with` block so the handle is closed promptly, and
# the variables are bound by explicit unpacking rather than `exec`, so every
# name this cell defines is visible to readers and tooling.
with open(os.path.join('Data/processed', 'processed_data.pkl'), 'rb') as fh:
    save_file = pickle.load(fh)
var_names = ['train_docs', 'val_docs', 'test_docs', 'train_labels', 'val_labels', 'ind2label', 'label2ind']
# A missing key raises KeyError, as in the original exec-based lookup.
(train_docs, val_docs, test_docs,
 train_labels, val_labels,
 ind2label, label2ind) = (save_file[var_name] for var_name in var_names)

with open(os.path.join('Data/processed', 'processed_bert_emb.pkl'), 'rb') as fh:
    save_file = pickle.load(fh)
var_names = ['X_train_emb', 'X_val_emb', 'X_test_emb']
X_train_emb, X_val_emb, X_test_emb = (save_file[var_name] for var_name in var_names)

#### Train BERT-based model with center loss

In [None]:
# Wrap the pre-computed BERT encodings in dataset objects.
# For the test split an all-zero array, one entry per `input_ids` row, is
# passed in the label slot.
num_test_examples = len(X_test_emb['input_ids'])
train_dataset = TextDatasetBert(X_train_emb, train_labels)
val_dataset = TextDatasetBert(X_val_emb, val_labels)
test_dataset = TextDatasetBert(X_test_emb, np.zeros(num_test_examples))

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=1, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fine-tune for a single epoch.
# NOTE(review): alpha_CL / LR_CL presumably are the center-loss weight and its
# learning rate -- confirm against core.train_utils.train_bert.
model = train_bert(
    train_loader,
    val_loader,
    device,
    num_epoch=1,
    LR_Bert=1e-6,
    alpha_CL=0.1,
    LR_CL=10,
)

In [None]:
# Path under `models/` that is handed to test_bert as the model to evaluate.
best_model_name = os.path.join('models', 'best_bert_finetuned_model')

# Run the test loader through that model; ind2label is passed along with it.
test_bert(
    best_model_name,
    test_loader,
    ind2label,
    device,
)

#### Train LSTM-based model with attention layer and center loss on keywords

In [None]:
# Extract keywords per category from the training documents and labels.
cate_keywords = get_cate_keywords(train_docs, train_labels)

# Category index to visualise; change `ind` to inspect another category.
ind = 1
print(ind2label[ind])
wordcloud = WordCloud(width=800, height=500, background_color="white").generate(' '.join(cate_keywords[ind]))

# Display the generated image. Axis ticks are meaningless for a word cloud, so
# they are hidden, and plt.show() ends the cell so that the AxesImage repr is
# not printed as the cell's output.
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Count token occurrences over the train, validation and test documents.
word_cnts = Counter()
for split_docs in (train_docs, val_docs, test_docs):
    for tokens in split_docs:
        word_cnts.update(tokens)

# Vocabulary: the two special entries come first, followed by every token
# counted more than 3 times, in the order the counter first saw them.
ind2word = ['UNK', 'PAD'] + [token for token, freq in word_cnts.items() if freq > 3]

# Build the embedding matrix and the word -> index mapping from the GloVe file.
glove_path = os.path.join('embeddings', 'glove.42B.300d.txt')
emb_matrix, word2ind = get_embeddings(ind2word, glove_path)

In [None]:
# Rebuild the datasets for the LSTM model from the documents, all sharing the
# same word -> index mapping. As before, the test split gets an all-zero array
# in the label slot, one entry per test document.
num_test_docs = len(test_docs)
train_dataset = TextDatasetLSTM(train_docs, train_labels, word2ind=word2ind)
val_dataset = TextDatasetLSTM(val_docs, val_labels, word2ind=word2ind)
test_dataset = TextDatasetLSTM(test_docs, np.zeros(num_test_docs), word2ind=word2ind)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the attention-enabled classifier.
# NOTE(review): lstm_model_name=None / center_name=None presumably mean "start
# from scratch rather than loading saved weights" -- confirm in core.LSTM_model.
lstm_model = ReviewClassifier(
    emb_matrix,
    cate_keywords,
    word2ind,
    ind2label,
    use_attention=True,
    hidden_dim=100,
    num_labels=10,
    device=device,
    lstm_model_name=None,
    center_name=None,
)
lstm_model.train(train_loader, val_loader, lstm_lr=1e-4, num_epochs=100)

In [None]:
# Paths under `models/` for the saved LSTM weights and the saved centers.
lstm_model_name = os.path.join('models', 'kw_best_lstm')
center_model_name = os.path.join('models', 'kw_best_center')

# Rebuild the classifier from the saved files. use_attention=True is passed
# explicitly so the architecture matches the model trained in this notebook
# instead of depending on ReviewClassifier's default for that flag.
best_lstm_model = ReviewClassifier(
    emb_matrix,
    cate_keywords,
    word2ind,
    ind2label,
    use_attention=True,
    hidden_dim=100,
    num_labels=10,
    device=device,
    lstm_model_name=lstm_model_name,
    center_name=center_model_name,
)

# Evaluate on the validation split, then run on the test split.
best_lstm_model.validation(val_loader)
best_lstm_model.test(test_loader)