In [1]:
# Importing the relevant modules, also here's the inspo: https://colab.research.google.com/github/dlmacedo/starter-academic/blob/master/content/courses/deeplearning/notebooks/pytorch/Time_Series_Prediction_with_LSTM_Using_PyTorch.ipynb#scrollTo=vIWvJCpOVmwU
from transformers import BertTokenizer, BertModel, BertConfig
import pandas as pd
import numpy as np
import torch
import tqdm, json
#config = (vocab_size = 30522, hidden_size = 768,
#          num_hidden_layers = 12, num_attention_heads = 12,
#          intermediate_size = 3072, hidden_act = 'gelu',
#          hidden_dropout_prob = 0.1. attention_probs_dropout_prob = 0.1,
#          max_position_embeddings = 512, type_vocab_size = 2, initializer_range = 0.02,
#          layer_norm_eps = 1e-12, pad_token_id = 0, 
#          position_embedding_type = 'absolute', use_cache = True,
#          classifier_dropout = None**kwargs )

#makes inferrence faster
configuration = BertConfig(intermediate_size = 2048,  output_hidden_states = True,
                           hidden_size = 516, num_hidden_layers = 8)
#delete .cuda() if you don't have a good graphics card/it isn't configured to work with training
model = BertModel(configuration).cuda()

# This is the same tokenizer that
# was used in the model to generate
# embeddings to ensure consistency
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

2021-12-07 20:08:48.911396: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-12-07 20:08:48.911687: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-07 20:08:48.914550: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [2]:
def bert_text_preparation(text, tokenizer):
    """Preparing the input for BERT
    
    Takes a string argument and performs
    pre-processing like adding special tokens,
    tokenization, tokens to ids, and tokens to
    segment ids. All tokens are mapped to seg-
    ment id = 1.
    
    Args:
        text (str): Text to be converted
        tokenizer (obj): Tokenizer object
            to convert text into BERT-re-
            adable tokens and ids
        
    Returns:
        list: List of BERT-readable tokens
        obj: Torch tensor with token ids
        obj: Torch tensor segment ids
    
    
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors

In [4]:
filename = '../yelp_academic_dataset_review.json'
reviews = []
with open(filename, 'rt') as f:
        for line in tqdm.tqdm(f):
            data = json.loads(line)

            reviews.append({
                key: data[key]
                for key in ['review_id', 'user_id', 'business_id', 'stars', "text"]
            })
            #rapid development
            if len(reviews) > 100: break
review_df = pd.DataFrame(reviews)
del reviews

for cat in review_df.columns:
    if cat != "stars":
        review_df[cat] = review_df[cat].astype("string")

100it [00:00, 54584.90it/s]


In [20]:
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Get embeddings from an embedding model
    
    Args:
        tokens_tensor (obj): Torch tensor size [n_tokens]
            with token ids for each token in text
        segments_tensors (obj): Torch tensor size [n_tokens]
            with segment ids for each token in text
        model (obj): Embedding model to generate embeddings
            from token and segment ids
    
    Returns:
        list: List of list of floats of size
            [n_tokens, n_embedding_dimensions]
            containing embeddings for each token
    
    """
    # Gradient calculation id disabled
    # Model is in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

#needed to pad sequences :P
text = "[SEP]"
tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)
SEP_EMBEDDING = get_bert_embeddings(tokens_tensor.cuda(), segments_tensors.cuda(), model)[0]

def convert_sentence_to_list_embeddings(text, pad_to, tokenizer = tokenizer, seq_length = 512):
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)
    ret = []
    tokens = chunks(tokens_tensor[0].numpy(), seq_length)
    segments = chunks(segments_tensors[0].numpy(), seq_length)
    for tokens_, segments_ in zip(tokens, segments):
        ret+= get_bert_embeddings(torch.Tensor([tokens_]).int().cuda(),
                                  torch.tensor([segments_]).int().cuda(), model)
    ret += [SEP_EMBEDDING]*(pad_to - len(ret))
    return np.array(ret)

In [26]:
import warnings
warnings.filterwarnings("ignore")
test_df = review_df.head(20)

#get pad size
max_size = 0
for text in test_df["text"]:
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)
    max_size = max(max_size, len(tokenized_text))

#create new df
test_df["seq"] = test_df["text"].apply(lambda x: convert_sentence_to_list_embeddings(x, pad_to = max_size))

In [41]:
import torch
import torch.nn as nn
from torch.autograd import Variable
class LSTM(nn.Module):

    def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_length):
        super(LSTM, self).__init__()
        
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        print(x)
        h_0 = Variable(torch.zeros(
            self.num_layers, x.size(0), self.hidden_size))
        
        c_0 = Variable(torch.zeros(
            self.num_layers, x.size(0), self.hidden_size))
        
        # Propagate input through LSTM
        ula, (h_out, _) = self.lstm(x, (h_0, c_0))
        
        h_out = h_out.view(-1, self.hidden_size)
        
        out = self.fc(h_out)
        
        return out

In [42]:
num_epochs = 2000
learning_rate = 0.01

input_size = 516
hidden_size = 256
num_layers = 1

num_classes = 1

lstm = LSTM(num_classes, input_size, hidden_size, num_layers, max_size)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
    outputs = lstm(test_df["seq"].values)
    optimizer.zero_grad()
    
    # obtain the loss function
    loss = criterion(outputs, trainY)
    
    loss.backward()
    
    optimizer.step()
    if epoch % 100 == 0:
      print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))

[array([[ 1.74673581, -1.1778357 ,  1.12215698, ...,  0.48197943,
          0.83313698, -0.45976985],
        [ 2.36452603, -0.53093255,  0.18928151, ...,  0.55637705,
          0.0851301 ,  0.00905598],
        [ 0.64944446,  0.37697053,  0.35114574, ...,  0.0204777 ,
          1.22808337, -0.59115922],
        ...,
        [ 0.79090834, -0.40771866, -0.15133946, ..., -0.44330767,
          0.4001312 , -1.43794405],
        [ 0.01071478,  0.57925272,  1.22340786, ...,  1.7247678 ,
          0.47757462, -0.51489353],
        [ 2.10566473,  0.40637681,  0.65402728, ..., -0.15148453,
          0.25683278, -0.47660798]])
 array([[ 1.92555225, -0.86355537,  1.78062141, ...,  1.08589649,
          0.45323732, -0.37013352],
        [ 1.96811426, -1.5662303 ,  0.15587023, ...,  1.18492961,
          0.63843763, -0.90759379],
        [ 1.10643566, -0.41290733, -0.68686706, ..., -0.25571406,
          1.39038324,  0.09403047],
        ...,
        [ 1.67387819, -0.91761404,  0.90260994, ...,  1

TypeError: 'int' object is not callable