In [1]:
# Importing the relevant modules
from transformers import BertTokenizer, BertModel, BertConfig
import pandas as pd
import numpy as np
import torch
import tqdm, json
#config = (vocab_size = 30522, hidden_size = 768,
#          num_hidden_layers = 12, num_attention_heads = 12,
#          intermediate_size = 3072, hidden_act = 'gelu',
#          hidden_dropout_prob = 0.1. attention_probs_dropout_prob = 0.1,
#          max_position_embeddings = 512, type_vocab_size = 2, initializer_range = 0.02,
#          layer_norm_eps = 1e-12, pad_token_id = 0, 
#          position_embedding_type = 'absolute', use_cache = True,
#          classifier_dropout = None**kwargs )

hidden_size = 300
#makes inference faster
configuration = BertConfig(intermediate_size = 2048,  output_hidden_states = True,
                           hidden_size = hidden_size, num_hidden_layers = 8)
#delete .cuda() if you don't have a good graphics card/it isn't configured to work with training
model = BertModel(configuration).cuda()

# This is the same tokenizer that
# was used in the model to generate
# embeddings to ensure consistency
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

2021-12-14 00:32:14.573880: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-14 00:32:14.574257: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2021-12-14 00:32:14.574266: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2021-12-14 00:32:14.574480: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN

In [2]:
def bert_text_preparation(text, tokenizer = tokenizer):
    """Preparing the input for BERT
    
    Takes a string argument and performs
    pre-processing like adding special tokens,
    tokenization, tokens to ids, and tokens to
    segment ids. All tokens are mapped to seg-
    ment id = 1.
    
    Args:
        text (str): Text to be converted
        tokenizer (obj): Tokenizer object
            to convert text into BERT-re-
            adable tokens and ids
        
    Returns:
        list: List of BERT-readable tokens
        obj: Torch tensor with token ids
        obj: Torch tensor segment ids
    
    
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors

In [3]:
filename = '../yelp_academic_dataset_review.json'
reviews = []
with open(filename, 'rt') as f:
        for line in tqdm.tqdm(f):
            data = json.loads(line)

            reviews.append({
                key: data[key]
                for key in ['review_id', 'user_id', 'business_id', 'stars', "text"]
            })
            #rapid development
            if len(reviews) > 3: break
review_df = pd.DataFrame(reviews)
del reviews

for cat in review_df.columns:
    if cat != "stars":
        review_df[cat] = review_df[cat].astype("string")

3it [00:00, 17142.93it/s]


In [4]:
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Get embeddings from an embedding model
    
    Args:
        tokens_tensor (obj): Torch tensor size [n_tokens]
            with token ids for each token in text
        segments_tensors (obj): Torch tensor size [n_tokens]
            with segment ids for each token in text
        model (obj): Embedding model to generate embeddings
            from token and segment ids
    
    Returns:
        list: List of list of floats of size
            [n_tokens, n_embedding_dimensions]
            containing embeddings for each token
    
    """
    # Gradient calculation id disabled
    # Model is in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

#needed to pad sequences :P
text = "[SEP]"
tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)
SEP_EMBEDDING = get_bert_embeddings(tokens_tensor.cuda(), segments_tensors.cuda(), model)[0]

def convert_sentence_to_list_embeddings(text, pad_to, tokenizer = tokenizer, seq_length = 512):
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)
    ret = []
    tokens = chunks(tokens_tensor[0].numpy(), seq_length)
    segments = chunks(segments_tensors[0].numpy(), seq_length)
    for tokens_, segments_ in zip(tokens, segments):
        ret+= get_bert_embeddings(torch.Tensor([tokens_]).int().cuda(),
                                  torch.tensor([segments_]).int().cuda(), model)
    ret += [SEP_EMBEDDING]*(pad_to - len(ret))
    return np.array(ret)

In [5]:
import tensorflow
class MyCorpusSentsTagged(tensorflow.keras.utils.Sequence):
    def __init__(self, batch_size):
        self.batch_size = batch_size
    
    def __len__(self):
        return len(review_df) // self.batch_size
    
    def __getitem__(self, index):
        X = review_df.iloc[index * self.batch_size:(index + 1) * self.batch_size]["seq"].values
        Y = review_df["stars"].values[index * self.batch_size:(index + 1) * self.batch_size]
        X = np.array([i for i in X])
        return X, Y/5

In [6]:
import warnings

#get pad size
max_size = 0
for text in tqdm.tqdm(review_df["text"]):
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)
    max_size = max(max_size, len(tokenized_text))
    #create new df
warnings.filterwarnings("ignore")
review_df["seq"] = review_df["text"].apply(lambda x: convert_sentence_to_list_embeddings(x, pad_to = max_size))

100%|██████████| 4/4 [00:00<00:00, 437.08it/s]


In [69]:
review_df["max"] = review_df["seq"].apply(lambda x: np.max(x))
review_df["min"] = review_df["seq"].apply(lambda x: np.min(x))
MIN = np.min(review_df["min"].values)
MAX = np.max(review_df["max"].values)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.data_min_ = MIN
scaler.data_max_ = MAX
scaler.min_ = .01
scaler.scale_ = .5
scaler.transform(review_df["seq"][0])

array([[-0.67096435, -0.22226565,  0.57651962, ..., -0.08539443,
        -0.21303857,  0.00731292],
       [-0.11436866, -0.26132988,  0.15417067, ...,  0.26289273,
         0.02030965, -0.26024677],
       [-0.49506043, -0.1133729 ,  0.76526106, ...,  0.81353719,
         0.74180205, -0.20380036],
       ...,
       [-0.38710659, -0.44933628,  0.55043239, ...,  0.27131868,
        -0.42394756, -0.02236621],
       [-0.59919821, -0.20126078,  0.28864319, ...,  0.44738151,
        -0.07630822, -0.26450109],
       [-0.62032728, -0.51285725, -0.10072138, ..., -0.20616235,
         0.91448201,  0.24781478]])

array([[-1.1619287 , -0.2645313 ,  1.33303924, ...,  0.00921113,
        -0.24607714,  0.19462583],
       [-0.04873732, -0.34265976,  0.48834134, ...,  0.70578547,
         0.22061931, -0.34049355],
       [-0.81012087, -0.0467458 ,  1.71052213, ...,  1.80707438,
         1.66360409, -0.22760071],
       ...,
       [-0.59421318, -0.71867256,  1.28086479, ...,  0.72263737,
        -0.66789513,  0.13526758],
       [-1.01839643, -0.22252156,  0.75728638, ...,  1.07476301,
         0.02738356, -0.34900217],
       [-1.06065457, -0.8457145 , -0.02144276, ..., -0.23232471,
         2.00896401,  0.67562957]])

In [44]:
MIN

-4.371129035949707

In [8]:
from keras import metrics
import keras
from keras.regularizers import L1L2
import tensorflow as tf
import os

epochs = 10

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=1e-2, patience=5, verbose=0, mode='auto',
    baseline=None, restore_best_weights=True)

batch_size = 128
data_generator = MyCorpusSentsTagged(batch_size)

model = keras.Sequential()
model.add(keras.layers.LSTM(75, input_shape = (max_size, hidden_size), return_sequences=True, kernel_regularizer=L1L2(l1=0.01, l2=0.01)))
model.add(keras.layers.LSTM(75, kernel_initializer='he_uniform'))
model.add(keras.layers.Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(data_generator, epochs=epochs, verbose=2)

print(model.summary())

Epoch 1/10
50/50 - 135s - loss: 31.1483 - 135s/epoch - 3s/step
Epoch 2/10
50/50 - 180s - loss: 7.8346 - 180s/epoch - 4s/step
Epoch 3/10
50/50 - 227s - loss: 1.5103 - 227s/epoch - 5s/step
Epoch 4/10
50/50 - 248s - loss: 0.6775 - 248s/epoch - 5s/step
Epoch 5/10
50/50 - 282s - loss: 0.4291 - 282s/epoch - 6s/step
Epoch 6/10
50/50 - 310s - loss: 0.3199 - 310s/epoch - 6s/step
Epoch 7/10
50/50 - 354s - loss: 0.2688 - 354s/epoch - 7s/step
Epoch 8/10
50/50 - 355s - loss: 0.2401 - 355s/epoch - 7s/step
Epoch 9/10
50/50 - 327s - loss: 0.2172 - 327s/epoch - 7s/step
Epoch 10/10
50/50 - 296s - loss: 0.2055 - 296s/epoch - 6s/step
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1158, 75)          112800    
                                                                 
 lstm_1 (LSTM)               (None, 75)                45300     
                            

In [10]:
X = review_df["seq"].values[:30]
Y = review_df["stars"].values[:30]
X = np.array([i for i in X])
print(model.predict(X[:10]))
print(Y[:10])


[[0.7858824 ]
 [0.7858822 ]
 [0.78588223]
 [0.7858823 ]
 [0.78588223]
 [0.7858823 ]
 [0.7858824 ]
 [0.78588223]
 [0.7858822 ]
 [0.7858822 ]]
[4. 4. 5. 2. 4. 1. 2. 5. 4. 5.]


In [9]:
"""from keras import metrics
import keras
import tensorflow as tf
import os

X = test_df["seq"].values
Y = test_df["stars"].values
X = np.array([i for i in X])
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(Y.reshape(-1,1))
Y = enc.transform(Y.reshape(-1,1)).toarray()

epochs = 2
batch_size = 32
window_length = 20

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=1e-2, patience=5, verbose=0, mode='auto',
    baseline=None, restore_best_weights=True)


model = keras.Sequential()
#model.add(keras.layers.LSTM(100, dropout=0.1, recurrent_dropout=0.1, return_sequences=True))
model.add(keras.layers.Dense(50))
model.add(keras.layers.LSTM(50, kernel_initializer='he_uniform', name='encoder_2'))
model.add(keras.layers.Dense(5))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X, Y, epochs=epochs, batch_size=batch_size, verbose=2)
print(model.summary())


print(model.predict(X[:20]))
print(Y[:20])"""

'from keras import metrics\nimport keras\nimport tensorflow as tf\nimport os\n\nX = test_df["seq"].values\nY = test_df["stars"].values\nX = np.array([i for i in X])\nfrom sklearn.preprocessing import OneHotEncoder\nenc = OneHotEncoder(handle_unknown=\'ignore\')\nenc.fit(Y.reshape(-1,1))\nY = enc.transform(Y.reshape(-1,1)).toarray()\n\nepochs = 2\nbatch_size = 32\nwindow_length = 20\n\nearly_stop = tf.keras.callbacks.EarlyStopping(\n    monitor=\'val_loss\', min_delta=1e-2, patience=5, verbose=0, mode=\'auto\',\n    baseline=None, restore_best_weights=True)\n\n\nmodel = keras.Sequential()\n#model.add(keras.layers.LSTM(100, dropout=0.1, recurrent_dropout=0.1, return_sequences=True))\nmodel.add(keras.layers.Dense(50))\nmodel.add(keras.layers.LSTM(50, kernel_initializer=\'he_uniform\', name=\'encoder_2\'))\nmodel.add(keras.layers.Dense(5))\nmodel.compile(loss=\'mean_squared_error\', optimizer=\'adam\')\nmodel.fit(X, Y, epochs=epochs, batch_size=batch_size, verbose=2)\nprint(model.summary()

In [5]:
X = 1

In [7]:
X

1