In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import pandas as pd
import numpy as np


In [2]:
# Build a 1-D string tensor from two Python strings and display its first
# element (a scalar string tensor).
const = tf.constant(["This is a string", "This is another string"])
const[0]


<tf.Tensor: shape=(), dtype=string, numpy=b'This is a string'>

In [3]:
# Star rating (1-5) -> sentiment label.
# Defined once at module level: the function below is applied row by row to
# the whole review frame, so rebuilding this table on every call is wasted
# work. The name also avoids shadowing the built-in `dict`.
_SENTIMENT_LABELS = {
    1: 'very negative',
    2: 'slightly negative',
    3: 'neutral',
    4: 'slightly positive',
    5: 'very positive'
}


def transform_sentiment(input):
    """Translate a star rating into its sentiment label.

    Parameters
    ----------
    input : int
        Star rating; must be one of 1, 2, 3, 4 or 5. The parameter name
        shadows the built-in ``input`` but is kept so that existing keyword
        callers keep working.

    Returns
    -------
    str
        One of 'very negative', 'slightly negative', 'neutral',
        'slightly positive' or 'very positive'.

    Raises
    ------
    KeyError
        If ``input`` is not a key of the rating table.
    """
    return _SENTIMENT_LABELS[input]


# Load the consolidated reviews and derive the helper columns in one chain,
# so every run of this cell starts again from the JSON file.
df = (
    pd.read_json('./Data/consolidated/final_2.json')
    # `columns=` alone identifies what to drop; no `axis` argument is needed.
    .drop(columns=['index'])
    .assign(
        # Constant 1 per row.
        count=1,
        # Text label derived from the numeric `overall` rating.
        sentiment=lambda frame: frame['overall'].apply(transform_sentiment),
    )
    # Discard rows that have a missing value in any column.
    .dropna()
)
df

Unnamed: 0,overall,verified,reviewTime,reviewerName,reviewText,summary,count,sentiment
0,1,True,"02 19, 2015",theodore j bigham,great,One Star,1,very negative
1,1,True,"04 10, 2017",Jacqueline Diaz,I didn't like this product it smudged all unde...,One Star,1,very negative
2,1,True,"11 19, 2016",rabiyaa123,it burns your eyes when u put it on and very ...,i do not recommend.,1,very negative
3,1,True,"03 24, 2018",Skip,It rusts.,It rusts.,1,very negative
4,1,True,"03 14, 2018",VB,Bought it as a present...doesn't fit a standar...,One Star,1,very negative
...,...,...,...,...,...,...,...,...
1316778,5,False,"11 29, 2001",Tina Evans,The only way to describe this wonderful piece ...,"A extream, but fun game!",1,very positive
1316779,5,False,"11 15, 1999",Kimberly P.Curtis,I found these games highly impressive and exci...,Excellent Game,1,very positive
1316780,5,False,"11 25, 2001",Amazon Customer,I hav had this game for more than a year and I...,Best GAmE ever,1,very positive
1316781,5,False,"11 12, 2001",Anne Callanan,"This game is as good as tehy come. Sure, ther...",a ma zing,1,very positive


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the `overall` rating as the classification target.
enc = OneHotEncoder(handle_unknown='ignore')
ratings = np.array(df['overall'].to_list())
ratings_column = ratings.reshape(-1, 1)  # the encoder expects a 2-D (n_samples, 1) array

# Features are the raw review texts; targets are dense one-hot rows.
X = df['reviewText'].values
y = enc.fit_transform(ratings_column).toarray()

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Show the category values the fitted encoder learned; their order is the
# column order of the one-hot target `y`.
enc.categories_

[array([1, 2, 3, 4, 5])]

In [6]:
# Check the container type of the training texts produced by train_test_split.
type(X_train)

numpy.ndarray

In [7]:
# Display the training texts. Each element is a complete review string, so
# this output can be very long; a slice such as X_train[:3] is enough for a
# quick sanity check.
X_train

array(["I wanted to find some comfortable cotton socks that were organic, sweatshop-free and fair-trade (or the equivalent) and ended up buying several pairs from both Maggies and Pact. The bottom line is that Id buy both kinds again but the Pact socks are more comfortable and I think a better product overall. In future Id probably choose Pact for their colored socks and then Maggies for their natural (unbleached) crew socks (Pact doesnt make white/unbleached crews).\n\nIve come to the conclusion that the sizes listed for Maggies socks are womens sizes. So if you're a man, then consider that size 9-11 for women is equivalent to 7.5-9.5 for men. Their size 10-13 socks are pretty snug on my size 9 feet. People have said that they shrink upon washing  I use a cool setting on the washer and haven't had this issue. The socks are a bit more rustic compared to Pacts socks but they do feel very natural and my feet like them :) In other words, they are slightly rougher in feel and not quite as 

In [8]:
# Check the container type of the full feature array (`.values` of the
# reviewText column).
type(X)

numpy.ndarray

In [9]:
# Row counts of the four split arrays, in the order
# (X_train, X_test, y_train, y_test).
tuple(map(len, (X_train, X_test, y_train, y_test)))

(1052881, 263221, 1052881, 263221)

In [10]:
# TF Hub handles for the uncased English BERT preprocessing model and the
# L-12 / H-768 / A-12 encoder.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# Wrap both hub models as Keras layers so they can be called on tensors and
# composed into a Keras model.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)



In [11]:
def get_sentence_embeding(sentences):
    """Return the BERT ``pooled_output`` for a batch of raw sentences.

    ``sentences`` is handed unchanged to ``bert_preprocess``; the resulting
    inputs are fed to ``bert_encoder`` and the ``'pooled_output'`` entry of
    its output dictionary is returned.
    """
    encoder_outputs = bert_encoder(bert_preprocess(sentences))
    return encoder_outputs['pooled_output']

# Smoke-test the embedding helper on two hand-written sentences.
sample_sentences = [
    "500$ discount. hurry up",
    "Bhavin, are you up for a volleybal game tomorrow?",
]
get_sentence_embeding(sample_sentences)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84369963, -0.51361525, -0.8888222 , ..., -0.7479082 ,
        -0.7532988 ,  0.91979617],
       [-0.8720985 , -0.50547266, -0.9444924 , ..., -0.85849494,
        -0.71742827,  0.88083655]], dtype=float32)>

In [12]:
# --- BERT backbone: raw strings -> preprocessing -> encoder outputs ---
text_input = tf.keras.layers.Input(name='text', dtype=tf.string, shape=())
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# --- Classification head on top of the pooled sentence representation ---
dropout_layer = tf.keras.layers.Dropout(rate=0.1, name="dropout")
drop_out = dropout_layer(outputs['pooled_output'])

# Five softmax units, one per one-hot rating column.
output_layer = tf.keras.layers.Dense(units=5, activation='softmax', name="output")
final_layer = output_layer(drop_out)

# --- Tie the string input to the softmax output ---
model = tf.keras.Model(inputs=[text_input], outputs=[final_layer])

In [13]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [14]:
# Metrics reported during training and evaluation. The cross-entropy metric
# tracks the same quantity that is used as the loss below.
METRICS = [
    tf.keras.metrics.CategoricalAccuracy(),
    tf.keras.metrics.CategoricalCrossentropy(),
]

# Categorical cross-entropy pairs with the one-hot targets and the softmax
# output layer.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
loss_fn = tf.keras.losses.CategoricalCrossentropy()

model.compile(optimizer=optimizer, loss=loss_fn, metrics=METRICS)




In [17]:
def scheduler(epoch, lr, hold_until=2, base_lr=0.00075, decay=0.75):
    """Learning-rate schedule: hold a fixed rate, then decay geometrically.

    Intended for ``tf.keras.callbacks.LearningRateScheduler``, which calls it
    as ``scheduler(epoch, lr)``; the extra keyword parameters default to the
    values that were previously hard-coded.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index.
    lr : float
        Learning rate currently in effect.
    hold_until : int, optional
        Last epoch index (inclusive) that still uses ``base_lr``.
    base_lr : float, optional
        Rate returned while ``epoch <= hold_until``. During this phase ``lr``
        is ignored, so this value supersedes the learning rate the optimizer
        was constructed with (0.0005 in the compile cell).
    decay : float, optional
        Factor applied to ``lr`` on every epoch after ``hold_until``.

    Returns
    -------
    float
        The learning rate to use for ``epoch``.
    """
    if epoch > hold_until:
        return lr * decay
    return base_lr

# Apply `scheduler` to set the learning rate at the start of each epoch.
lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)

# Stop once `val_loss` has gone 2 epochs without improving, and roll the
# model back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

callbacks = [lr_schedule, early_stopping]

# Train for at most 10 epochs; the callbacks may stop earlier.
# NOTE(review): the held-out test split is also the validation set that
# EarlyStopping monitors, so later test metrics are not fully independent of
# model selection - consider carving a separate validation split.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
)


Epoch 1/10

KeyboardInterrupt: 

In [None]:
# Run the model on one hand-written review. The output layer is a 5-unit
# softmax, so the result holds one score per rating class.
model.predict(['I dont really like this product'])

In [None]:
# Persist the model under the relative path 'model_saved/Bert_v3'.
model.save('model_saved/Bert_v3')