# TensorFlow Lite: Sentiment Analysis Using Amazon Reviews

...

## Package import

In [59]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
#configure
# sets matplotlib to inline and displays graphs below the corresponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

!pip uninstall -q -y tensorflow google-colab grpcio
!pip install -q tf-nightly
#!pip install -q git+https://github.com/tensorflow/examples



In [60]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import os
import tensorflow as tf
assert tf.__version__.startswith('2')
import sys
# Directory added to the import path so that the `tensorflow_examples`
# package imported below can be found (presumably a local checkout of the
# tensorflow/examples repository). It can be overridden with the
# TF_EXAMPLES_DIR environment variable so the notebook is not tied to one
# machine; the original path remains the default.
EXAMPLES_DIR = os.environ.get('TF_EXAMPLES_DIR', 'C:/Users/Adrian/Documents/examples')
# Avoid piling up duplicate entries when this cell is re-run.
if EXAMPLES_DIR not in sys.path:
    sys.path.append(EXAMPLES_DIR)

from tensorflow_examples.lite.model_customization.core.data_util.text_dataloader import TextClassifierDataLoader
from tensorflow_examples.lite.model_customization.core.model_export_format import ModelExportFormat
import tensorflow_examples.lite.model_customization.core.task.text_classifier as text_classifier

from textdata_extension import from_panda
TextClassifierDataLoader.from_panda = from_panda

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


##  Load Dataset

In [61]:
rev_frame = pd.read_csv("Reviews.csv")

In [62]:
# df is the copy we process for sentiment analysis
df=rev_frame.copy()
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## Data cleaning and preprocessing

Keep only the 'Text' and 'Score' columns for the analysis and rename them to 'review' and 'rating'.

In [63]:
# Keep only the review text and its score, renaming both columns in one step.
# rename() returns a fresh DataFrame, so no new columns are assigned onto a
# slice of the original frame (the pattern behind SettingWithCopyWarning) and
# no in-place drop is needed afterwards.
df = df[['Text', 'Score']].rename(columns={'Text': 'review', 'Score': 'rating'})

In [64]:
print(df.shape)
df.head()

(568454, 2)


Unnamed: 0,review,rating
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


Check for null values:

In [65]:
print(df['rating'].isnull().sum())
df['review'].isnull().sum()  # no null values.

0


0

Remove duplicates:

In [66]:
# Remove duplicates: among rows that share the same rating and review text,
# only the first occurrence is kept. The result is rebound to `df` instead of
# mutating the frame in place.
df = df.drop_duplicates(subset=['rating', 'review'], keep='first')

In [67]:
# now check the shape. note that the shape is reduced, which shows that we did have duplicate rows.
size = df.shape
print(size)
df.head()


(393675, 2)


Unnamed: 0,review,rating
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


In [68]:
def mark_sentiment(rating, threshold=3):
    """Map a numeric rating to a binary sentiment label.

    Args:
        rating: Numeric review score.
        threshold: Highest rating that is still labelled 0. Defaults to 3,
            which reproduces the original hard-coded cut-off.

    Returns:
        0 if ``rating <= threshold``, otherwise 1.
    """
    if rating <= threshold:
        return 0
    return 1

In [69]:
df['sentiment']=df['rating'].apply(mark_sentiment)

In [70]:
# The binary 'sentiment' column was derived from 'rating' in the previous
# cell, so the numeric rating is dropped here. The result is rebound to `df`
# rather than mutating the frame in place.
df = df.drop(columns=['rating'])
df.head()

Unnamed: 0,review,sentiment
0,I have bought several of the Vitality canned d...,1
1,Product arrived labeled as Jumbo Salted Peanut...,0
2,This is a confection that has been around a fe...,1
3,If you are looking for the secret ingredient i...,0
4,Great taffy at a great price. There was a wid...,1


In [71]:
df.shape

(393675, 2)

## Cleaning

Note: cleaning the reviews has not proven to be more successful than using the raw review text!

In [72]:
from bs4 import BeautifulSoup 
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords  #stopwords

# function to clean and pre-process the text.
def _cleaning_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, creating it once.

    Neither the lemmatizer nor the English stop-word set depends on the review
    being cleaned, so they are built on first use, cached on the function
    object and reused by every later call.
    """
    resources = getattr(_cleaning_resources, "_cache", None)
    if resources is None:
        resources = (WordNetLemmatizer(), frozenset(stopwords.words("english")))
        _cleaning_resources._cache = resources
    return resources


def clean_reviews(review):
    """Clean and pre-process a single review text.

    The review is stripped of HTML markup, every character that is not an
    ASCII letter is replaced by a space, the text is lower-cased and split on
    whitespace, English stop words are removed and the remaining tokens are
    lemmatized.

    Args:
        review: Raw review text, possibly containing HTML.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    lemmatizer, stop_words = _cleaning_resources()

    # 1. Removing html tags
    review_text = BeautifulSoup(review, "lxml").get_text()

    # 2. Retaining only alphabets.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Converting to lower case and splitting
    word_tokens = review_text.lower().split()

    # 4. Remove stopwords and lemmatize the remaining tokens
    word_tokens = [lemmatizer.lemmatize(w) for w in word_tokens if w not in stop_words]

    return " ".join(word_tokens)

## Ignore!

Note that pre-processing all the reviews takes far too much time, so we will use only 50K reviews. To balance the classes, we take an equal number of instances (25,000) of each sentiment.

In [109]:
# Balance the classes: take the first 25,000 rows of each sentiment.
is_positive = df['sentiment'] == 1
is_negative = df['sentiment'] == 0
pos_df = df[is_positive].head(25000)
neg_df = df[is_negative].head(25000)

In [110]:
# Combine both classes into a single frame with a fresh 0..n-1 index.
df = pd.concat([pos_df, neg_df], ignore_index=True)
# Shuffle the rows so the two classes are interleaved. A fixed random_state
# makes the shuffle reproducible across "Restart & Run All".
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df.shape)
df.head()

(50000, 3)


Unnamed: 0,review,sentiment,clean_review
0,I was expecting a bit better after reading oth...,0,expecting bit better reading review hot chocol...
1,I recently ordered these and was extremely dis...,0,recently ordered extremely disappointed proble...
2,Bought this at my local health food store. As ...,0,bought local health food store eating way bit ...
3,I'm just not sure a new cartridge is better th...,0,sure new cartridge better washed one maybe cat...
4,Buyer beware. The coffee blend for Obsidian ap...,0,buyer beware coffee blend obsidian appears cha...


In [75]:
df['clean_review']=df['review'].apply(clean_reviews)

In [111]:
print(df.shape)
df.head()


(50000, 3)


Unnamed: 0,review,sentiment,clean_review
0,I was expecting a bit better after reading oth...,0,expecting bit better reading review hot chocol...
1,I recently ordered these and was extremely dis...,0,recently ordered extremely disappointed proble...
2,Bought this at my local health food store. As ...,0,bought local health food store eating way bit ...
3,I'm just not sure a new cartridge is better th...,0,sure new cartridge better washed one maybe cat...
4,Buyer beware. The coffee blend for Obsidian ap...,0,buyer beware coffee blend obsidian appears cha...


In the end, the cleaning process did not help to improve the accuracy of the model.

## Model creation

In the following steps, the pandas DataFrame is loaded into an object that TensorFlow can process.
We can choose to process either the cleaned reviews or the raw reviews:

In [112]:
train_data, test_data = TextClassifierDataLoader.from_panda(df,pd_label=['review','sentiment']).split(0.8) # The fraction describes the size of the training data

Let's have a look at the data with its classification and its review:

In [113]:
# Show the label name and the raw text of the first ten training examples.
for review_text, label_idx in train_data.dataset.take(10):
    label_name = train_data.index_to_label[label_idx.numpy()]
    print("{}: {}".format(label_name, review_text.numpy()))

0: b'Item was in an envelope that was battered and very heavy, which was odd.  Upon opening I discovered why the envelope was so heavy, the chocolate was COMPLETELY melted.  They need to find a better way to ship this product, until then, I would STAY AWAY from ordering this item from THIS COMPANY!'
1: b"I've been using the MOCAFE Azteca for years.  Sweet and a bit of spice.  Mixes easily.  And you can get by with half the suggested amount.  Nothing warms you up faster on a cold, winter day."
0: b"I'm a flavored coffee fanatic, and was so excited to try this one.  What a major disappointment.  NO flavor at all.  Bleck.  Would NEVER buy or drink again, unless desperate and out of any other coffee!"
0: b"The world's largest gummy bear is in Austin. It is 27 lbs and in Campus Candy. It's $149.99 and the store is awfully loud. Back to my original point, this post is lying. I have not bought this product though."
1: b'I never buy flavored coffee.  I\'m a cream, sugar and drizzle of caramel 

In [114]:
model = text_classifier.create(train_data, epochs=3,num_words=10000)

INFO:tensorflow:Retraining the models...
Train for 1000 steps, validate for 125 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3


While trying out different parameters, we found that the model gives good results with the standard parameters. Increasing the word vocabulary or the word-vector dimension led to overfitting (training loss much smaller than validation loss).

We can have a look at the specifics of the model:

In [115]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 256, 16)           160048    
_________________________________________________________________
global_average_pooling1d_11  (None, 16)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 16)                272       
_________________________________________________________________
dropout_11 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_23 (Dense)             (None, 2)                 34        
Total params: 160,354
Trainable params: 160,354
Non-trainable params: 0
_________________________________________________________________


After training we should test our model on some unseen test data to evaluate our model:

In [116]:
loss, acc = model.evaluate(test_data)



Finally we can export the model to use it in the App using TensorFlow Lite :)

In [118]:
model.export('amazon_review_classifier_nclean50k.tflite', 'text_label_nc50k.txt', 'vocab_nc50k.txt')

INFO:tensorflow:Export to tflite model amazon_review_classifier_nclean50k.tflite, saved labels in text_label_nc50k.txt.
INFO:tensorflow:  Saved vocabulary in vocab_nc50k.txt.


In [119]:
# Read TensorFlow Lite model from TensorFlow Lite file.
with tf.io.gfile.GFile('amazon_review_classifier_nclean50k.tflite', 'rb') as f:
    model_content = f.read()

# Read label names from label file. splitlines() avoids the empty trailing
# entry that split('\n') yields when the file ends with a newline.
with tf.io.gfile.GFile('text_label_nc50k.txt', 'r') as f:
    label_names = f.read().splitlines()

# Initialize TensorFlow Lite interpreter.
interpreter = tf.lite.Interpreter(model_content=model_content)
interpreter.allocate_tensors()
input_index = interpreter.get_input_details()[0]['index']
# Callable that returns the current contents of the first output tensor.
output = interpreter.tensor(interpreter.get_output_details()[0]["index"])

# Run predictions on each test example and calculate accuracy.
accurate_count = 0
for text, label in model.test_data.dataset:
    # Pre-processing should remain the same.
    text, label = model.preprocess(text, label)
    # Add the batch dimension and cast to float32 while still a tensor, then
    # convert to NumPy last so the interpreter receives a plain ndarray.
    text = tf.cast(tf.expand_dims(text, 0), tf.float32).numpy()

    # Run inference.
    interpreter.set_tensor(input_index, text)
    interpreter.invoke()

    # Post-processing: remove batch dimension and find the label with highest
    # probability.
    predict_label = np.argmax(output()[0])

    accurate_count += (predict_label == label.numpy())

accuracy = accurate_count * 1.0 / model.test_data.size
print('TensorFlow Lite model accuracy = %.4f' % accuracy)

TensorFlow Lite model accuracy = 0.8590
