# Predicting the score for a Youtube Video generated text file for Reviews

In [2]:
##Loading the required modules

from keras.models import load_model
from nltk.tokenize import sent_tokenize,word_tokenize,RegexpTokenizer
from nltk.corpus import stopwords
import emoji

In [3]:
from keras.datasets import imdb
import numpy as np
import matplotlib.pyplot as plt

In [4]:
## For representing the 0s or negetive reviews as sad face and 1s or positive reviews as happy face
emoji_dict = {"0" : ":crying_face:",
              "1" : ":grinning_face_with_big_eyes:"}

def print_emoji(label):
    
    return emoji.emojize(emoji_dict[label], use_aliases=True)

In [5]:
# Loading the Trained Model

In [7]:
model = load_model('best_model.h5')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 64)                640064    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
Total params: 640,129
Trainable params: 640,129
Non-trainable params: 0
_________________________________________________________________


In [8]:
word_idx = imdb.get_word_index()   ## The dictionary for word to its index from the source dataset

In [9]:
word_dict = dict([value, key] for (key, value) in word_idx.items())  ## Reversing this dict to get index to word

In [10]:
## Our predefined sentence-vectorizing function

def vect_sent(sentences, dims=10000):
    
    outputs = np.zeros((len(sentences),dims))
    
    for i,idx in enumerate(sentences):
        outputs[i,idx] = 1
    
    return outputs

In [11]:
def func(snt):

    sent = snt.split()
    test = []
    
    for ix in sent:
        try:
            if word_idx[ix]+3 > 10000:      ## the first 3 letters are .?/ so we need to skip them
                test.append(9999)           ## if the word index exceeds the 10000 vocabuary, set it to last index
            else:
                test.append(word_idx[ix]+3)
        
        except:
                test.append(9999)
    
    return test

In [12]:
sent = ["it is really bad and poor quality", "was really easy to hold"]

tes = []
for s in sent:
    
    tes.append(func(s))

In [13]:
tes = vect_sent(tes)

In [14]:
## Predicting the vectors on the pretrained model weights :: 0==>bad review, 1==>good review

pred = model.predict_classes(tes)

for ix in range(len(pred)):
    print(ix, print_emoji(str(ix)))

0 😢
1 😃


# Loading the text file

In [15]:
## The text file contains copied data from the Youtube transcript of 3 reviews on Nikon D5600 Camera

In [16]:
with open('Recognized.txt', 'r') as in_file:   #Loading the text file adn splitting it in sentences
    text = in_file.read()
    sents = sent_tokenize(text)

In [17]:
data = []
data = sents[0].split('\n')

reviews = []
for ix in range(0,len(data),2):     # Every 2nd sentence is a timestamp which we need to skip
    
    reviews.append(data[ix])

In [18]:
sw = set(stopwords.words('english'))    ## Stopwords like a the so it etc that have no specific meaning are removed
tokenizer = RegexpTokenizer("[a-zA-Z@]+")   ## RegexTokenizer removes unwanted text like D5600 which has no meaning

for ix in range(len(reviews)):
    
    word_list = tokenizer.tokenize(reviews[ix].lower())
    useful_words = [w for w in word_list if w not in sw and int]
    
    reviews[ix] = ' '.join(useful_words)

In [19]:
reviews = np.array(reviews)
print(type(reviews))

<class 'numpy.ndarray'>


In [20]:
tes = []
for s in reviews:
    
    tes.append(func(s))

In [21]:
tes = vect_sent(tes)

In [22]:
pred = model.predict_classes(tes)

In [23]:
score = np.sum(pred==[1])/pred.shape[0]
print(score)

0.7076923076923077


In [24]:
## Prediction on the given Video scraped data is that 71% of the sentences used  by the person are good reviews
## i.e it was a positive review