# Predicting a sentiment score for review text extracted from YouTube video transcripts

In [6]:
##Loading the required modules

from tensorflow.keras.models import load_model
from nltk.tokenize import sent_tokenize,word_tokenize,RegexpTokenizer
from nltk.corpus import stopwords
import emoji

In [2]:
from keras.datasets import imdb
import numpy as np
import matplotlib.pyplot as plt

In [3]:
## Emoji shown for each predicted class: "0" (negative review) -> sad face, "1" (positive review) -> happy face
emoji_dict = {"0" : ":crying_face:",
              "1" : ":grinning_face_with_big_eyes:"}

def print_emoji(label):
    """Return the emoji character that represents a predicted class label.

    Parameters
    ----------
    label : str or int
        Class label; must map to a key of ``emoji_dict`` ("0" or "1") once
        converted with ``str``.

    Returns
    -------
    str
        The emoji produced by ``emoji.emojize`` for that label.

    Raises
    ------
    KeyError
        If the label is not present in ``emoji_dict``.
    """
    # str() lets callers pass integer labels (e.g. numpy ints from a prediction array).
    # The `use_aliases` keyword is intentionally not passed: it was removed in
    # emoji 2.0, and both names in emoji_dict are canonical names, not aliases.
    return emoji.emojize(emoji_dict[str(label)])

In [4]:
# Loading the Trained Model

In [7]:
# Load the saved Keras model from 'best_model.h5' (path is relative to the
# working directory) and print its layer summary.
model = load_model('best_model.h5')
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 64)                640064    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
Total params: 640,129
Trainable params: 640,129
Non-trainable params: 0
_________________________________________________________________


In [8]:
word_idx = imdb.get_word_index()   ## The dictionary for word to its index from the source dataset

In [9]:
word_dict = {index: word for word, index in word_idx.items()}  ## Inverted lookup: index -> word

In [10]:
## Our predefined sentence-vectorizing function

def vect_sent(sentences, dims=10000):
    """Multi-hot encode sentences given as sequences of word indices.

    Parameters
    ----------
    sentences : sequence
        One entry per sentence; each entry holds the integer word indices
        present in that sentence.
    dims : int, optional
        Width of every output row (default 10000).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sentences), dims)`` filled with zeros, with a 1
        at every column whose index occurs in the corresponding sentence.
    """
    n_rows = len(sentences)
    encoded = np.zeros((n_rows, dims))

    # Fancy indexing marks every listed column of the row in one assignment;
    # repeated indices simply set the same cell again.
    for row_no, word_ids in enumerate(sentences):
        encoded[row_no, word_ids] = 1

    return encoded

In [11]:
def func(snt, vocab=None, dims=10000):
    """Convert a whitespace-separated sentence into a list of word indices.

    Each word is looked up in ``vocab`` and shifted by 3, matching the default
    ``index_from=3`` offset Keras applies to the IMDB data (indices 0-2 are
    reserved for padding / start / unknown markers). Words that are missing
    from the vocabulary, or whose shifted index does not fit in a vector of
    width ``dims``, are mapped to the last valid index ``dims - 1``.

    Parameters
    ----------
    snt : str
        Sentence to encode; it is split on whitespace.
    vocab : dict, optional
        Mapping word -> rank. Defaults to the module-level ``word_idx``.
    dims : int, optional
        Vocabulary size of the model input (default 10000); every returned
        index lies in ``[0, dims - 1]``.

    Returns
    -------
    list of int
        One index per word of ``snt`` (empty list for an empty sentence).
    """
    if vocab is None:
        vocab = word_idx

    index_offset = 3          # Keras IMDB convention (index_from=3)
    fallback = dims - 1       # bucket for unknown / out-of-range words

    encoded = []
    for token in snt.split():
        rank = vocab.get(token)
        # ">=" (not ">"): a shifted index equal to dims is already out of
        # bounds for a vector with dims columns.
        if rank is None or rank + index_offset >= dims:
            encoded.append(fallback)
        else:
            encoded.append(rank + index_offset)

    return encoded

In [12]:
## Two hand-written sample sentences, each encoded into a list of word indices
sent = ["it is really bad and poor quality", "was really easy to hold"]

tes = [func(sample) for sample in sent]

In [13]:
# Turn the lists of word indices into the multi-hot matrix built by vect_sent.
# NOTE: `tes` is rebound from index lists to the encoded array, so re-running
# this cell on its own feeds the already-encoded array back into vect_sent.
tes = vect_sent(tes)

In [14]:
## Predicting the vectors on the pretrained model weights :: 0 ==> bad review, 1 ==> good review

# `predict_classes` is no longer available in recent Keras releases; for a
# single-output model it is equivalent to thresholding `predict` at 0.5.
pred = (model.predict(tes, verbose=0) > 0.5).astype("int32")

for ix, label in enumerate(pred.ravel()):
    # Show the emoji of the *predicted class* for sentence ix
    # (previously the loop index itself was passed to print_emoji).
    print(ix, print_emoji(str(label)))

0 😢
1 😃


# Loading the text file

In [15]:
## The text file contains copied data from the Youtube transcript of 3 reviews on Nikon D5600 Camera

In [40]:
import glob

# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
# the order of the per-file scores reproducible between runs.
data = sorted(glob.glob("youtube-api extracted transcripts/*"))

In [41]:
print(len(data))

177


In [42]:
sw = set(stopwords.words('english'))    ## Stopwords (a, the, so, it, ...) carry no specific meaning and are removed
tokenizer = RegexpTokenizer("[a-zA-Z@]+")   ## Keeps only runs of letters/'@'; digits and punctuation are dropped

scores = []   ## One entry per successfully processed file: fraction of its lines predicted positive

for vid_file in data:

    try:
        with open(vid_file, 'r') as in_file:   # Load the transcript text
            text = in_file.read()

        sents = sent_tokenize(text)
        if not sents:       # Empty transcript: nothing to score
            continue

        # NOTE(review): only the first tokenized sentence is used, so any text
        # after the first detected sentence boundary is ignored. Kept as in the
        # original; confirm this is intended for the transcript format.
        # A separate name is used so the file list `data` is not overwritten
        # while (and after) it is being iterated.
        lines = sents[0].split('\n')

        # Every 2nd line is a timestamp which we need to skip
        raw_reviews = lines[::2]

        # Lower-case, tokenize, drop stopwords and rebuild each line
        reviews = []
        for line in raw_reviews:
            word_list = tokenizer.tokenize(line.lower())
            reviews.append(' '.join(w for w in word_list if w not in sw))

        vectors = vect_sent([func(s) for s in reviews])

        # `predict_classes` is no longer available in recent Keras releases;
        # for a single-output model it equals thresholding `predict` at 0.5.
        pred = (model.predict(vectors, verbose=0) > 0.5).astype("int32")

        # Fraction of lines classified as positive (label 1)
        scores.append(np.mean(pred == 1))

    except Exception as err:
        # Best-effort: a problematic file is skipped, but the reason is shown
        # instead of being silently swallowed (KeyboardInterrupt still stops the loop).
        print("Skipping", vid_file, "-", repr(err))
        continue

In [45]:
## Collect the per-file scores in an array and report their unweighted mean
scores = np.array(scores)

mean_score = np.average(scores)
print(mean_score)

0.7366435830908725


In [24]:
## The mean per-video score is about 0.737: on average, 73.7% of the transcript lines of a video are classified as positive,
## i.e. the scraped YouTube reviews are positive overall

In [47]:
## the reviews can be checked here::

print(scores)

[1.         0.7625     0.64102564 0.6826087  1.         0.67213115
 0.47777778 0.51351351 0.5952381  0.65454545 0.75       0.57894737
 0.72222222 0.74774775 0.65625    1.         0.9        0.69886364
 0.73469388 0.75471698 0.88235294 0.74846626 1.         0.58181818
 0.88888889 0.72340426 0.5        0.60869565 0.66666667 1.
 0.75163399 0.67213115 0.63461538 0.65124555 1.         0.69411765
 0.69565217 1.         0.85882353 0.67857143 0.64705882 0.5879397
 1.         0.70535714 0.75       0.62222222 0.63636364 0.75862069
 0.68656716 0.57894737 0.66666667 0.72727273 0.51111111 0.78571429
 0.60606061 0.42857143 0.75555556 0.75       1.         1.
 0.615      0.7027027  1.         0.69230769 0.75581395 1.
 0.67973856 1.         0.625      0.73214286 0.66666667 1.
 0.66935484 0.57407407 0.82352941 0.68131868 0.57627119 0.79310345
 0.5978836  0.78571429 0.70103093 1.         0.64772727 0.63120567
 0.79569892 1.         0.65740741 0.62903226 0.7        0.70967742
 0.73333333 0.84848485 0.557