In [211]:
import numpy as np
import pandas as pd
import sklearn
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [212]:
movies = pd.read_csv('archive/movies_metadata.csv')

In [213]:
movies = movies.filter(['original_title', 'id', 'imdb_id','vote_average'])

In [214]:
movies["reviews"] = [[] for _ in range(len(movies))]
movies.head()

Unnamed: 0,original_title,id,imdb_id,vote_average,reviews
0,Toy Story,862,tt0114709,7.7,[]
1,Jumanji,8844,tt0113497,6.9,[]
2,Grumpier Old Men,15602,tt0113228,6.5,[]
3,Waiting to Exhale,31357,tt0114885,6.1,[]
4,Father of the Bride Part II,11862,tt0113041,5.7,[]


In [215]:
movies['vote_average'].isnull().values.any()

True

In [217]:
movies.dropna(subset = ['vote_average'], inplace=True)

Unnamed: 0,original_title,id,imdb_id,vote_average,reviews
0,Toy Story,862,tt0114709,7.7,[]
1,Jumanji,8844,tt0113497,6.9,[]
2,Grumpier Old Men,15602,tt0113228,6.5,[]
3,Waiting to Exhale,31357,tt0114885,6.1,[]
4,Father of the Bride Part II,11862,tt0113041,5.7,[]


In [218]:
labels = []
for n in movies['vote_average']:
    num=round(n)
    labels+=[num]
movies.insert(4, "label", labels, True)

In [219]:
movies.head()

Unnamed: 0,original_title,id,imdb_id,vote_average,label,reviews
0,Toy Story,862,tt0114709,7.7,8,[]
1,Jumanji,8844,tt0113497,6.9,7,[]
2,Grumpier Old Men,15602,tt0113228,6.5,6,[]
3,Waiting to Exhale,31357,tt0114885,6.1,6,[]
4,Father of the Bride Part II,11862,tt0113041,5.7,6,[]


In [221]:
import json
import os
import re
import requests

REVIEW_PATTERN = 'https://api.themoviedb.org/3/movie/{movie_id}/reviews?api_key={key}'
KEY = '6f2ceb0e0b459afe90f6e854fcd410eb'
            
def _get_json(url):
    r = requests.get(url)
    return r.json()


def get_tmdb_reviews():
    """ request reviews with 'id' from TMDB and append them to the reviews column """
    k=0
    for index, row in movies.iterrows():
        id = row['id']
        r = _get_json(REVIEW_PATTERN.format(key=KEY,movie_id=id))
        
        if not r:
            print("request cannot be located")
            continue
        
        n = r.get('total_results')
        
        reviews = []
        for i in range(n):
            k += 1
            result = r['results'][i]
            review = result.get('content')
            reviews.append(review)
            #rating = result.get('author_details').get('rating')
            
        movies.at[index, 'reviews'] = reviews
        if not reviews:
            movies.at[index, 'reviews'] = float("NaN")
        
        # SET LIMIT
        if index > 1000:
            print(k)
            break

    return None

get_tmdb_reviews()
movies.head()

KeyboardInterrupt: 

In [222]:
movies.head()

Unnamed: 0,original_title,id,imdb_id,vote_average,label,reviews
0,Toy Story,862,tt0114709,7.7,8,[This movie came out when I was three. Now I'm...
1,Jumanji,8844,tt0113497,6.9,7,"[Throw the dice and take a turn, Jumanji made ..."
2,Grumpier Old Men,15602,tt0113228,6.5,6,
3,Waiting to Exhale,31357,tt0114885,6.1,6,
4,Father of the Bride Part II,11862,tt0113041,5.7,6,


In [223]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize

import string

def tokenize(list):
    t = []
    
    for s in list:
        tokens = s.split()

        table = str.maketrans('', '', string.punctuation)
        tokens = [w.translate(table) for w in tokens]
        
        tokens = [word for word in tokens if word.isalpha()]
        
        stop_words = set(stopwords.words('english'))
        tokens = [w for w in tokens if not w in stop_words]
        
        tokens = [word for word in tokens if len(word) > 1]
        
        t.append(tokens)
    
    t = [word.lower() for sl in t for word in sl]
        
    lemmatizer = WordNetLemmatizer()
    for w in t:
        w = lemmatizer.lemmatize(w)
    
    # returns a flat list of strings
    return t

def clean_data():
    movies.dropna(subset = ["reviews"], inplace=True)
    for index, row in movies.iterrows():
        review = row['reviews']
        movies.at[index, 'reviews'] = tokenize(review)
        
        # SET LIMIT
        #if index > 20:
            #print(k)
            #break
            
    return None

clean_data()
movies.head(20)

[nltk_data] Downloading package wordnet to C:\Users\Gina
[nltk_data]     Wu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Gina
[nltk_data]     Wu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,original_title,id,imdb_id,vote_average,label,reviews
0,Toy Story,862,tt0114709,7.7,8,"[this, movie, came, three, now, im, twenty, se..."
1,Jumanji,8844,tt0113497,6.9,7,"[throw, dice, take, turn, jumanji, made, criti..."
5,Heat,949,tt0113277,7.7,8,"[you, dont, live, live, among, remains, dead, ..."
7,Tom and Huck,45325,tt0112302,5.4,5,"[good, enough, it, doesnt, come, close, disney..."
9,GoldenEye,710,tt0113189,6.6,7,"[really, solid, entry, series, brosnan, person..."
11,Dracula: Dead and Loving It,12110,tt0112896,5.7,6,"[most, probably, least, favourite, film, mel, ..."
14,Cutthroat Island,1408,tt0112760,5.7,6,"[not, bad, notoriously, condemned, still, clea..."
15,Casino,524,tt0112641,7.8,8,"[sharon, stone, robert, de, niro, amazing, rob..."
22,Assassins,9691,tt0112401,6.0,6,"[check, mate, sly, stallone, stars, ageing, as..."
23,Powder,12665,tt0114168,6.3,6,"[cant, imagine, theres, meanness, cant, imagin..."


In [224]:
movies["sentences"] = ["" for _ in range(len(movies))]
for index, row in movies.iterrows():
        words = row['reviews']
        movies.at[index, 'sentences'] = ' '.join(words)    

In [225]:
#max_length=movies.apply(lambda row: row.apply(len).argmax(), axis=1)
def get_max_len():
    max_length = movies['reviews'].map(lambda x: len(x)).max()
    return max_length

In [226]:
from keras.preprocessing.text import Tokenizer

# fit a tokenizer
def create_tokenizer(list):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(list)
    return tokenizer

In [15]:
def get_vocab_size(tokenizer):
    vocab_size = len(tokenizer.word_index) + 1
    return vocab_size

In [16]:
from keras.preprocessing.sequence import pad_sequences

def encode_text(tokenizer, lines, length):
    encoded = tokenizer.texts_to_sequences(lines)
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded

In [17]:
movies["sentences"] = ["" for _ in range(len(movies))]
for index, row in movies.iterrows():
        words = row['reviews']
        movies.at[index, 'sentences'] = ' '.join(words)

In [18]:
movies.head()

Unnamed: 0,original_title,id,imdb_id,vote_average,label,reviews,sentences
0,Toy Story,862,tt0114709,7.7,8,"[this, movie, came, three, now, im, twenty, se...",this movie came three now im twenty seven godd...
1,Jumanji,8844,tt0113497,6.9,7,"[throw, dice, take, turn, jumanji, made, criti...",throw dice take turn jumanji made critics gurn...
5,Heat,949,tt0113277,7.7,8,"[you, dont, live, live, among, remains, dead, ...",you dont live live among remains dead people h...
7,Tom and Huck,45325,tt0112302,5.4,5,"[good, enough, it, doesnt, come, close, disney...",good enough it doesnt come close disneys film ...
9,GoldenEye,710,tt0113189,6.6,7,"[really, solid, entry, series, brosnan, person...",really solid entry series brosnan personally f...


In [33]:
reviews=movies.replace(" ", np.nan, inplace=False)
reviews.dropna(subset = ["sentences"], inplace=False)

Unnamed: 0,original_title,id,imdb_id,vote_average,label,reviews,sentences
0,Toy Story,862,tt0114709,7.7,8,"[this, movie, came, three, now, im, twenty, se...",this movie came three now im twenty seven godd...
1,Jumanji,8844,tt0113497,6.9,7,"[throw, dice, take, turn, jumanji, made, criti...",throw dice take turn jumanji made critics gurn...
5,Heat,949,tt0113277,7.7,8,"[you, dont, live, live, among, remains, dead, ...",you dont live live among remains dead people h...
7,Tom and Huck,45325,tt0112302,5.4,5,"[good, enough, it, doesnt, come, close, disney...",good enough it doesnt come close disneys film ...
9,GoldenEye,710,tt0113189,6.6,7,"[really, solid, entry, series, brosnan, person...",really solid entry series brosnan personally f...
...,...,...,...,...,...,...,...
45461,رگ خواب,439050,tt6209470,4.0,4,[],
45462,Siglo ng Pagluluwal,111109,tt2028550,9.0,9,[],
45463,Betrayal,67758,tt0303758,3.8,4,[],
45464,Satana likuyushchiy,227506,tt0008536,0.0,0,[],


In [227]:
reviews=pd.read_csv('ReviewsDataset.csv')

In [228]:
reviews = reviews.filter(['label', 'sentences'])
print(len(reviews))
reviews.head()

928


Unnamed: 0,label,sentences
0,8,this movie came three now im twenty seven godd...
1,7,throw dice take turn jumanji made critics gurn...
2,8,you dont live live among remains dead people h...
3,5,good enough it doesnt come close disneys film ...
4,7,really solid entry series brosnan personally f...


In [229]:
reviews["sentences"].replace(" ", np.nan, inplace=True)
reviews.dropna(subset = ["sentences"], inplace=True)
print(len(reviews))

928


In [230]:
reviews.isna().any()

label        False
sentences    False
dtype: bool

In [231]:
reviews.to_csv('ReviewsDataset.csv')

In [232]:
X_train, X_test, y_train, y_test = train_test_split(reviews["sentences"], reviews["label"], test_size=0.20, random_state=42)

In [313]:
from keras.utils import np_utils

tokenizer = create_tokenizer(X_train)
print("Vocab size: ", get_vocab_size(tokenizer))

X_train_encoded = encode_text(tokenizer, X_train, get_max_len())
X_test_encoded = encode_text(tokenizer, X_test, get_max_len())
print("Input shape: ", X_train_encoded.shape)

y_train_encoded = np_utils.to_categorical(y_train)
#y_train_encoded = np.hstack((y_train_encoded))
y_test_encoded = np_utils.to_categorical(y_test)
print("Output shape: ", y_train_encoded.shape)

Vocab size:  21694
Input shape:  (742, 968)
Output shape:  (742, 9)


In [310]:
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

def custom_model(length, vocab_size):
    # channel 1
    inputs1 = Input(shape=(length,))
    embedding1 = Embedding(vocab_size, 100)(inputs1)
    conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    
    # channel 2
    inputs2 = Input(shape=(length,))
    embedding2 = Embedding(vocab_size, 100)(inputs2)
    conv2 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding2)
    drop2 = Dropout(0.5)(conv2)
    pool2 = MaxPooling1D(pool_size=2)(drop2)
    flat2 = Flatten()(pool2)
    
    # channel 3
    inputs3 = Input(shape=(length,))
    embedding3 = Embedding(vocab_size, 100)(inputs3)
    conv3 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding3)
    drop3 = Dropout(0.5)(conv3)
    pool3 = MaxPooling1D(pool_size=2)(drop3)
    flat3 = Flatten()(pool3)
    
    # merge
    merged = concatenate([flat1, flat2, flat3])
    
    # interpretation
    dense1 = Dense(9, activation='relu')(merged)
    outputs = Dense(9, activation='softmax')(dense1)
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
    
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # summarize
    print(model.summary())
    return model

In [311]:
model = custom_model(get_max_len(), get_vocab_size(tokenizer))

Tensor("dense_36/Relu:0", shape=(?, 9), dtype=float32)
Model: "model_17"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_55 (InputLayer)           (None, 968)          0                                            
__________________________________________________________________________________________________
input_56 (InputLayer)           (None, 968)          0                                            
__________________________________________________________________________________________________
input_57 (InputLayer)           (None, 968)          0                                            
__________________________________________________________________________________________________
embedding_55 (Embedding)        (None, 968, 100)     2169400     input_55[0][0]                   
____________________________________

In [268]:
from array import array

In [281]:
print(X_train_encoded.shape)

(742, 968)


In [312]:
print(y_train_encoded.shape)

(6678,)


In [279]:
np.array([X_train_encoded, X_train_encoded, X_train_encoded]).shape

(3, 742, 968)

In [317]:
# fit model
model.fit([X_train_encoded, X_train_encoded, X_train_encoded], y_train_encoded, epochs=10, batch_size=16)
loss, acc = model.evaluate([X_test_encoded, X_test_encoded, X_test_encoded], y_test_encoded, verbose=0)
print("Test Loss: ", loss)
print("Test Accuracy: ", acc)

# save the model
model.save('model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss:  0.5086772800773702
Test Accuracy:  0.8560335040092468
