In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input
from tensorflow.keras.models import Sequential
import nltk
import string
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
pd.set_option("display.max_columns", None)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cgrow\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\cgrow\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cgrow\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [94]:
imdb_db_full = pd.read_csv('imdb_labelled.txt', sep='\t')
n_cols = imdb_db_full.shape[1]

reviews = imdb_db_full.iloc[:,0]
labels = imdb_db_full.iloc[:,1:]

print(reviews)
print(labels)

0      Not sure who was more lost - the flat characte...
1      Attempting artiness with black & white and cle...
2           Very little music or anything to speak of.  
3      The best scene in the movie was when Gerardo i...
4      The rest of the movie lacks art, charm, meanin...
                             ...                        
742    I just got bored watching Jessice Lange take h...
743    Unfortunately, any virtue in this film's produ...
744                     In a word, it is embarrassing.  
745                                 Exceptionally bad!  
746    All in all its an insult to one's intelligence...
Name: A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  , Length: 747, dtype: object
     0
0    0
1    0
2    0
3    1
4    0
..  ..
742  0
743  0
744  0
745  0
746  0

[747 rows x 1 columns]


In [95]:
review_text = np.asarray(reviews)
print(f"There are {len(review_text)} reviews.")
review_text = review_text[~pd.isnull(review_text)]
print(f"There are {len(review_text)} reviews.")

There are 747 reviews.
There are 747 reviews.


In [96]:
rev_labels = np.asarray(labels)
print(f"There are {len(rev_labels)} labels.")
rev_labels = rev_labels[~pd.isnull(rev_labels)]
print(f"There are {len(rev_labels)} labels.")

There are 747 labels.
There are 747 labels.


In [97]:
sentence_tokens = [sent_tokenize(review.lower()) for review in review_text]
print(f'Tokenized into {len(sentence_tokens)} elements.')
sentence_tokens[0]

Tokenized into 747 elements.


['not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.']

In [98]:
stops = stopwords.words('english') + list(string.punctuation) + ['...',' - ', 'ca', 'wo', "'s", "'ing","'ll", "'re"]
print(f"{len(stops)} stopwords")
stops

238 stopwords


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [99]:
negation = ['but', 'not', "don't", "aren't", "couldn't", "doesn't", "hadn't", "hasn't", "haven't", 
                  "isn't", "shouldn't", "wouldn't"]
for word in negation:
    stops.remove(word)
print(f"{len(stops)} stopwords")
stops

226 stopwords


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'by',
 'can',
 'couldn',
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 'doing',
 'don',
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 'has',
 'hasn',
 'have',
 'haven',
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan

In [100]:
word_tokens = []

for sentences in sentence_tokens:
    temporary_list = []

    if (len(sentences)) == 1:
        words_in_sentence = word_tokenize(sentences[0])
        for word in words_in_sentence:
            if word == "n't":
                word = "not"
            temporary_list.append(word)
    else:
        num_sentences = len(sentences)

        for i in range(0, num_sentences):
            words_in_sentence = word_tokenize(sentences[i])
            for word in words_in_sentence:
                if word == "n't":
                    word = "not"
                temporary_list.append(word)

    if len(temporary_list) == 0:
        temporary_list = ['empty']

    trimmed_words = [word for word in temporary_list if word not in stops]
    word_tokens.append(trimmed_words)

In [101]:
i = 0 
bag_of_words = []
max_review_length = 0

for words in word_tokens:
    if len(words) > max_review_length:
        max_review_length = len(words)
        print(f"New maximum review length found: Index {i} contains {max_review_length} word tokens.")
    for word in words:
        bag_of_words.append(word)
    i += 1 

unique_words = set(bag_of_words)
num_uniq_words = len(unique_words)
print(f"There are {len(bag_of_words)} words in this dataset, of which {num_uniq_words} words are unique. \nThe maximum number of words in any single review is {max_review_length}.")

New maximum review length found: Index 0 contains 9 word tokens.
New maximum review length found: Index 1 contains 18 word tokens.
New maximum review length found: Index 18 contains 508 word tokens.
New maximum review length found: Index 135 contains 805 word tokens.
There are 8043 words in this dataset, of which 3001 words are unique. 
The maximum number of words in any single review is 805.


In [102]:
word_tokens[135]

['fact',
 'hard',
 'remember',
 'part',
 'ray',
 'charles',
 'acted',
 'not',
 'played',
 'man',
 '1',
 'ray',
 'charles',
 'legendary',
 '1',
 'ray',
 'charles',
 'life',
 'provided',
 'excellent',
 'biographical',
 'material',
 'film',
 'goes',
 'well',
 'beyond',
 'another',
 'movie',
 'musician',
 '1',
 'hitchcock',
 'great',
 'director',
 '1',
 'ironically',
 'mostly',
 'find',
 'films',
 'total',
 'waste',
 'time',
 'watch',
 '0',
 'secondly',
 'hitchcock',
 'pretty',
 'much',
 'perfected',
 'thriller',
 'chase',
 'movie',
 '1',
 'pandering',
 'audience',
 'sabotages',
 'films',
 '0',
 'hence',
 'whole',
 'story',
 'lacks',
 'certain',
 'energy',
 '0',
 'plot',
 'simply',
 'rumbles',
 'like',
 'machine',
 'desperately',
 'depending',
 'addition',
 'new',
 'scenes',
 '0',
 'usual',
 'hitchcock',
 'logic',
 'flaws',
 '0',
 'mishima',
 'extremely',
 'uninteresting',
 '0',
 'chilly',
 'unremarkable',
 'movie',
 'author',
 'living/working',
 'chilly',
 'abstruse',
 'culture',
 '0',
 '

In [103]:
row_list = []
i = 0
# Iterate through each index across the cleaned up word_tokens
for i in range(0, len(word_tokens)):
    # Generate a temporary dictionary and a temporary string
    temp_dict = {}
    temp_string = ""
    # Take each word out of the indexed word_tokens (instead of taking the entire list at that index) and add it to temp_string
    for word in word_tokens[i]:
        temp_string = temp_string + word + " "
    # Update dictionary with the appropriate key-value pairs (the columns we want and the values we want)
    temp_dict.update({"recommended" : rev_labels[i]})
    temp_dict.update({"review_text" : temp_string})
    # Add the temporary dictionary to the list of dictionaries
    row_list.append(temp_dict)
    # Iterate the counter
    i += 1
# Use the list of dictionaries to quickly build a dataframe
new_df = pd.DataFrame(row_list)
new_df

Unnamed: 0,recommended,review_text
0,0,not sure lost flat characters audience nearly ...
1,0,attempting artiness black white clever camera ...
2,0,little music anything speak
3,1,best scene movie gerardo trying find song keep...
4,0,rest movie lacks art charm meaning emptiness w...
...,...,...
742,0,got bored watching jessice lange take clothes
743,0,unfortunately virtue film production work lost...
744,0,word embarrassing
745,0,exceptionally bad


In [104]:
X = new_df['review_text']
y = new_df['recommended']

X_train, X_remaining, y_train, y_remaining = train_test_split(X, y, train_size=.7, random_state = 1987)

X_valid, X_test, y_valid, y_test = train_test_split(X_remaining, y_remaining, test_size=0.5, random_state = 1987)

In [105]:
tokenizer = Tokenizer(num_words= 5000, lower= False)
tokenizer.fit_on_texts(X_train)
index_size = len(tokenizer.word_index) + 1
print(f"The word index is {index_size} elements long.")
tokenizer.word_index

The word index is 2356 elements long.


{'movie': 1,
 'not': 2,
 'film': 3,
 '0': 4,
 '1': 5,
 'but': 6,
 'bad': 7,
 'one': 8,
 'good': 9,
 'like': 10,
 'really': 11,
 'great': 12,
 'even': 13,
 'acting': 14,
 'see': 15,
 'time': 16,
 'movies': 17,
 'characters': 18,
 'story': 19,
 'plot': 20,
 "''": 21,
 'could': 22,
 'well': 23,
 'ever': 24,
 '10': 25,
 'way': 26,
 'made': 27,
 'script': 28,
 'would': 29,
 'watching': 30,
 'films': 31,
 'seen': 32,
 'character': 33,
 'better': 34,
 'best': 35,
 'real': 36,
 'watch': 37,
 'scenes': 38,
 'look': 39,
 'love': 40,
 'think': 41,
 'also': 42,
 'wonderful': 43,
 'actors': 44,
 'funny': 45,
 'everything': 46,
 'excellent': 47,
 'make': 48,
 'years': 49,
 'show': 50,
 'people': 51,
 'awful': 52,
 'little': 53,
 'still': 54,
 'nothing': 55,
 'never': 56,
 'every': 57,
 'cast': 58,
 'much': 59,
 'get': 60,
 'ending': 61,
 'many': 62,
 'art': 63,
 'dialogue': 64,
 'man': 65,
 'short': 66,
 'anyone': 67,
 'worth': 68,
 'go': 69,
 'work': 70,
 'scene': 71,
 'thing': 72,
 'cinematography

In [106]:
X_train_encoded = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_encoded, max_review_length, padding='post')
print(f"The shape of the padded reviews within the training set is {X_train_padded.shape}")
print(X_train_padded)

The shape of the padded reviews within the training set is (522, 805)
[[ 436   30    0 ...    0    0    0]
 [ 294   22  800 ...    0    0    0]
 [ 802   20  167 ...    0    0    0]
 ...
 [2348    2   68 ...    0    0    0]
 [ 113  113  113 ...    0    0    0]
 [ 278 2349 2350 ...    0    0    0]]


In [107]:
X_valid_encoded = tokenizer.texts_to_sequences(X_valid)
X_valid_padded = pad_sequences(X_valid_encoded, max_review_length, padding='post')
print(f"The shape of the padded reviews within the training set is {X_valid_padded.shape}")
print(X_valid_padded)

The shape of the padded reviews within the training set is (112, 805)
[[   1 1840    0 ...    0    0    0]
 [ 116    0    0 ...    0    0    0]
 [ 182   14  267 ...    0    0    0]
 ...
 [1483  595    0 ...    0    0    0]
 [ 145   26   33 ...    0    0    0]
 [ 235 1396  377 ...    0    0    0]]


In [108]:
X_test_encoded = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_encoded, max_review_length, padding='post')
print(f"The shape of the padded reviews within the training set is {X_test_padded.shape}")
print(X_test_padded)

The shape of the padded reviews within the training set is (113, 805)
[[ 232  315  778 ...    0    0    0]
 [  20  436  144 ...    0    0    0]
 [ 108    3   82 ...    0    0    0]
 ...
 [ 508   25  902 ...    0    0    0]
 [  50   29 1437 ...    0    0    0]
 [ 573   65  781 ...    0    0    0]]


In [109]:
new_df.to_csv('task2.csv')

In [116]:
model = Sequential([Input(shape=(X_train_padded.shape)),Flatten(),
                    Dense(64, activation = "relu"), Dense(32, activation = "relu"),
                    Dense(1, activation='sigmoid')])
model.compile(optimizer= 'adam', loss= 'binary_crossentropy', metrics=['accuracy'])
print(model.summary())

None


In [119]:
early_stop_check = EarlyStopping(monitor= 'val_accuracy', patience=3)
results = model.fit(X_train_padded, y_train, validation_data= (X_valid_padded, y_valid), epochs=15, callbacks=early_stop_check)

Epoch 1/15


ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("sequential_19_1/Cast:0", shape=(None, 805), dtype=float32). Expected shape (None, 522, 805), but input has incompatible shape (None, 805)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 805), dtype=int32)
  • training=True
  • mask=None