We'll be using the Stanford AI Large Movie Review Dataset from https://ai.stanford.edu/~amaas/data/sentiment/

First, Connect our google drive, or upload the dataset directly to the google colab session files.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Extract the dataset to our working directory, or to your google colab folder if it is connected

In [None]:
import tarfile
tar = tarfile.open("./aclImdb_v1.tar.gz")
tar.extractall()
tar.close()

# Reading and preprocessing the data

In [1]:
import glob
import os
from random import shuffle
from tqdm import tqdm
def pre_process_data(filepath):

	positive_path = os.path.join(filepath, 'pos')
	negative_path = os.path.join(filepath, 'neg')
	pos_label = 1
	neg_label = 0
	dataset = []

  # for the sake of memory limitations, we'll limit ourselves with only 5000 negative and 5000 positive samples 
	files_count = 0
	files_limit = 5000
	for filename in tqdm(glob.glob(os.path.join(positive_path, '*.txt'))):
		with open(filename, 'r') as f:
			dataset.append((pos_label, f.read()))
		files_count += 1
		if files_count > files_limit:
 			break
      
	files_count = 0
	for filename in tqdm(glob.glob(os.path.join(negative_path, '*.txt'))):
		with open(filename, 'r') as f:
			dataset.append((neg_label, f.read()))
		files_count += 1
		if files_count > files_limit:
			break

	shuffle(dataset)

	return dataset[:]

In [2]:
dataset = pre_process_data('aclImdb/train')

 40%|██████████████▊                      | 5000/12500 [00:13<00:19, 383.21it/s]
 40%|██████████████▊                      | 5000/12500 [00:13<00:20, 373.12it/s]


# Tokenizing and Vectorizing the data with word embeddings

In [3]:
dataset[0]

(1,
 'Made only ten years after the actual events, and set in the Bunker under the Reichstag, Pabst\'s film is wholly gripping. It reeks of sulfurous death awaiting the perpetrators of world war. Haven\'t seen this in over three decades, but it remains strong in my visual and emotional memory. The characters seem to be waiting to be walled up in their cave. Searing bit of dialog between two Generals: "Does God exist?" "If He did, we wouldn\'t." Shame this is not more readily available for exhibition or purchase because it would be interesting to view and compare this film with the documentary about Traudl Junge, "Im Toten Winkel" {aka "Blind Spot: Hitler\'s Secretary") and "Downfall" with Bruno Ganz.')

In [4]:
import numpy as np

In [5]:
import gensim
w2v = gensim.models.KeyedVectors.load_word2vec_format('/Users/khodor/Documents/Efrei/Courses/2021-2022/S8/NLP Course/exercizes/6-NN/GoogleNews-vectors-negative300-SLIM.bin.gz', binary=True)

#'./drive/MyDrive/Colab Notebooks/GoogleNews-vectors-negative300-SLIM.bin.gz'


In [6]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors

In [7]:
def tokenize_and_vectorize(dataset):
	print('tokenizing and vectorizing')
	tokenizer = TreebankWordTokenizer()
	vectorized_data = []
	expected = []
	for sample in tqdm(dataset):
		tokens = tokenizer.tokenize(sample[1])
		sample_vecs = []
		for token in tokens:
			try:
				sample_vecs.append(w2v[token])
			except KeyError:
				pass # No matching token in the Google w2v vocab
		
		vectorized_data.append(sample_vecs)
	
	return vectorized_data

In [8]:
def collect_expected(dataset):
    expected = []
    for sample in tqdm(dataset):
        expected.append(sample[0])
    return expected

In [9]:
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

tokenizing and vectorizing


100%|███████████████████████████████████| 10002/10002 [00:08<00:00, 1236.08it/s]
100%|████████████████████████████████| 10002/10002 [00:00<00:00, 3292373.93it/s]


In [11]:
# delete the word2vec model to save memory
# w2v = None

In [10]:
split_point = int(len(vectorized_data)*.8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

# Model Building

In [None]:
from keras.preprocessing import sequence # A helper module to handle padding input
from keras.models import Sequential # The base Keras neural network model
from keras.layers import Dense, Dropout, Activation #The layer objects you’ll pile into the model
from keras.layers import Conv1D, GlobalMaxPooling1D # Your convolution layer, and pooling

In [13]:
maxlen = 200
batch_size = 32 # How many samples to show the net before backpropagating the error and updating the weights
embedding_dims = 300 # Length of the token vectors you’ll create for passing into the convnet
filters = 100 # Number of filters you’ll train
kernel_size = 3 # The width of the filters; actual filters will each be a matrix
                # of weights of size: embedding_dims x kernel_size, or 50 x 3 in your case
hidden_dims = 100 # Number of neurons in the plain feedforward net at the end of the chain
epochs = 2 # Number of times you’ll pass the entire training dataset through the network

### For a given dataset pad with zero vectors or truncate to maxlen

In [14]:
def pad_trunc(data, maxlen):
	new_data = []
	# Create a vector of 0s the length of our word vectors
	zero_vector = []
	for _ in range(len(data[0][0])):
		zero_vector.append(0.0)
	
	for sample in tqdm(data):
		if len(sample) > maxlen:
			temp = sample[:maxlen]
		elif len(sample) < maxlen:
			temp = sample
			# Append the appropriate number 0 vectors to the list
			additional_elems = maxlen - len(sample)
			for _ in range(additional_elems):
				temp.append(zero_vector)
		else:
			temp = sample
		new_data.append(temp)
	return new_data

In [15]:
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

100%|██████████| 8001/8001 [00:00<00:00, 145174.26it/s]
100%|██████████| 2001/2001 [00:00<00:00, 106463.06it/s]


## Model Layers Creation

In [17]:
model = Sequential()
model.add(Conv1D( # the first layer you add is the convolutional layer
  filters,
  kernel_size,
  padding='valid', # assume that it's ok for the output to be of smaller domentions than the input 
  activation='relu',
  strides=1,
  input_shape=(maxlen, embedding_dims))
)

Build model...


In [18]:
model.add(GlobalMaxPooling1D()) # Max Pooling layer.
                                # Instead of taking the max of a small subsection of each filter’s output,
                                # you’re taking the max of the entire output for that filter

In [19]:
model.add(Dense(hidden_dims)) # standard feedforward network
model.add(Dropout(0.2)) # only 80% of the embedding data, randomly chosen for
                        # each training sample, will pass into the next layer as it is. The rest will go in as 0s.
model.add(Activation('relu')) # use the Rectified Linear Units activation (relu) on the output end of each neuron.

In [20]:
# Here is the actual classifier
# a neuron that fires based on the sigmoid activation function; it gives a value between 0 and 1.
model.add(Dense(1))
model.add(Activation('sigmoid'))

## Compile the Model

In [21]:
model.compile(loss='binary_crossentropy', #The loss function is what the network will try to minimize
  optimizer='adam', # an optimization algorithm to minimizing the loss function
  metrics=['accuracy'] # based on what will we evaluate the performance of our model
)

## Start the Training

In [23]:
model.fit(x_train, y_train,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(x_test, y_test))

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f9b9ff9a990>

## Test the Model

In [30]:
smpl =  tokenize_and_vectorize([(1, "The sugar is sweet")])
smpl = pad_trunc(smpl, maxlen)
test_vec = np.reshape(smpl, (len(smpl), maxlen, embedding_dims))
model.predict_classes(test_vec)

100%|██████████| 1/1 [00:00<00:00, 3266.59it/s]
100%|██████████| 1/1 [00:00<00:00, 1803.23it/s]

tokenizing and vectorizing





array([[1]], dtype=int32)

## Save and Load

In [None]:
# save the structure of the created model
model_structure = model.to_json()
with open("cnn_model.json", "w") as json_file:
  json_file.write(model_structure)

# save the trained weights of the model
model.save_weights("cnn_weights.h5")

In [None]:
from keras.models import model_from_json

# load the structure of a previously created model
with open("cnn_model.json", "r") as json_file:
  json_string = json_file.read()
model = model_from_json(json_string)

# load the previously trained weights
model.load_weights('cnn_weights.h5')