In [2]:
!pip install nlpia

Collecting nlpia
[?25l  Downloading https://files.pythonhosted.org/packages/89/f6/ab35e962dd0b19f1008e88e788b202d45a90d9cd70b9bbf0ac26489ee260/nlpia-0.5.2-py2.py3-none-any.whl (32.0MB)
[K     |████████████████████████████████| 32.0MB 148kB/s 
Collecting python-Levenshtein
[?25l  Downloading https://files.pythonhosted.org/packages/2a/dc/97f2b63ef0fa1fd78dcb7195aca577804f6b2b51e712516cc0e902a9a201/python-Levenshtein-0.12.2.tar.gz (50kB)
[K     |████████████████████████████████| 51kB 6.2MB/s 
[?25hCollecting html2text
  Downloading https://files.pythonhosted.org/packages/ae/88/14655f727f66b3e3199f4467bafcc88283e6c31b562686bf606264e09181/html2text-2020.1.16-py3-none-any.whl
Collecting pypandoc
  Downloading https://files.pythonhosted.org/packages/d6/b7/5050dc1769c8a93d3ec7c4bd55be161991c94b8b235f88bf7c764449e708/pypandoc-1.5.tar.gz
Collecting pugnlp
[?25l  Downloading https://files.pythonhosted.org/packages/f7/20/a9f0b1f45c074c63da716fc1b301916cc3b64c11d5cbf7cb305eafaf158a/pugnlp-0.2

In [3]:
import numpy as np
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Conv1D, GlobalMaxPooling1D

# read data
import glob
import os
from random import shuffle

from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
from nlpia.loaders import get_data

  [datetime.datetime, pd.datetime, pd.Timestamp])
  MIN_TIMESTAMP = pd.Timestamp(pd.datetime(1677, 9, 22, 0, 12, 44), tz='utc')
  np = pd.np
  np = pd.np
  np = pd.np
  np = pd.np


In [4]:
# Mount Google Drive (Colab only) at the relative path 'drive'.
# NOTE(review): `filepath` below is the absolute '/content/drive/...', so this
# presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [7]:
# Root of the training data; pre_process_data() reads its 'pos' and 'neg' sub-directories.
filepath = '/content/drive/MyDrive/sentiments/aclImdb/train'

In [9]:
# load data
def pre_process_data(filepath, seed=None):
    """
    Load labelled texts from ``filepath`` and return them in shuffled order.

    Every ``*.txt`` file under ``<filepath>/pos`` is labelled 1 and every
    ``*.txt`` file under ``<filepath>/neg`` is labelled 0.

    Args:
        filepath: Directory containing the ``pos`` and ``neg`` sub-directories.
        seed: Optional seed for the shuffle. With the default ``None`` the
            module-level ``shuffle`` is used (a different order on every run);
            with a value the order is reproducible.

    Returns:
        A list of ``(label, text)`` tuples in random order.
    """
    from random import Random  # local import: only used for the seeded shuffle

    dataset = []
    for subdir, label in (('pos', 1), ('neg', 0)):
        pattern = os.path.join(filepath, subdir, '*.txt')
        # glob returns files in arbitrary order; sorting makes the pre-shuffle
        # order stable so that a seeded shuffle is actually reproducible.
        for filename in sorted(glob.glob(pattern)):
            # Explicit encoding: do not depend on the platform's default codec.
            with open(filename, 'r', encoding='utf-8') as f:
                dataset.append((label, f.read()))

    if seed is None:
        shuffle(dataset)
    else:
        Random(seed).shuffle(dataset)

    return dataset

# Shuffled list of (label, text) tuples: label 1 for 'pos' files, 0 for 'neg' files.
dataset = pre_process_data(filepath)

In [None]:
# Vectorizer and tokenizer
# Word vectors fetched through nlpia's get_data('w2v'); tokenize_and_vectorize()
# below looks each token up in this object.
# NOTE(review): limit=200000 presumably caps how many vectors are loaded —
# confirm against the nlpia loader.
word_vectors = get_data('w2v', limit=200000)
def tokenize_and_vectorize(dataset):
    """
    Turn every sample's text into a list of word vectors.

    Args:
        dataset: Iterable of ``(label, text)`` samples; only the text
            (index 1) is read.

    Returns:
        A list with one entry per sample, each a list of the vectors found in
        the module-level ``word_vectors`` for that sample's tokens. Tokens
        without a vector are skipped, so an entry can be shorter than the
        sample's token count, or empty.
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        sample_vecs = []
        for token in tokenizer.tokenize(sample[1]):
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass  # No matching token in the Google w2v vocab
        vectorized_data.append(sample_vecs)

    return vectorized_data

def collect_expected(dataset):
    """ Return the label (first element) of every sample, in dataset order """
    return [label_and_text[0] for label_and_text in dataset]

# Features and labels are kept as two parallel lists, both in `dataset` order.
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

100%|██████████| 402111/402111 [00:15<00:00, 25896.96it/s]






In [None]:
# train/test split
# The first 80% of the (already shuffled) samples train the model; the
# remaining 20% are held out.
split_point = int(len(vectorized_data) * .8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

# CNN params
maxlen = 400  # vectors per sample after pad_trunc(); first dimension of the Conv1D input_shape
batch_size = 32  # passed to model.fit()
embedding_dims = 300  # length of each token vector; second dimension of the Conv1D input_shape
filters = 250  # number of Conv1D filters
kernel_size = 3  # Conv1D kernel size
hidden_dims = 250  # units in the Dense layer that follows the pooling layer
epochs = 2  # passed to model.fit()

In [None]:
def pad_trunc(data, maxlen):
    """
    Pad with zero vectors or truncate every sample to exactly ``maxlen`` vectors.

    Args:
        data: Sequence of samples; each sample is a list of token vectors that
            all share one length (a sample may be empty).
        maxlen: Number of vectors every returned sample must contain.

    Returns:
        A new list with one list per input sample, each of length ``maxlen``.
        The input samples themselves are never modified.

    Raises:
        ValueError: If a sample needs padding but every sample is empty, so
            the vector length cannot be inferred.
    """
    # Infer the vector length from the first sample that has any vectors, so an
    # empty leading sample (all tokens out of vocabulary) does not break padding.
    vector_len = next((len(sample[0]) for sample in data if len(sample) > 0), None)
    # A single zero vector is shared by every padding slot to keep memory flat;
    # it must therefore never be modified in place.
    zero_vector = None if vector_len is None else [0.0] * vector_len

    new_data = []
    for sample in data:
        # Work on a copy so padding never grows the caller's list.
        temp = list(sample[:maxlen])
        additional_elems = maxlen - len(temp)
        if additional_elems > 0:
            if zero_vector is None:
                raise ValueError(
                    "cannot pad: every sample is empty, vector length is unknown")
            temp.extend([zero_vector] * additional_elems)
        new_data.append(temp)
    return new_data

#  pass train and test data into the padder/truncator
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)
# Convert the nested lists to arrays of shape (n_samples, maxlen, embedding_dims),
# matching the input_shape given to the Conv1D layer; labels become arrays too.
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)


    1. For each input example, you applied a filter (weights and an activation function).
    2. You convolved each filter across the length of the input, which outputs a 1D vector
    slightly smaller than the original input (1 x 398, i.e. 400 - 3 + 1 positions, with the
    filter starting left-aligned and finishing right-aligned) for each filter.
    3. For each filter output (there are 250 of them, remember), you took the single
    maximum value from each 1D vector.
    4. At this point you have a single vector (per input example) that is 1 x 250 (the
    number of filters).


In [None]:
# CNN model
# Layer stack: 1-D convolution -> global max pooling -> dense hidden layer
# (dropout, then ReLU) -> one sigmoid unit for the binary label.
model = Sequential([
    Conv1D(filters,
           kernel_size,
           padding='valid',
           activation='relu',
           strides=1,
           input_shape=(maxlen, embedding_dims)),
    GlobalMaxPooling1D(),
    Dense(hidden_dims),
    Dropout(0.2),
    Activation('relu'),
    Dense(1),
    Activation('sigmoid'),
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train the model. The held-out split is passed as validation data, so the
# validation metrics reported during training are computed on x_test / y_test.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

In [None]:
# save model
# The architecture is written as JSON; the weights go to a separate file.
model_structure = model.to_json()
with open("cnn_model.json", "w") as structure_file:
    structure_file.write(model_structure)
model.save_weights("cnn_weights.h5")

In [None]:
# using model
# Import from tensorflow.keras (not the standalone keras package) so the loader
# comes from the same package that built and serialized the model above.
from tensorflow.keras.models import model_from_json

with open("cnn_model.json", "r") as json_file:
    json_string = json_file.read()

model = model_from_json(json_string)
model.load_weights('cnn_weights.h5')

# Example text to classify; defined here so the cell runs on a fresh kernel.
sample_1 = ("I hate that the dismal weather had me down for so long, "
            "when will it break! Ugh, when does happiness return? "
            "The sun is blinding and the puffy clouds are too thin. "
            "I can't wait for the weekend.")

# The label in the tuple is a dummy value: tokenize_and_vectorize() only reads
# the text at index 1.
vec_list = tokenize_and_vectorize([(1, sample_1)])
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))
# predict() returns the sigmoid output; for a hard 0/1 label use
# (model.predict(test_vec) > 0.5).astype('int32') — predict_classes() is not
# available in newer TensorFlow releases.
model.predict(test_vec)