<a href="https://colab.research.google.com/github/Ankush-Chander/deep-learning-101/blob/main/linear_classification_with_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Report how much memory this runtime has. Note the figure comes from psutil's
# `total` field (scaled by 1e9 bytes) even though the message says "available".
from psutil import virtual_memory
memory_stats = virtual_memory()
ram_gb = memory_stats.total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Your runtime has 27.3 gigabytes of available RAM



In [None]:
# Load the IMDB review dataset (downloaded on first use), capped at a
# 10,000-word vocabulary via `num_words`.
from tensorflow.keras.datasets import imdb
imdb_splits = imdb.load_data(num_words=10000)
(train_data, train_labels), (test_data, test_labels) = imdb_splits



Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [None]:
# Prepare the two lookups needed to turn token ids into embedding vectors.

# 1. Pretrained GloVe embeddings, fetched through gensim's downloader
import gensim.downloader as api
word_vectors = api.load("glove-wiki-gigaword-100")

# 2. The IMDB word -> index table shipped with Keras, inverted to index -> word
word_index = imdb.get_word_index()
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

# print(" ".join([reverse_word_index.get(i-3, "?") for i in train_data[0]]))
def tokenize_and_vectorize(train_data):
  """Turn sequences of IMDB token ids into sequences of word-embedding vectors.

  Reads the module-level `reverse_word_index` (index -> word) and
  `word_vectors` (word -> embedding vector) lookups.

  Args:
    train_data: iterable of samples, each an iterable of integer token ids
      as produced by `imdb.load_data`.

  Returns:
    A list with one entry per sample; each entry is the list of embedding
    vectors for the tokens that could be resolved, in their original order.
    Tokens that cannot be resolved are dropped, so entries vary in length
    and may be empty.
  """
  # Keras' IMDB encoding shifts real word indices by 3; the ids below that
  # are reserved markers (padding / start-of-sequence / out-of-vocabulary).
  index_offset = 3
  vectorized_train_data = []
  for sample in train_data:
    sample_vector = []
    for token_id in sample:
      word = reverse_word_index.get(token_id - index_offset)
      if word is None:
        # Reserved marker rather than a word. Previously these fell back to
        # the literal "?" token and were embedded as punctuation; skip them
        # the same way any other unknown word is skipped.
        continue
      try:
        sample_vector.append(word_vectors[word])
      except KeyError:
        # Deliberate best effort: words without a pretrained embedding
        # are left out of the sample.
        continue
    vectorized_train_data.append(sample_vector)

  return vectorized_train_data


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [None]:
# Labels are used unchanged as targets; the reviews are embedded token by token
expected = train_labels
vectorized_data = tokenize_and_vectorize(train_data)

In [None]:
# test/train split: the first 80% of samples train the model, the rest evaluate it
split_point = int(0.8 * len(vectorized_data))
x_train, x_test = vectorized_data[:split_point], vectorized_data[split_point:]
y_train, y_test = expected[:split_point], expected[split_point:]
# Displayed as the cell result: number of embedded tokens in the second training sample
len(x_train[1])




185

In [None]:
# CNN parameters

# Input shape
# maximum length of sentences to be considered for sentiment analysis
maxlen = 400
# length of token embedding vector
embedding_dims = 100

# Convolution
# Number of filters you will train
filters = 250
# Width of filter
kernel_size = 3

# Number of neurons in the plain feedforward net at the end of the chain
hidden_dims = 250

# Training
# How many samples to show to the neural net before backpropagation
batch_size = 32
# Number of times you will pass the entire dataset through the network
epochs = 2


In [None]:
# Padding and truncating your token sequence
import numpy as np
# Look at the data first: shortest sample, longest sample, embedding width
print(min(map(len, x_train)))
print(max(map(len, x_train)))
print(len(x_train[0][0]))

def pad_trunc(dataset, maxlen, embedding_dims):
  """Force every sample to exactly `maxlen` token vectors.

  Samples longer than `maxlen` keep only their first `maxlen` vectors;
  shorter ones are extended at the end with all-zero vectors of length
  `embedding_dims`.

  Args:
    dataset: iterable of samples, each a list of token vectors.
    maxlen: number of token vectors every returned sample will have.
    embedding_dims: length of the zero vector used for padding.

  Returns:
    A new list of samples; the input samples are not modified.
  """
  resized = []
  for sample in dataset:
    kept = sample[:maxlen]
    missing = maxlen - len(sample)
    # One zero vector per sample, repeated by reference for each pad slot;
    # a non-positive `missing` repeats it zero times.
    zero_vector = [0.] * embedding_dims
    resized.append(kept + missing * [zero_vector])
  return resized

# Bring every sample to `maxlen` vectors, then pack inputs and labels into arrays.
# NOTE: x_train / x_test are rebound here, so re-running this cell without
# re-running the split cell feeds arrays (not lists) back into pad_trunc.
x_train = pad_trunc(x_train, maxlen, embedding_dims)
x_test = pad_trunc(x_test, maxlen, embedding_dims)

sample_shape = (maxlen, embedding_dims)
x_train = np.reshape(x_train, (len(x_train),) + sample_shape)
x_test = np.reshape(x_test, (len(x_test),) + sample_shape)

y_train = np.array(y_train)
y_test = np.array(y_test)




10
2477
100


In [None]:
# Construct a 1D CNN
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D

model = Sequential([
  # Slide `filters` kernels of width `kernel_size` over the token axis
  Conv1D(
    filters=filters,
    kernel_size=kernel_size,
    strides=1,
    padding="valid",
    activation="relu",
    input_shape=(maxlen, embedding_dims),
  ),
  # Keep each filter's strongest response across the whole sequence
  GlobalMaxPooling1D(),
  # Fully connected hidden layer with dropout
  Dense(hidden_dims),
  Dropout(0.2),
  Activation('relu'),
  # Single sigmoid unit for the binary sentiment score
  Dense(1),
  Activation("sigmoid"),
])


In [None]:
# Compile the CNN: binary cross-entropy loss, Adam optimizer, accuracy reported while training
model.compile(
  loss='binary_crossentropy',
  optimizer='adam',
  metrics=['accuracy'],
)

In [None]:
# Train your model, scoring the held-out split after every epoch
model.fit(
  x_train,
  y_train,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(x_test, y_test),
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f9b7ba36b90>

In [None]:
# Save your model in two parts: the architecture as JSON text,
# and the trained weights in a separate file
model_structure = model.to_json()
with open("cnn_model.json", "w") as structure_file:
  structure_file.write(model_structure)
model.save_weights("cnn_weights.h5")

In [None]:
# Use the model in the pipeline
# load model: rebuild the architecture from its JSON description,
# then restore the trained weights into it

from keras.models import model_from_json
with open("cnn_model.json", "r") as structure_file:
  json_string = structure_file.read()

loaded_model = model_from_json(json_string)
loaded_model.load_weights('cnn_weights.h5')

# Hand-written sentences used to probe the reloaded model
negative_sample = "I didnt like the movie at all."
positive_sample = "The movie was awesome"
ambigous_sample1 = "The acting was awesome but plot was horrible."
ambigous_sample2 = "The movie was okayish."

def vectorize_input_text(dataset):
  """Convert raw text strings into lists of word-embedding vectors.

  Each text is lowercased and split into words, treating whitespace and
  punctuation as separators, before every word is looked up in the
  module-level `word_vectors`. Without this normalization, capitalized
  words ("The") and words with attached punctuation ("horrible.") miss the
  lookup and silently disappear from the model input.

  Args:
    dataset: iterable of text strings.

  Returns:
    A list with one entry per text; each entry is the list of embedding
    vectors for the words found in `word_vectors`, in order of appearance.
    Words without an embedding are skipped, so an entry may be empty.
  """
  # The characters Keras' text Tokenizer filters out by default (the
  # apostrophe is intentionally kept). Presumably this mirrors how the IMDB
  # training tokens were produced -- confirm if the dataset changes.
  separators = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
  to_spaces = str.maketrans(separators, " " * len(separators))

  vectorized_dataset = []
  for text in dataset:
    sample_vector = []
    # Lowercasing assumes the embedding vocabulary is keyed by lowercase
    # words -- verify if a different embedding set is loaded.
    for word in text.lower().translate(to_spaces).split():
      try:
        sample_vector.append(word_vectors[word])
      except KeyError:
        # Deliberate best effort: words without an embedding are left out.
        continue
    vectorized_dataset.append(sample_vector)
  return vectorized_dataset


def predict_sentiment(input_data:list):
  """Run the reloaded model on a batch of raw text strings.

  The texts are embedded with `vectorize_input_text`, brought to a fixed
  length with `pad_trunc`, packed into a (num_texts, maxlen, embedding_dims)
  array and handed to `loaded_model.predict`.

  Args:
    input_data: list of text strings to score.

  Returns:
    Whatever `loaded_model.predict` returns for the batch.
  """
  embedded = vectorize_input_text(input_data)
  fixed_length = pad_trunc(embedded, maxlen=maxlen, embedding_dims=embedding_dims)
  batch_shape = (len(fixed_length), maxlen, embedding_dims)
  return loaded_model.predict(np.reshape(fixed_length, batch_shape))

# Score all four probe sentences in a single batch
probe_texts = [negative_sample, positive_sample, ambigous_sample1, ambigous_sample2]
x = predict_sentiment(probe_texts)
print(x)

[[0.25989667]
 [0.87162495]
 [0.5543196 ]
 [0.2896278 ]]


In [None]:
# Print each layer of the trained network with its output shape and parameter count
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 398, 250)          75250     
                                                                 
 global_max_pooling1d (Globa  (None, 250)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 250)               62750     
                                                                 
 dropout (Dropout)           (None, 250)               0         
                                                                 
 activation (Activation)     (None, 250)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 251       
                                                        