In [None]:
# Deep Learning for Natural Language Processing (NLP)

# Software Infrastructure <a name="software_infrastructure"></a>

We will use the following libraries and tools:

- Python 3
- **Matplotlib**, a Python library for plotting.
- **Pandas**, a Python library which provides a dataframe data structure, and operations for manipulating numerical tables and time series.
- **NumPy**, a Python library which provides support for large, multi-dimensional arrays and matrices, as well as a large collection of high-level mathematical functions to operate on these arrays.
- Python **SciPy** library for scientific computing -- modules for optimization, linear algebra, integration, interpolation, special functions, FFT, signal and image processing, ODE solvers and other tasks common in science and engineering.
- Python **scikit-learn**, a machine learning library
- **Theano**, a numerical platform for developing deep learning models
- **TensorFlow**, another numerical platform for developing deep learning models
- **Keras**, a concise Python API for **Theano** and **TensorFlow**
- **NLTK**, a Python library for Natural Language Processing (NLP)
- **GenSim**, a Python library for vector space information retrieval modeling and topic modeling
- **pydot** and **graphviz** libraries

# NLP Topics

- Data cleaning and normalization
- Bag-of-words model for text
- Distributed representations of text using word embedding models
- Text classification
- Neural language models for text generation
- Neural sentiment analysis (a classification problem)
- Neural machine translation (translating text from one language to another)
- Generating captions for images


# Benefits of DL to NLP

- Scalability
- Hierarchical Feature Learning

# Commonly Used Optimization Algorithms

- Stochastic Gradient Descent (SGD)
- Adam (requires tuning of learning rate)
- RMSprop (requires tuning of learning rate)

## Construction of a Simple Multilayer Perceptron (MLP) for Binary Classification Problem

In [None]:
# construction of a simple MLP 
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense

# Build a small fully connected binary classifier with the Keras functional API:
# each layer object is called on the tensor produced by the previous layer.

# define input layer
# each sample is a vector of 10 features
visible = Input(shape=(10,))

# define hidden layers with relu activation function
# three stacked Dense layers with 10, 20 and 10 units
hidden1 = Dense(10, activation='relu')(visible)
hidden2 = Dense(20, activation='relu')(hidden1)
hidden3 = Dense(10, activation='relu')(hidden2)

# define output layer with sigmoid activation function
# a single sigmoid unit, as used for a binary classification output
output = Dense(1, activation='sigmoid')(hidden3)

# define MLP model
# the model is identified by its input tensor and its output tensor
model = Model(inputs=visible, outputs=output)

# summarize layers
model.summary()

# plot graph of the MLP model
# (left disabled; plot_model is imported at the top of the cell for this call)
# plot_model(model, to_file='mlp_model.png')

In the MLP model above,

   - How many neurons are there in the input layer?
   - How many neurons are there in the output layer?
   - How many hidden layers are there?
   - What activation functions are used in each hidden layer?
   - What activation function is used in the output layer?

## Construction of a Simple Convolutional Neural Network (CNN) 

In [None]:
# construction of a simple CNN
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
# import the layers from keras.layers itself rather than from the
# convolutional/pooling submodules, which are not a stable import path
from keras.layers import Conv2D
from keras.layers import MaxPooling2D

# input is black-and-white 64 x 64 images

# define input layer (height, width, channels)
visible = Input(shape=(64,64,1))

# define first convolutional layer with relu activation function
conv1 = Conv2D(32, kernel_size=4, activation='relu')(visible)

# define first pooling layer
pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)

# define second convolutional layer with relu activation function
conv2 = Conv2D(16, kernel_size=4, activation='relu')(pool1)

# define second pooling layer
pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)

# flatten the pooled feature maps into one vector per image;
# without this step Dense acts only on the last (channel) axis and the
# model would output a map of values per image instead of a single score
flat = Flatten()(pool2)

# define hidden layer with relu activation function
hidden1 = Dense(10, activation='relu')(flat)

# define output layer with sigmoid activation function
output = Dense(1, activation='sigmoid')(hidden1)

# define CNN model
model = Model(inputs=visible, outputs=output)

# summarize layers
model.summary()

# plot graph of the CNN model
# plot_model(model, to_file='cnn_model.png')

## Construction of a Simple Recurrent Neural Network (RNN)

In [None]:
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
# LSTM is exported by keras.layers; the keras.layers.recurrent submodule
# is a legacy path that is not available in every Keras release
from keras.layers import LSTM

# define input layer: sequences of 100 time steps with 1 feature per step
visible = Input(shape=(100,1))

# define hidden layers
# an LSTM layer with 10 units followed by a 10-unit fully connected layer
hidden1 = LSTM(10)(visible)
hidden2 = Dense(10, activation='relu')(hidden1)

# define output layer (single sigmoid unit)
output = Dense(1, activation='sigmoid')(hidden2)

# define RNN model
model = Model(inputs=visible, outputs=output)

# summarize layers
model.summary()

# plot graph of the RNN model
# plot_model(model, to_file='rnn_model.png')

## Textual Data Cleaning

In [1]:
# first install NLTK (one time task)
sudo pip install -U nltk

import nltk

# data download (one time task)
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [7]:
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
import string
import re

# load text file
filename = 'austen-emma.txt'
# the with-block closes the file even if read() raises
with open(filename, 'rt') as file:
    text = file.read()

# split text into sentences
sentences = sent_tokenize(text)
print(sentences[0])

# split text into words by white space as delimiter
words = text.split()

# print first one hundred words
print(words[:100])

# or, you can also split text on runs of non-word characters
# \W matches anything that is not a word character
# (word characters are a-z, A-Z, 0-9 and _ (underscore))
words = re.split(r'\W+', text)

# print first one hundred words
print(words[:100])

# which are Python punctuation characters?
print(string.punctuation)

# as before, split text into words using white space
words = text.split()

# normalization - convert to lower case
words = [word.lower() for word in words]
print(words[:100])

# character class matching any single punctuation character;
# re.escape keeps characters such as ] and \ from breaking the class
re_punc = re.compile('[%s]' % re.escape(string.punctuation))

# remove punctuation from each word
stripped = [re_punc.sub('', w) for w in words]

# punctuation-free words
print(stripped[:100])



[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.
['[Emma', 'by', 'Jane', 'Austen', '1816]', 'VOLUME', 'I', 'CHAPTER', 'I', 'Emma', 'Woodhouse,', 'handsome,', 'clever,', 'and', 'rich,', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition,', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence;', 'and', 'had', 'lived', 'nearly', 'twenty-one', 'years', 'in', 'the', 'world', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', 'her.', 'She', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate,', 'indulgent', 'father;', 'and', 'had,', 'in', 'consequence', 'of', 'her', "sister's", 'marriage,', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'per

In [6]:
# NLTK stopword list
from nltk.corpus import stopwords

# list of English stop words shipped with the NLTK stopwords corpus
stop_words = stopwords.words('english')
# show the full list
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
# NLTK tokenizer and stemmer
# re, string and stopwords are imported here so the cell runs on its own
# instead of relying on imports made by other cells
import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

filename = 'austen-emma.txt'
# the with-block closes the file even if read() raises
with open(filename, 'rt') as file:
    text = file.read()

# split into words using NLTK tokenizer
tokens = word_tokenize(text)
print(tokens[:100])

# convert to lower case
tokens = [w.lower() for w in tokens]

# regex for punctuation character filtering
re_punc = re.compile('[%s]' % re.escape(string.punctuation))

# remove punctuation from each word
stripped = [re_punc.sub('', w) for w in tokens]

# remove tokens that are not alphabetic
# filter the punctuation-stripped tokens, not the raw ones: otherwise the
# stripping step above has no effect and words such as 'twenty-one' are
# dropped entirely; tokens that were pure punctuation are now '' and are
# rejected by isalpha()
words = [word for word in stripped if word.isalpha()]
print(words[:100])

# finally, remove stop words
stop_words = set(stopwords.words('english'))

words = [w for w in words if w not in stop_words]
print(words[:100])


['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER', 'I', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', ',', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', ';', 'and', 'had', 'lived', 'nearly', 'twenty-one', 'years', 'in', 'the', 'world', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', 'her', '.', 'She', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', ',', 'indulgent', 'father', ';', 'and', 'had', ',', 'in', 'consequence', 'of', 'her', 'sister', "'s", 'marriage', ',', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period', '.', 'Her', 'mother', 'had', 'died']
['emma', 'by', 'jane', 'austen', 'volume', 'i', 'chapter', 'i', 'emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', 's

In [9]:
# word stemming using NLTK
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

filename = 'austen-emma.txt'
# the with-block closes the file even if read() raises
with open(filename, 'rt') as file:
    text = file.read()

# tokenize the text that was just loaded, so the cell does not depend on a
# `tokens` variable left behind by a different cell
tokens = word_tokenize(text)

# Porter stemmer
porter = PorterStemmer()

# stem words
stemmed = [porter.stem(word) for word in tokens]
print(stemmed[:100])

['[', 'emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter', 'i', 'emma', 'woodhous', ',', 'handsom', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfort', 'home', 'and', 'happi', 'disposit', ',', 'seem', 'to', 'unit', 'some', 'of', 'the', 'best', 'bless', 'of', 'exist', ';', 'and', 'had', 'live', 'nearli', 'twenty-on', 'year', 'in', 'the', 'world', 'with', 'veri', 'littl', 'to', 'distress', 'or', 'vex', 'her', '.', 'she', 'wa', 'the', 'youngest', 'of', 'the', 'two', 'daughter', 'of', 'a', 'most', 'affection', ',', 'indulg', 'father', ';', 'and', 'had', ',', 'in', 'consequ', 'of', 'her', 'sister', "'s", 'marriag', ',', 'been', 'mistress', 'of', 'hi', 'hous', 'from', 'a', 'veri', 'earli', 'period', '.', 'her', 'mother', 'had', 'die']


## Bag-of-Words Model (BoW)

- What matters is whether a word occurs in a document and, if so, how many times it occurs.

- First determine domain vocabulary.

- Then each document can be represented as a fixed size vector, and the length of the vector = domain vocabulary size.

- Word frequency information can be easily incorporated, and the vector now becomes a weighted vector.

- Next, we illustrate how to construct word vectors.

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# a one-document corpus used to learn the vocabulary
text = ["Word frequency information can be easily incorporated, and the vector now becomes a weighted vector."]

# learn the vocabulary from the sample text
vectorizer = CountVectorizer()
vectorizer.fit(text)

# mapping of each vocabulary word to its column index
print(vectorizer.vocabulary_)

# encode the document; the result is a sparse matrix
vector = vectorizer.transform(text)

# shape, type, and the dense NumPy form of the encoded vector, one per line
print(vector.shape, type(vector), vector.toarray(), sep='\n')

# a second document encoded with the same, already fitted vocabulary
text = ["Greenville information."]
vector = vectorizer.transform(text)

# same three-line summary for the second document
print(vector.shape, type(vector), vector.toarray(), sep='\n')


{'word': 12, 'frequency': 5, 'information': 7, 'can': 3, 'be': 1, 'easily': 4, 'incorporated': 6, 'and': 0, 'the': 9, 'vector': 10, 'now': 8, 'becomes': 2, 'weighted': 11}
(1, 13)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 1 1 1 1 1 1 1 1 1 2 1 1]]
(1, 13)
<class 'scipy.sparse.csr.csr_matrix'>
[[0 0 0 0 0 0 0 1 0 0 0 0 0]]


## Term Weighting Using Term Frequency - Inverse Document Frequency (TF-IDF)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# three sample documents
text = ["Word frequency information can be easily incorporated, and the vector now becomes a weighted vector.",
        "we can encode another document with the same vocabulary.",
        "Term Weighting Using Term Frequency - Inverse Document Frequency (TF-IDF)."]

# create the transform, then tokenize and build the vocabulary
vectorizer = TfidfVectorizer()
vectorizer.fit(text)

# vocabulary (word -> column index) followed by the learned IDF weights
print(vectorizer.vocabulary_, vectorizer.idf_, sep='\n')

# encode only the first document
first_document = [text[0]]
vector = vectorizer.transform(first_document)

# shape and dense form of the encoded document vector
print(vector.shape, vector.toarray(), sep='\n')

{'word': 25, 'frequency': 8, 'information': 11, 'can': 4, 'be': 2, 'easily': 6, 'incorporated': 10, 'and': 0, 'the': 17, 'vector': 19, 'now': 13, 'becomes': 3, 'weighted': 22, 'we': 21, 'encode': 7, 'another': 1, 'document': 5, 'with': 24, 'same': 14, 'vocabulary': 20, 'term': 15, 'weighting': 23, 'using': 18, 'inverse': 12, 'tf': 16, 'idf': 9}
[1.69314718 1.69314718 1.69314718 1.69314718 1.28768207 1.28768207
 1.69314718 1.69314718 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718 1.28768207
 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718]
(1, 26)
[[0.26050857 0.         0.26050857 0.26050857 0.19812348 0.
  0.26050857 0.         0.19812348 0.         0.26050857 0.26050857
  0.         0.26050857 0.         0.         0.         0.19812348
  0.         0.52101713 0.         0.         0.26050857 0.
  0.         0.26050857]]


## Hashing

- Hash words so that they are converted into integers.

- Avoids problems that arise due to large vocabulary.

- However, no way to convert integer values back to words.

In [12]:
from sklearn.feature_extraction.text import HashingVectorizer

# the single document to encode
text = ["we can encode another document with the same vocabulary."]

# a hashing vectorizer needs no fitting; the vector size of 20 is an arbitrary choice
vectorizer = HashingVectorizer(n_features=20)

# hash the document straight into a fixed-length vector
vector = vectorizer.transform(text)

# shape and dense form of the encoded vector
print(vector.shape, vector.toarray(), sep='\n')

(1, 20)
[[-0.37796447  0.          0.          0.          0.          0.
   0.         -0.37796447  0.          0.          0.          0.37796447
  -0.37796447  0.          0.          0.          0.37796447 -0.37796447
  -0.37796447  0.        ]]


## Preparing Text Data with Keras Library

In [13]:
# split text into words
from keras.preprocessing.text import text_to_word_sequence

# define the document
text = 'Term Weighting Using Term Frequency - Inverse Document Frequency (TF-IDF).'

# tokenize the document into a list of words; as the printed result shows,
# the words come back lower-cased and without punctuation
# (so 'TF-IDF' becomes the two tokens 'tf' and 'idf')
result = text_to_word_sequence(text)

print(result)

Using TensorFlow backend.


['term', 'weighting', 'using', 'term', 'frequency', 'inverse', 'document', 'frequency', 'tf', 'idf']


## One-hot encoding of documents

In [None]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import text_to_word_sequence

# document to encode
text = 'Term Weighting Using Term Frequency - Inverse Document Frequency (TF-IDF).'

# the distinct words of the document give an estimate of the vocabulary size
words = set(text_to_word_sequence(text))
print(words)
vocab_size = len(words)
print(vocab_size)

# size of the integer range handed to one_hot: 30% larger than the vocabulary estimate
hash_space = round(vocab_size*1.3)

# integer encode the document
result = one_hot(text, hash_space)
print(result)

## Hash Encoding

In [15]:
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import text_to_word_sequence

# document to encode
text = 'Term Weighting Using Term Frequency - Inverse Document Frequency (TF-IDF).'

# the distinct words of the document give an estimate of the vocabulary size
words = set(text_to_word_sequence(text))
vocab_size = len(words)
print(vocab_size)

# size of the integer range handed to hashing_trick: 30% larger than the vocabulary estimate
hash_space = round(vocab_size*1.3)

# integer encode the sample document using the md5 hash function
result = hashing_trick(text, hash_space, hash_function='md5')
print(result)

8
[7, 5, 9, 7, 5, 1, 9, 5, 9, 6]


## Keras Tokenizer class for preparing text documents for deep learning

- Suitable for large projects.


In [16]:
from keras.preprocessing.text import Tokenizer

# define sample documents
docs = ["Word frequency information can be easily incorporated, and the vector now becomes a weighted vector.",
        "we can encode another document with the same vocabulary.",
        "Term Weighting Using Term Frequency - Inverse Document Frequency (TF-IDF)."]

# create a tokenizer
tok = Tokenizer()

# fit the tokenizer on the sample documents
tok.fit_on_texts(docs)

# results of applying tokenizer on sample documents

# dictionary of words and their counts
print(tok.word_counts)

# integer count of the total number of documents that were used to fit the Tokenizer
# (prints 3 for the three sample documents)
print(tok.document_count)

# dictionary of words and their uniquely assigned integers
print(tok.word_index)

# dictionary of words and how many documents each appears in
print(tok.word_docs)

# encode the documents as a matrix with one row per document;
# mode='count' requests per-word counts as the cell values
encoded_docs = tok.texts_to_matrix(docs, mode='count')
print(encoded_docs)

OrderedDict([('word', 1), ('frequency', 3), ('information', 1), ('can', 2), ('be', 1), ('easily', 1), ('incorporated', 1), ('and', 1), ('the', 2), ('vector', 2), ('now', 1), ('becomes', 1), ('a', 1), ('weighted', 1), ('we', 1), ('encode', 1), ('another', 1), ('document', 2), ('with', 1), ('same', 1), ('vocabulary', 1), ('term', 2), ('weighting', 1), ('using', 1), ('inverse', 1), ('tf', 1), ('idf', 1)])
3
{'frequency': 1, 'can': 2, 'the': 3, 'vector': 4, 'document': 5, 'term': 6, 'word': 7, 'information': 8, 'be': 9, 'easily': 10, 'incorporated': 11, 'and': 12, 'now': 13, 'becomes': 14, 'a': 15, 'weighted': 16, 'we': 17, 'encode': 18, 'another': 19, 'with': 20, 'same': 21, 'vocabulary': 22, 'weighting': 23, 'using': 24, 'inverse': 25, 'tf': 26, 'idf': 27}
defaultdict(<class 'int'>, {'easily': 1, 'can': 2, 'be': 1, 'and': 1, 'frequency': 2, 'incorporated': 1, 'now': 1, 'a': 1, 'becomes': 1, 'information': 1, 'the': 2, 'word': 1, 'vector': 1, 'weighted': 1, 'we': 1, 'encode': 1, 'same': 1

## Bag of Words (BoW) Model

- Predefined vocabulary.

- Measuring the frequency of vocabulary words in document instances and creating document vectors.

    - Binary scoring
    
    - Frequency/raw counts
    
    - Relative frequency
    
    - Word hashing with frequency counts

    - TF-IDF

- Issues: vocabulary size, sparse representation, context discarded.

- Unigram, bigram, and trigram models.

- A **bag-of-bigrams** representation is much more powerful than **bag-of-words** representation.

## Movie Review Dataset (aka Polarity Dataset)

- 1000 positive and 1000 negative reviews 

- All were written before 2002

- A cap of 20 reviews per author

- 312 authors total per category

- Download the dataset (review_polarity.tar.gz, 3MB)

    - [Download Movie Review Polarity Dataset](http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz)
    
    
- After downloading, unzip the archive; the reviews are in a directory called txt_sentoken

- The subdirectories **pos** and **neg** contain positive and negative reviews


In [5]:
# vocabulary generation for the polarity dataset

import string
import re
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

# extract tokens from review document
def extract_tokens(doc):
    """Turn raw review text into a list of cleaned word tokens.

    A token survives when, after punctuation removal, it is purely
    alphabetic, is not an NLTK English stop word, and is longer than one
    character.

    Args:
        doc: the review text as a single string.

    Returns:
        List of cleaned tokens, in their original order.
    """
    raw_tokens = doc.split()
    # character class matching any single punctuation character
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    english_stop_words = set(stopwords.words('english'))
    kept = []
    for raw in raw_tokens:
        candidate = punctuation_pattern.sub('', raw)
        # drop anything that is not purely alphabetic (including empty strings)
        if not candidate.isalpha():
            continue
        # drop stop words
        if candidate in english_stop_words:
            continue
        # drop one-letter tokens
        if len(candidate) <= 1:
            continue
        kept.append(candidate)
    return kept


# load a movie review into memory
def load_review(filename):
    """Read a review file and return its whole content as one string.

    Args:
        filename: path of the text file to read.

    Returns:
        The full text of the file.
    """
    # the with-block closes the file even if read() raises
    with open(filename, 'r') as file:
        return file.read()


# extract tokens from review document and add to vocabulary
def add_doc_to_vocab(filename, vocab):
    """Load one review, clean it, and feed its tokens into ``vocab``.

    Args:
        filename: path of the review file.
        vocab: object with an ``update`` method taking the token list;
            it is modified in place.
    """
    review_tokens = extract_tokens(load_review(filename))
    vocab.update(review_tokens)

# load all reviews in a directory
def process_docs(directory, vocab):
    """Add the tokens of every ``.txt`` file in ``directory`` to ``vocab``.

    Args:
        directory: folder containing the review files.
        vocab: vocabulary object updated in place by add_doc_to_vocab.
    """
    # traverse files in the folder
    for filename in listdir(directory):
        # skip files that do not have the right extension
        # (`continue` moves to the next file; a bare `next` is a no-op
        # expression and would let the file be processed anyway)
        if not filename.endswith(".txt"):
            continue
        # full path of the file to open
        path = directory + '/' + filename
        # add review to vocabulary
        add_doc_to_vocab(path, vocab)

# save list to a file
def save_list(lines, filename):
    """Write ``lines`` to ``filename``, one item per line.

    Args:
        lines: iterable of strings; joined with newline characters
            (no trailing newline is added).
        filename: path of the file to create or overwrite.
    """
    data = '\n'.join(lines)
    # the with-block closes (and flushes) the file even if write() raises
    with open(filename, 'w') as file:
        file.write(data)

# define vocabulary
# a Counter, so vocab.update(tokens) accumulates a count per token
vocab = Counter()

# extract vocabulary from all movie reviews
# positive reviews
process_docs('txt_sentoken/pos', vocab)
# negative reviews
process_docs('txt_sentoken/neg', vocab)

# print the size of the vocab
print(len(vocab))

# print the top words in the vocab
print(vocab.most_common(50))

# keep tokens that occur at least 5 times (the comparison below is >=, not >)
min_occurane = 5
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))

# save tokens to a vocabulary file
save_list(tokens, 'vocabulary.txt')

46557
[('film', 8860), ('one', 5521), ('movie', 5440), ('like', 3553), ('even', 2555), ('good', 2320), ('time', 2283), ('story', 2118), ('films', 2102), ('would', 2042), ('much', 2024), ('also', 1965), ('characters', 1947), ('get', 1921), ('character', 1906), ('two', 1825), ('first', 1768), ('see', 1730), ('well', 1694), ('way', 1668), ('make', 1590), ('really', 1563), ('little', 1491), ('life', 1472), ('plot', 1451), ('people', 1420), ('movies', 1416), ('could', 1395), ('bad', 1374), ('scene', 1373), ('never', 1364), ('best', 1301), ('new', 1277), ('many', 1268), ('doesnt', 1267), ('man', 1266), ('scenes', 1265), ('dont', 1210), ('know', 1207), ('hes', 1150), ('great', 1141), ('another', 1111), ('love', 1089), ('action', 1078), ('go', 1075), ('us', 1065), ('director', 1056), ('something', 1048), ('end', 1047), ('still', 1038)]
14803


In [None]:
# load vocabulary and process positive and negative reviews

import string
import re
from os import listdir
from nltk.corpus import stopwords

# load a document
def load_doc(filename):
    """Return the complete text of the file at ``filename``.

    Args:
        filename: path of the text file to read.

    Returns:
        The file content as a single string.
    """
    # the context manager releases the handle even when read() fails
    with open(filename, 'r') as file:
        text = file.read()
    return text

# save list to a file
def save_list(lines, filename):
    """Store the strings in ``lines`` in a text file, newline-separated.

    Args:
        lines: iterable of strings to write.
        filename: destination path; an existing file is overwritten.
    """
    data = '\n'.join(lines)
    # the context manager guarantees the file is closed after writing
    with open(filename, 'w') as file:
        file.write(data)

# extract tokens from review document and add to vocabulary
def add_doc_to_vocab(filename, vocab):
    """Load one review, clean it, and add its tokens to ``vocab`` in place.

    Kept for callers that build a vocabulary; process_docs below does not
    use it because it filters by the vocabulary instead of extending it.

    Args:
        filename: path of the review file.
        vocab: object with an ``update`` method accepting the token list.
    """
    # load doc (load_doc is the loader defined in this cell)
    doc = load_doc(filename)
    # extract tokens; extract_tokens is defined in the vocabulary
    # generation cell earlier in the notebook
    tokens = extract_tokens(doc)
    # update counts
    vocab.update(tokens)


# load a review, clean it, and keep only the vocabulary words
def doc_to_line(filename, vocab):
    """Return one review as a single space-separated line of vocabulary words.

    Args:
        filename: path of the review file.
        vocab: container of accepted words (only membership tests are used).

    Returns:
        The cleaned review tokens that occur in ``vocab``, joined by spaces.
    """
    tokens = extract_tokens(load_doc(filename))
    return ' '.join(w for w in tokens if w in vocab)


# load all reviews in a directory
def process_docs(directory, vocab):
    """Clean every ``.txt`` review in ``directory`` against the vocabulary.

    ``vocab`` is only read, never modified, so the vocabulary loaded from
    file keeps acting as a filter.

    Args:
        directory: folder containing the review files.
        vocab: container of accepted words.

    Returns:
        List with one cleaned, space-separated line per review, suitable
        for save_list.
    """
    lines = []
    # traverse files in the folder
    for filename in listdir(directory):
        # skip files that do not have the right extension
        if not filename.endswith(".txt"):
            continue
        # full path of the file to open
        path = directory + '/' + filename
        # clean the review and collect it
        lines.append(doc_to_line(path, vocab))
    return lines
    
    
# read the vocabulary file back in as one string
filename = 'vocabulary.txt'
vocabulary = load_doc(filename)

# extract individual vocabulary items
# split() breaks the text on whitespace; the set removes duplicates
vocab = vocabulary.split()
vocab = set(vocab)

# process positive reviews using the vocabulary
# NOTE(review): save_list joins its first argument with '\n', so
# process_docs has to return a list of strings for these calls to work
positive_reviews = process_docs('txt_sentoken/pos', vocab)
save_list(positive_reviews, 'positive.txt')

# process negative reviews using the vocabulary
negative_reviews = process_docs('txt_sentoken/neg', vocab)
save_list(negative_reviews, 'negative.txt')


In [None]:
# to be deleted later



## Sentiment Analysis Using BoW Model

- We will use the last 100 positive reviews and the last 100 negative reviews as a test set.

- The remaining 1800 reviews will be used as the training dataset.

- Reviews named 000 to 899 will be used for training data and reviews named 900 and above for test data.

- In the following we reproduce the above code with a minor change: the test data is skipped during vocabulary generation.

In [None]:
# vocabulary generation for the polarity dataset, skip test data

import string
import re
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

# extract tokens from review document
def extract_tokens(doc):
    """Clean a review and return its usable word tokens.

    The text is split on whitespace; punctuation characters are deleted
    from each piece; pieces that are not purely alphabetic, that are NLTK
    English stop words, or that are a single character long are dropped.

    Args:
        doc: review text as one string.

    Returns:
        List of the remaining tokens in document order.
    """
    pieces = doc.split()
    # pattern matching any one punctuation character
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    without_punct = (punct.sub('', piece) for piece in pieces)
    alphabetic = [piece for piece in without_punct if piece.isalpha()]
    stop_words = set(stopwords.words('english'))
    # stop-word removal and the minimum-length rule in one pass
    return [piece for piece in alphabetic
            if piece not in stop_words and len(piece) > 1]

# load a movie review into memory
def load_review(filename):
    """Load the text of one review.

    Args:
        filename: path of the review file.

    Returns:
        Everything in the file, as a single string.
    """
    # opened read-only; the with-block closes the file on every exit path
    with open(filename, 'r') as file:
        text = file.read()
    return text

# extract tokens from review document and add to vocabulary
def add_doc_to_vocab(filename, vocab):
    """Load one review, clean it, and add its tokens to ``vocab`` in place.

    Args:
        filename: path of the review file.
        vocab: object with an ``update`` method accepting the token list.
    """
    # load doc with load_review, the loader this cell defines
    # (load_doc exists only in a different cell)
    doc = load_review(filename)
    # extract tokens
    tokens = extract_tokens(doc)
    # update counts
    vocab.update(tokens)

# load all reviews in a directory
def process_docs(directory, vocab):
    """Add every non-test review found in ``directory`` to ``vocab``.

    Files whose name starts with ``cv9`` are treated as the test split and
    are left out.

    Args:
        directory: folder containing the review files.
        vocab: vocabulary object updated in place by add_doc_to_vocab.
    """
    training_files = (name for name in listdir(directory)
                      if not name.startswith('cv9'))
    for name in training_files:
        add_doc_to_vocab(directory + '/' + name, vocab)

# save list to a file
def save_list(lines, filename):
    """Save ``lines`` to disk with a newline between consecutive items.

    Args:
        lines: iterable of strings.
        filename: path of the output file (overwritten if it exists).
    """
    data = '\n'.join(lines)
    # closing is handled by the with-block, including on write errors
    with open(filename, 'w') as file:
        file.write(data)

# define vocabulary
# a Counter, so vocab.update(tokens) accumulates a count per token
vocab = Counter()

# extract vocabulary from the movie reviews
# (process_docs in this cell skips files whose name starts with 'cv9')
# positive reviews
process_docs('txt_sentoken/pos', vocab)
# negative reviews
process_docs('txt_sentoken/neg', vocab)

# print the size of the vocab
print(len(vocab))

# print the top words in the vocab
print(vocab.most_common(50))

# keep tokens that occur at least 5 times (the comparison below is >=, not >)
min_occurrence = 5
tokens = [k for k,c in vocab.items() if c >= min_occurrence]
print(len(tokens))

# save tokens to a vocabulary file
save_list(tokens, 'vocabulary.txt')

## BoW Representation for Movie Review Dataset

- Each document will be represented as a vector


In [None]:
# process both positive and negative reviews in the training dataset 
# using the vocabulary developed earlier

import string
import re
from os import listdir
from nltk.corpus import stopwords

# load a movie review into memory
def load_review(filename):
    """Return the text stored in ``filename``.

    Args:
        filename: path of the file to read.

    Returns:
        The file content as one string.
    """
    # with-statement: the handle is closed even if reading fails
    with open(filename, 'r') as file:
        return file.read()

# extract tokens from review document
def extract_tokens(doc):
    """Split a review into words and keep only the informative ones.

    Processing order: whitespace split, punctuation removal, alphabetic
    filter, NLTK English stop-word removal, removal of one-character
    tokens.

    Args:
        doc: review text as a single string.

    Returns:
        The filtered tokens as a list, in document order.
    """
    # every punctuation character is matched by this one character class
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))

    def strip_punctuation(word):
        return re_punc.sub('', word)

    cleaned = list(map(strip_punctuation, doc.split()))
    cleaned = list(filter(str.isalpha, cleaned))
    stop_words = set(stopwords.words('english'))

    def is_informative(word):
        # not a stop word and longer than a single character
        return word not in stop_words and len(word) > 1

    return list(filter(is_informative, cleaned))

# load a review and clean it
# return only those tokens that are also in the vocabulary
def review_to_vocab(filename, vocab):
    """Reduce one review file to its in-vocabulary words.

    Args:
        filename: path of the review file.
        vocab: container of accepted words (membership tests only).

    Returns:
        A single string holding the cleaned tokens found in ``vocab``,
        separated by spaces.
    """
    cleaned = extract_tokens(load_review(filename))
    return ' '.join(token for token in cleaned if token in vocab)

# process all reviews in a directory
def process_reviews(directory, vocab):
    """Clean every training review in ``directory``.

    Files whose name starts with ``cv9`` belong to the test set and are
    skipped.

    Args:
        directory: folder containing the review files.
        vocab: container of accepted words, passed on to review_to_vocab.

    Returns:
        List with one cleaned line per processed review.
    """
    return [review_to_vocab(directory + '/' + name, vocab)
            for name in listdir(directory)
            if not name.startswith('cv9')]

# load and process a dataset
def process_dataset(vocab):
    """Load the cleaned training reviews together with their class labels.

    Args:
        vocab: container of accepted words, passed on to process_reviews.

    Returns:
        Tuple ``(docs, labels)``: ``docs`` lists the positive reviews
        followed by the negative ones, and ``labels`` holds 1 for each
        positive review and 0 for each negative review, in the same order.
    """
    # load documents
    # positive reviews
    positive = process_reviews('txt_sentoken/pos', vocab)
    # negative reviews
    negative = process_reviews('txt_sentoken/neg', vocab)
    docs = positive + negative
    # prepare class labels - 0 for negative review and 1 for positive review;
    # the order must mirror docs (positives first), otherwise every label
    # would be attached to a review of the opposite class
    labels = [1] * len(positive) + [0] * len(negative)
    return docs, labels

# read the saved vocabulary and turn it into a set of words
filename = 'vocabulary.txt'
vocabulary = load_review(filename)
vocab = set(vocabulary.split())

# cleaned training reviews and their class labels
docs, labels = process_dataset(vocab)

# number of documents and number of labels
print(len(docs), len(labels))

## A Shortcut to Converting Movie Reviews to BoW Vectors

- Keras API provides Tokenizer class to transform documents into encoded vectors.

- We will create a Tokenizer and fit it on the training dataset.

In [None]:
# transform documents into encoded vectors using Tokenizer class

import string
import re
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer

# load a movie review into memory
def load_review(filename):
    """Read one file completely and hand back its text.

    Args:
        filename: path of the file to read.

    Returns:
        The text of the file as a single string.
    """
    # read-only open inside a with-block so the file is always closed
    with open(filename, 'r') as file:
        text = file.read()
    return text

# extract tokens from review document
def extract_tokens(doc):
    """Produce the cleaned token list for one review.

    Args:
        doc: review text as a single string.

    Returns:
        Tokens that, once punctuation is removed, are alphabetic, are not
        NLTK English stop words, and have more than one character.
    """
    words = doc.split()
    # one regular expression covering every punctuation character
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    stop_words = set(stopwords.words('english'))
    result = []
    for word in words:
        word = punctuation.sub('', word)
        # a token is kept only when it passes all three filters
        if word.isalpha() and word not in stop_words and len(word) > 1:
            result.append(word)
    return result

# load a review and clean it
# return only those tokens that are also in the vocabulary
def review_to_vocab(filename, vocab):
    """Load one review, clean it, and keep only in-vocabulary tokens.

    Args:
        filename: Path of the review file.
        vocab: Collection of tokens that may be kept.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = extract_tokens(load_review(filename))
    return ' '.join(token for token in cleaned if token in vocab)

# process all reviews in a directory
def process_reviews(directory, vocab, is_train):
    """Clean every review in a directory that belongs to the requested split.

    Files whose name starts with 'cv9' make up the test split; all other
    files make up the training split.

    Args:
        directory: Directory holding the review files.
        vocab: Collection of tokens that may be kept.
        is_train: Truthy to process the training split, falsy for the test split.

    Returns:
        List with one cleaned, space-joined review string per selected file.
    """
    want_test_split = not is_train
    lines = []
    for name in listdir(directory):
        in_test_split = name.startswith('cv9')
        # skip files that belong to the other split
        if in_test_split != want_test_split:
            continue
        lines.append(review_to_vocab(directory + '/' + name, vocab))
    return lines

# load and process a dataset
def process_dataset(vocab, is_train):
    """Load the positive and negative reviews of one split with class labels.

    Args:
        vocab: Collection of tokens that may be kept in a review.
        is_train: Truthy for the training split, falsy for the test split
            (forwarded to process_reviews).

    Returns:
        Tuple (docs, labels). docs holds the positive reviews followed by the
        negative reviews; labels is aligned with docs and holds 1 for a
        positive review and 0 for a negative review.
    """
    # positive reviews
    positive = process_reviews('txt_sentoken/pos', vocab, is_train)
    # negative reviews
    negative = process_reviews('txt_sentoken/neg', vocab, is_train)
    docs = positive + negative
    # labels must follow the same order as docs: positives first, then negatives
    labels = [1] * len(positive) + [0] * len(negative)
    return docs, labels

# fit a tokenizer
def create_tokenizer(lines):
    """Create a Tokenizer and fit it on the given documents.

    Args:
        lines: Documents (strings) passed to Tokenizer.fit_on_texts.

    Returns:
        The fitted Tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# load vocabulary (load_review is the file reader defined in this cell)
filename = 'vocabulary.txt'
vocabulary = load_review(filename)
vocab = set(vocabulary.split())

# load the training and test splits of the reviews
train_docs, ytrain = process_dataset(vocab, True)
test_docs, ytest = process_dataset(vocab, False)

# create the tokenizer, fitted on the training documents only
tokenizer = create_tokenizer(train_docs)

# encode both splits as bag-of-words matrices using the 'freq' scoring mode
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')

print(Xtrain.shape, Xtest.shape)


## MLP for Sentiment Analysis of Movie Review Database

In [None]:
import string
import re
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense

# load a movie review into memory
def load_review(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.
    """
    # the context manager closes the file even when read() raises
    with open(filename, 'r') as file:
        return file.read()

# extract tokens from review document
def extract_tokens(doc):
    """Split a review into cleaned word tokens.

    Each whitespace-separated token has its punctuation characters removed.
    It is kept only if what remains is purely alphabetic, is not an English
    stop word, and is longer than one character.

    Args:
        doc: Raw review text.

    Returns:
        List of cleaned tokens in their original order.
    """
    # character class matching any single punctuation character
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    ignored = set(stopwords.words('english'))
    kept = []
    for raw in doc.split():
        word = punctuation.sub('', raw)
        # drop anything that is not purely alphabetic once punctuation is gone
        if not word.isalpha():
            continue
        # drop stop words and single-character leftovers
        if word in ignored or len(word) <= 1:
            continue
        kept.append(word)
    return kept

# load a review and clean it
# return only those tokens that are also in the vocabulary
def review_to_vocab(filename, vocab):
    """Load one review, clean it, and keep only in-vocabulary tokens.

    Args:
        filename: Path of the review file.
        vocab: Collection of tokens that may be kept.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = extract_tokens(load_review(filename))
    return ' '.join(token for token in cleaned if token in vocab)

# process all reviews in a directory
def process_reviews(directory, vocab, is_train):
    """Clean every review in a directory that belongs to the requested split.

    Files whose name starts with 'cv9' make up the test split; all other
    files make up the training split.

    Args:
        directory: Directory holding the review files.
        vocab: Collection of tokens that may be kept.
        is_train: Truthy to process the training split, falsy for the test split.

    Returns:
        List with one cleaned, space-joined review string per selected file.
    """
    want_test_split = not is_train
    lines = []
    for name in listdir(directory):
        in_test_split = name.startswith('cv9')
        # skip files that belong to the other split
        if in_test_split != want_test_split:
            continue
        lines.append(review_to_vocab(directory + '/' + name, vocab))
    return lines

# load and process a dataset
def process_dataset(vocab, is_train):
    """Load the positive and negative reviews of one split with class labels.

    Args:
        vocab: Collection of tokens that may be kept in a review.
        is_train: Truthy for the training split, falsy for the test split
            (forwarded to process_reviews).

    Returns:
        Tuple (docs, labels). docs holds the positive reviews followed by the
        negative reviews; labels is aligned with docs and holds 1 for a
        positive review and 0 for a negative review.
    """
    # positive reviews
    positive = process_reviews('txt_sentoken/pos', vocab, is_train)
    # negative reviews
    negative = process_reviews('txt_sentoken/neg', vocab, is_train)
    docs = positive + negative
    # labels must follow the same order as docs: positives first, then negatives
    labels = [1] * len(positive) + [0] * len(negative)
    return docs, labels

# fit a tokenizer
def create_tokenizer(lines):
    """Create a Tokenizer and fit it on the given documents.

    Args:
        lines: Documents (strings) passed to Tokenizer.fit_on_texts.

    Returns:
        The fitted Tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# load vocabulary (load_review is the file reader defined in this cell)
filename = 'vocabulary.txt'
vocabulary = load_review(filename)
vocab = set(vocabulary.split())

# load the training and test splits of the reviews
train_docs, ytrain = process_dataset(vocab, True)
test_docs, ytest = process_dataset(vocab, False)

# create the tokenizer, fitted on the training documents only
tokenizer = create_tokenizer(train_docs)

# encode both splits as bag-of-words matrices using the 'freq' scoring mode
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')

# define MLP model
def define_model(n_words):
    """Build, compile, summarize and plot the MLP used for classification.

    The network has one hidden Dense layer of 50 ReLU units and a single
    sigmoid output unit. It is compiled with binary cross-entropy loss and
    the Adam optimizer, and tracks accuracy.

    Args:
        n_words: Width of the input vectors.

    Returns:
        The compiled model.
    """
    layers = [
        Dense(50, input_shape=(n_words,), activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # print the layer summary and write the architecture diagram to model.png
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    return model


# size the input layer from the number of columns in the encoded matrix
n_words = Xtest.shape[1]
model = define_model(n_words)

# train on the training split, logging one line per epoch
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

# score the trained model on the held-out test split
loss, acc = model.evaluate(Xtest, ytest, verbose=0)

# report accuracy as a percentage
accuracy_percent = acc * 100
print('Test Accuracy: %f' % accuracy_percent)

## Comparing Word Scoring Methods

- **binary** (a word is present or absent)

- **count** (occurrence count of words)

- **freq** (frequency of occurrence within a document)

- **tfidf** (Term Frequency - Inverse Document Frequency)


In [None]:
import string
import re
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from pandas import DataFrame
from matplotlib import pyplot

# load a movie review into memory
def load_review(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.
    """
    # the context manager closes the file even when read() raises
    with open(filename, 'r') as file:
        return file.read()

# extract tokens from review document
def extract_tokens(doc):
    """Split a review into cleaned word tokens.

    Each whitespace-separated token has its punctuation characters removed.
    It is kept only if what remains is purely alphabetic, is not an English
    stop word, and is longer than one character.

    Args:
        doc: Raw review text.

    Returns:
        List of cleaned tokens in their original order.
    """
    # character class matching any single punctuation character
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    ignored = set(stopwords.words('english'))
    kept = []
    for raw in doc.split():
        word = punctuation.sub('', raw)
        # drop anything that is not purely alphabetic once punctuation is gone
        if not word.isalpha():
            continue
        # drop stop words and single-character leftovers
        if word in ignored or len(word) <= 1:
            continue
        kept.append(word)
    return kept

# load a review and clean it
# return only those tokens that are also in the vocabulary
def review_to_vocab(filename, vocab):
    """Load one review, clean it, and keep only in-vocabulary tokens.

    Args:
        filename: Path of the review file.
        vocab: Collection of tokens that may be kept.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = extract_tokens(load_review(filename))
    return ' '.join(token for token in cleaned if token in vocab)

# process all reviews in a directory
def process_reviews(directory, vocab, is_train):
    """Clean every review in a directory that belongs to the requested split.

    Files whose name starts with 'cv9' make up the test split; all other
    files make up the training split.

    Args:
        directory: Directory holding the review files.
        vocab: Collection of tokens that may be kept.
        is_train: Truthy to process the training split, falsy for the test split.

    Returns:
        List with one cleaned, space-joined review string per selected file.
    """
    want_test_split = not is_train
    lines = []
    for name in listdir(directory):
        in_test_split = name.startswith('cv9')
        # skip files that belong to the other split
        if in_test_split != want_test_split:
            continue
        lines.append(review_to_vocab(directory + '/' + name, vocab))
    return lines

# load and process a dataset
def process_dataset(vocab, is_train):
    """Load the positive and negative reviews of one split with class labels.

    Args:
        vocab: Collection of tokens that may be kept in a review.
        is_train: Truthy for the training split, falsy for the test split
            (forwarded to process_reviews).

    Returns:
        Tuple (docs, labels). docs holds the positive reviews followed by the
        negative reviews; labels is aligned with docs and holds 1 for a
        positive review and 0 for a negative review.
    """
    # positive reviews
    positive = process_reviews('txt_sentoken/pos', vocab, is_train)
    # negative reviews
    negative = process_reviews('txt_sentoken/neg', vocab, is_train)
    docs = positive + negative
    # labels must follow the same order as docs: positives first, then negatives
    labels = [1] * len(positive) + [0] * len(negative)
    return docs, labels


# define MLP model
def define_model(n_words):
    """Build and compile the MLP used for classification.

    The network has one hidden Dense layer of 50 ReLU units and a single
    sigmoid output unit. It is compiled with binary cross-entropy loss and
    the Adam optimizer, and tracks accuracy.

    Args:
        n_words: Width of the input vectors.

    Returns:
        The compiled model.
    """
    layers = [
        Dense(50, input_shape=(n_words,), activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# evaluate the MLP model
def evaluate_model(Xtrain, ytrain, Xtest, ytest, repeats=10, epochs=10):
    """Repeatedly train a new MLP and collect its accuracy on the test data.

    Args:
        Xtrain: Encoded training documents.
        ytrain: Labels for Xtrain.
        Xtest: Encoded test documents; its second dimension sets the model's
            input width.
        ytest: Labels for Xtest.
        repeats: Number of independent train/evaluate runs (default 10).
        epochs: Training epochs per run (default 10).

    Returns:
        List with one test accuracy per run, in run order.
    """
    scores = []
    n_words = Xtest.shape[1]
    for run in range(1, repeats + 1):
        # a new model is built for every run so the runs do not share a model
        model = define_model(n_words)
        model.fit(Xtrain, ytrain, epochs=epochs, verbose=0)
        # evaluate() returns (loss, accuracy); only the accuracy is kept
        _, acc = model.evaluate(Xtest, ytest, verbose=0)
        scores.append(acc)
        print('%d accuracy: %s' % (run, acc))
    return scores

# prepare BoW encoding of reviews
def prepare_reviews(train_docs, test_docs, mode):
    """Encode the training and test documents as bag-of-words matrices.

    A Tokenizer is fitted on the training documents only and then used to
    encode both sets with the requested scoring mode.

    Args:
        train_docs: Training documents.
        test_docs: Test documents.
        mode: Scoring mode forwarded to Tokenizer.texts_to_matrix.

    Returns:
        Tuple (Xtrain, Xtest) of encoded matrices.
    """
    encoder = Tokenizer()
    encoder.fit_on_texts(train_docs)
    return (
        encoder.texts_to_matrix(train_docs, mode=mode),
        encoder.texts_to_matrix(test_docs, mode=mode),
    )

# load vocabulary (load_review is the file reader defined in this cell)
filename = 'vocabulary.txt'
vocabulary = load_review(filename)
vocab = set(vocabulary.split())

# load the training and test splits of the reviews
train_docs, ytrain = process_dataset(vocab, True)
test_docs, ytest = process_dataset(vocab, False)

# test with different word scoring schemes
modes = ['binary', 'count', 'tfidf', 'freq']
results = DataFrame()

for mode in modes:
    # encode both splits with the current scoring scheme
    Xtrain, Xtest = prepare_reviews(train_docs, test_docs, mode)
    # one column of repeated-run accuracies per scoring scheme
    results[mode] = evaluate_model(Xtrain, ytrain, Xtest, ytest)

# summarize results
print(results.describe())

# plot results
results.boxplot()
pyplot.show()


## The Word Embedding Model

- Words with similar meanings have similar representations.

- These are **distributed representations**.

- The number of **features** is much smaller than the size of the vocabulary. Therefore, **dense representation**.

- Individual elements in a vector are not mutually exclusive. 

- Elements come together to represent concepts. 

- Each configuration of the vector represents a different concept.

- Each word is represented by a point in the embedding space.

- Words with similar meanings are locally clustered within the space.



## Word Embedding Algorithms

- Embedding Layer

- Word2Vec (CBoW and SkipGram)

- Global Vectors for Word Representation (GloVe)


## Learning a Word Embedding

- Embedding is learned in isolation and used in several models

- Embedding is learned for a specific task/model


## Reusing an Embedding

- Static (a pre-trained embedding does not change over the life of a model).

- Updated (a pre-trained embedding is used to seed the model, but the embedding is updated during the training of the model).

- **Word2Vec** and **GloVe** embeddings are available for free download.

## GenSim Python Library

- Gensim is a suite of NLP tools for topic modeling.

- Gensim supports an implementation of the Word2Vec word embedding.

- **sudo pip install -U gensim**

- Two training algorithms for learning embeddings from text:

    - Continuous Bag-of-Words (CBOW) 
    
    - Skipgrams
    
- These algorithms consider a window of words for each target word to provide context.

- GenSim provides the **Word2Vec** class


In [None]:
# an example to illustrate learning word embeddings using GenSim

from gensim.models import Word2Vec

# training data: a list of pre-tokenized sentences
sentences = [['from', 'a', 'broader', 'perspective', 'there', 'are', 'three', 'classes', 'of', 'languages', 'spoken', 'written', 'and', 'sign'],
            ['it', 'is', 'believed', 'that', 'spoken', 'languages', 'preceded', 'the', 'development', 'of', 'written', 'languages', 'and', 'sign', 'languages', 'came', 'much', 'later'],
            ['language', 'is', 'social', 'as', 'much', 'as', 'individual'],
            ['the', 'social', 'aspect', 'is', 'for', 'communication', 'whereas', 'the', 'individual', 'aspect', 'is', 'a', 'medium', 'for', 'thought'],
            ['also', 'whether', 'thought', 'is', 'independent', 'of', 'language', 'is', 'a', 'philosophical', 'question'],
            ['some', 'believe', 'that', 'language', 'and', 'culture', 'are', 'intertwined']]

# train the embedding model; min_count=1 keeps words that occur only once
model = Word2Vec(sentences, min_count=1)

# summarize the model
print(model)

# summarize model vocabulary (word vectors are accessed via model.wv)
# NOTE(review): gensim >= 4.0 exposes the vocabulary as `wv.key_to_index` instead
# of `wv.vocab` -- confirm against the installed gensim version
words = list(model.wv.vocab)
print(words)

# access vector for one word; index model.wv, since indexing the model itself is deprecated
print(model.wv['language'])

# save model to disk
model.save('Word2Vec.bin')

# load model
new_model = Word2Vec.load('Word2Vec.bin')
print(new_model)

In [None]:
# visualize word embeddings

from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot

# training data: a list of pre-tokenized sentences
sentences = [['from', 'a', 'broader', 'perspective', 'there', 'are', 'three', 'classes', 'of', 'languages', 'spoken', 'written', 'and', 'sign'],
            ['it', 'is', 'believed', 'that', 'spoken', 'languages', 'preceded', 'the', 'development', 'of', 'written', 'languages', 'and', 'sign', 'languages', 'came', 'much', 'later'],
            ['language', 'is', 'social', 'as', 'much', 'as', 'individual'],
            ['the', 'social', 'aspect', 'is', 'for', 'communication', 'whereas', 'the', 'individual', 'aspect', 'is', 'a', 'medium', 'for', 'thought'],
            ['also', 'whether', 'thought', 'is', 'independent', 'of', 'language', 'is', 'a', 'philosophical', 'question'],
            ['some', 'believe', 'that', 'language', 'and', 'culture', 'are', 'intertwined']]

# train the embedding model; min_count=1 keeps words that occur only once
model = Word2Vec(sentences, min_count=1)

# fit a 2D PCA model to the vectors
# the word list is taken once so row i of X and of result belongs to words[i]
# NOTE(review): gensim >= 4.0 exposes the vocabulary as `wv.key_to_index` instead
# of `wv.vocab` -- confirm against the installed gensim version
words = list(model.wv.vocab)
# index model.wv, since indexing the model itself is deprecated
X = model.wv[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])

# label every point with its word
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))

pyplot.show()

## Google's Word2Vec Embedding

- The pre-trained Google Word2Vec model was trained on Google news data (about 100
billion words)

- It contains 3 million words and phrases and was fit using 300-dimensional word
vectors. 

- It is a 1.53 GB file (3.4 GB when unzipped).

- Download it from [here](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing)


## Arithmetic with Google Word Vectors

In [None]:
from gensim.models import KeyedVectors

# read the pre-trained Google News vectors from the binary word2vec file
filename = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

# analogy query: (king - man) + woman, keeping only the single best match
result = model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)
print(result)