<a href="https://colab.research.google.com/github/AshwinDeshpande96/Speech-Generation/blob/master/biLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from __future__ import print_function
#1.2.
from google.colab import drive
import codecs
#1.3.1.
from nltk.corpus import state_union
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
#1.3.2.
from nltk.corpus import stopwords
#1.3.3.
from nltk.stem import PorterStemmer
#1.3.4.
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
#1.4.
import numpy as np
from keras.utils.np_utils import to_categorical
#1.5.
import h5py
#2.1.
from keras.layers import Input
from keras.layers import LSTM
from keras.layers import Flatten
from keras.layers import concatenate
from keras.layers import Dense
from keras.models import Model
#2.2.
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.callbacks import ReduceLROnPlateau

# 1. Preprocessing

## 1.1. Collection of resources

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
stop_words = set(stopwords.words("english"))
porterStemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

## 1.2. Import Text File

In [16]:
#drive.mount('/content/gdrive', force_remount=True)
drive.mount('/content/gdrive')

president = 'lbjohnson'
file_path = '/content/gdrive/My Drive/Projects/NLP/President Speech/' +president+'_all.txt'

raw_text = open(file_path).read()
aw_text = raw_text.lower()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## 1.3. Corpus Filtering

### 1.3.1. Tokenize

In [17]:
words = word_tokenize(raw_text)
print("Number of tokens in text: ", len(words))

Number of tokens in text:  282050


In [18]:
tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(raw_text)
print("Number of tokens in text: ", len(words))

Number of tokens in text:  253825


### 1.3.2. Stop Word Removal

In [19]:
#word_tokenizer
words_stop  = [w for w in words if w not in stop_words]
print("Number of tokens remaining: ", len(words_stop))

Number of tokens remaining:  133963


### 1.3.3. Stemming

In [11]:
#word_tokenizer
words_stem = [porterStemmer.stem(w) for w in words_stop]
print("Stemmed Word Sample: ", words_stem[:10])

Stemmed Word Sample:  [u'honor', 'To', u'henri', 'clayon', 'fourth', 'day', u'juli', '1776', u'peopl', u'feebl']


### 1.3.4. Lemmatization

https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

In [20]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


words_lemma = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words_stop]
print('Lemmatized Word Sample: %s\nSize: %d'%(words_lemma[:10], len(words_lemma)))

Lemmatized Word Sample: ['Mr', 'Speaker', 'Mr', 'President', 'Members', 'Congress', 'fellow', 'AmericansFor', 'sixth', 'last']
Size: 133963


## 1.4. Build Dataset

In [21]:
vocabulary = np.unique(sorted(words_lemma)).astype('str')
word_to_int = dict((str(word), i) for i, word  in enumerate(vocabulary))
int_to_word = dict((i, str(word)) for i, word  in enumerate(vocabulary))
num_words = len(words_lemma)
vocab_size = len(vocabulary)
print("Number of words: %d\nVocabulary Size: %d"%(num_words, vocab_size))

Number of words: 133963
Vocabulary Size: 8073


In [23]:
n = 10
X_left = []
X_right = []
Y = []
for i in range(n, num_words - (n+1), 1):
    left_in = words_lemma[i-n:i]
    right_in = words_lemma[i+1 : i+n+1]
    out = words_lemma[i]
    X_left.append([word_to_int[str(w)] for w in left_in])
    X_right.append([word_to_int[str(w)] for w in right_in])
    Y.append(word_to_int[str(out)])

n_patterns_left = len(X_left)
n_patterns_right = len(X_right)
print("Left #Patterns: %d\tX_left shape: %s"%(n_patterns_left, np.array(X_left).shape,))
print("Right #Patterns: %d\tX_right shape: %s"%(n_patterns_right, np.array(X_right).shape,))


##############################################################################################################################

# reshape X to be [samples, time steps, features]
X_left = np.reshape(X_left, (n_patterns_left, n, 1))
X_right = np.reshape(X_right, (n_patterns_right, n, 1))
# normalize
X_left = X_left / float(num_words)
X_right = X_right / float(num_words)
print("X_left shape: %s\nX_right shape: %s"%(X_left.shape, X_right.shape))
# one hot encode the output variable
Y = to_categorical(Y, num_classes=vocab_size, dtype='float32')
print ("Y shape: ", Y.shape)

Left #Patterns: 133942	X_left shape: (133942, 10)
Right #Patterns: 133942	X_right shape: (133942, 10)
X_left shape: (133942, 10, 1)
X_right shape: (133942, 10, 1)
Y shape:  (133942, 8073)


## 1.5. Save

In [0]:
left = h5py.File("/content/gdrive/My Drive/Projects/NLP/President Speech/left.hdf5", "w")
right = h5py.File("/content/gdrive/My Drive/Projects/NLP/President Speech/right.hdf5", "w")
out = h5py.File("/content/gdrive/My Drive/Projects/NLP/President Speech/out.hdf5", "w")

left.create_dataset('dataset_left', data=X_left)
right.create_dataset('dataset_right', data=X_right)
out.create_dataset('dataset_out', data=Y)

left.close()
right.close()
out.close()

# 2. Import Data

In [0]:
left = h5py.File("/content/gdrive/My Drive/Projects/NLP/President Speech/left.hdf5", "r")
right = h5py.File("/content/gdrive/My Drive/Projects/NLP/President Speech/right.hdf5", "r")
out = h5py.File("/content/gdrive/My Drive/Projects/NLP/President Speech/out.hdf5", "r")

In [0]:
X_left = np.array(left.get('dataset_left'))
X_right = np.array(right.get('dataset_right'))
Y = np.array(out.get('dataset_out'))

# 3. Network

## 3.1. Network Modelling

In [10]:
inp_left = Input(shape=(X_left.shape[1], X_left.shape[2]), name='input_left')
inp_right = Input(shape=(X_right.shape[1], X_right.shape[2]), name='input_right')

left = LSTM(100, return_sequences=True, name='lstm_left')(inp_left)
right = LSTM(100, return_sequences=True, name='lstm_right')(inp_right)

a = concatenate([left, right], axis=2, name='a')
flat = Flatten(name='flat')(a)
dense = Dense(100, activation='relu', name='dense')(flat)
output = Dense(Y.shape[1], activation='softmax')(dense)
model = Model(inputs=[inp_left, inp_right], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_left (InputLayer)         (None, 10, 1)        0                                            
__________________________________________________________________________________________________
input_right (InputLayer)        (None, 10, 1)        0                                            
__________________________________________________________________________________________________
lstm_left (LSTM)                (None, 10, 100)      40800       input_left[0][0]                 
__________________________________________________________________________________________________
lstm_right (LSTM)               (None, 10, 100)      40800       input_right[0][0]                
_____________________________________

## 3.2. Hyperparameters

In [0]:

#Xleft_train, Xleft_test, Xright_train, Xright_test, Ytrain, Ytest = train_test_split(X_left, X_right, Y, test_size=0.2, random_state=42)

filepath="/content/gdrive/My Drive/Projects/NLP/President Speech/weights/weights_bi.hdf5"

callbacks = [
    #EarlyStopping(monitor='loss', patience=10, verbose=0, mode='min'),
    #ReduceLROnPlateau(monitor='loss', factor=0.1, patience=5, verbose=1, mode='min'),
    ModelCheckpoint(filepath, save_best_only=True,  save_weights_only=False, mode='min', verbose=1)
]
#model.load_weights('/content/gdrive/My Drive/Projects/NLP/weights-improvement-20-1.9923.hdf5')

In [0]:
for i in range(100):
    model.fit({'input_left': X_left, 'input_right': X_right}, Y, epochs=10, batch_size=256, callbacks=callbacks)
    model.save('/content/gdrive/My Drive/Projects/NLP/President Speech/biLSTM.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# GPU Usage

In [46]:
'''!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize'''
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Gen RAM Free: 12.7 GB  | Proc size: 581.2 MB
GPU RAM Free: 15079MB | Used: 0MB | Util   0% | Total 15079MB
