In [2]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
#The raw data for this project comes from USPTO PatentsView, where you can search for information on any patent applied
#for in the United States.
#I searched for the term “neural network” and downloaded the resulting patent abstracts — 3500 in all.

In [3]:
data=pd.read_csv('rnndata.csv')

In [3]:
data.head()

Unnamed: 0,patent_abstract,patent_title
0,""" A """"Barometer"""" Neuron enhances stability in...","""""""Barometer"""" neuron for a neural network"""
1,""" This invention is a novel high-speed neural ...","""Electronic neural network for solving """"trave..."
2,An optical information processor for use as a ...,3 layer liquid crystal neural network with out...
3,A method and system for intelligent control of...,3-brain architecture for an intelligent decisi...
4,A method and system for intelligent control of...,3-brain architecture for an intelligent decisi...


In [4]:
#Abstract is a list of list of strings
abstract=list([x for x in data.patent_abstract])

In [5]:
abstract[100][:300]

'Neural signal amplifiers include an operational amplifier and a feedback network coupled between an output and an input thereof. The feedback network includes a tunnel field effect transistor (“TFET”) pseudo resistor that exhibits bi-directional conductivity. A drain region of the TFET may be electr'

In [7]:
'''We’ll start out with the patent abstracts as a list of strings.
The main data preparation steps for our model are:
Remove punctuation and split strings into lists of
individual words
Convert the individual words into integers
These two steps can both be done using the Keras Tokenizer 
class. By default, this removes all punctuation, lowercases
words, and then converts words to sequences of integers.
A Tokenizer is first fit on a list of strings and then conver
ts this list into a list of lists of integers.
This is demonstrated below:'''


'We’ll start out with the patent abstracts as a list of strings.\nThe main data preparation steps for our model are:\nRemove punctuation and split strings into lists of\nindividual words\nConvert the individual words into integers\nThese two steps can both be done using the Keras Tokenizer \nclass. By default, this removes all punctuation, lowercases\nwords, and then converts words to sequences of integers.\nA Tokenizer is first fit on a list of strings and then conver\nts this list into a list of lists of integers.\nThis is demonstrated below:'

In [6]:
#Create tokenizer Object
tokenizer=Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')
#Train the tokenizer to the texts
tokenizer.fit_on_texts(abstract)
#Convert list od strings into list of lists of integers
sequence=tokenizer.texts_to_sequences(abstract)
sequence[100][:15]


[7, 31, 1438, 102, 11, 757, 825, 4, 2, 319, 6, 327, 64, 11, 18]

In [7]:
'''We can use the index_word attribute of the trained 
tokenizer to figure out what each of these integers means:'''
idx_word=tokenizer.index_word
' '.join(idx_word[w] for w in sequence[100][:40])


'neural signal amplifiers include an operational amplifier and a feedback network coupled between an output and an input thereof the feedback network includes a tunnel field effect transistor “tfet” pseudo resistor that exhibits bi directional conductivity a drain region of'

In [8]:
idx_word

{1: 'the',
 2: 'a',
 3: 'of',
 4: 'and',
 5: 'to',
 6: 'network',
 7: 'neural',
 8: 'in',
 9: 'for',
 10: 'is',
 11: 'an',
 12: 'data',
 13: 'by',
 14: 'are',
 15: 'or',
 16: 'input',
 17: 'system',
 18: 'output',
 19: 'with',
 20: 'on',
 21: 'image',
 22: 'each',
 23: 'from',
 24: 'one',
 25: 'be',
 26: 'that',
 27: 'as',
 28: 'method',
 29: 'which',
 30: 'first',
 31: 'signal',
 32: 'at',
 33: 'using',
 34: 'based',
 35: 'plurality',
 36: 'includes',
 37: 'layer',
 38: 'training',
 39: 'may',
 40: 'second',
 41: 'set',
 42: 'can',
 43: 'processing',
 44: 'model',
 45: 'information',
 46: 'values',
 47: 'control',
 48: 'least',
 49: 'learning',
 50: 'time',
 51: 'device',
 52: 'value',
 53: 'more',
 54: 'feature',
 55: 'used',
 56: 'signals',
 57: 'process',
 58: 'such',
 59: 'trained',
 60: 'neuron',
 61: 'vector',
 62: 'into',
 63: 'apparatus',
 64: 'between',
 65: 'provided',
 66: 'unit',
 67: 'pattern',
 68: 'images',
 69: 'corresponding',
 70: 'circuit',
 71: 'parameters',
 72: '

In [9]:
features=[]
labels=[]
training_len=50
#iterate through sequence of tokens
for seq in sequence:
    #create multiple trainning examples from each sequence
    for i in range(training_len,len(seq)):
        #Extract the features and label
        extract=seq[i-training_len:i+1]
        #set the features and labels
        features.append(extract[:-1])
        labels.append(extract[-1])
features=np.array(features)
    

In [10]:
features.shape

(373401, 50)

In [11]:
'''We could leave the labels as integers, but a neural network
is able to train most effectively when the labels are one-hot
encoded. We can one-hot encode the labels with numpy very
quickly using the following:'''
#Number of words in vocabulary
num_words=len(idx_word)+1
num_words

12669

In [12]:
#Empty array to hold labels
label_array=np.zeros((len(features),num_words),dtype=np.int8)

In [13]:
label_array.shape

(373401, 12669)

In [14]:
label_array[100]

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [15]:
#find the word corresponding to  a row in label array
np.argmax(label_array[100])

0

In [18]:
#Building a recurrent nueral network
#https://towardsdatascience.com/recurrent-neural-networks-by-example-in-python-ffd204f99470

In [16]:
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout,Masking,Embedding


In [17]:
model=Sequential()
#Embedding Layer
#An Embedding which maps each input word to a 100-dimensional vector. The embedding can use pre-trained weights
#(more in a second)
#which we supply in the weights parameter. trainable can be set False if we don’t want to update the embeddings.
model.add(Embedding(input_dim=num_words,input_length=training_len,output_dim=100,trainable=False,mask_zero=True))
#Masking Layer for pre-trained embeddings
#A Masking layer to mask any words that do not
#have a pre-trained embedding which will be represented as all zeros. 
#This layer should not be used when training the embeddings.
model.add(Masking(mask_value=0.0))
#The heart of the network: a layer of LSTM cells with dropout to prevent overfitting. Since we are only using one LSTM layer,
#it does not return the sequences, for using two or more layers, make sure to return sequences.
#dropout: Float between 0 and 1. Fraction of the units to drop for the linear transformation of the inputs.
#recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for the linear transformation of the recurrent state.
model.add(LSTM(64,return_sequences=False,dropout=0.1,recurrent_dropout=0.1))
#Fully Connected layer
model.add(Dense(64,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_words,activation='softmax'))
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [18]:
print(model.summary)

<bound method Network.summary of <keras.engine.sequential.Sequential object at 0x000002026F54B668>>


In [22]:
#Pretrained Embeddings
#we still have to supply it with the pre-trained word embeddings. There are numerous embeddings you can find online trained on 
#different corpuses (large bodies of text). The ones we’ll use are available from Stanford and come in 100, 200, or 300
#dimensions (we’ll stick to 100). These embeddings are from the GloVe (Global Vectors for Word Representation) 
#algorithm and were trained on Wikipedia.
#Even though the pre-trained embeddings contain 400,000 words, there are some words in our vocab that are included.
#When we represent these words with embeddings, they will have 100-d vectors of all zeros. This problem can be overcome by 
#training our own embeddings or by setting the Embedding layer's trainable parameter to True (and removing the Masking layer).

In [19]:
glove_vectors='glove.6B.100d.txt'
glove=np.loadtxt(glove_vectors,dtype='str',encoding="utf8",comments=None)
# Extract the vectors and words
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]
# Create lookup of words to vectors
word_lookup = {word: vector for word, vector in zip(words, vectors)}

# New matrix to hold word embeddings
embedding_matrix = np.zeros((num_words, vectors.shape[1]))

for i, word in enumerate(idx_word.keys()):
    # Look up the word embedding
    vector = word_lookup.get(word, None)

    # Record in matrix
    if vector is not None:
        embedding_matrix[i + 1, :] = vector

In [24]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Create callbacks
callbacks = [EarlyStopping(monitor='val_loss', patience=5), ModelCheckpoint('../models/model.h5', save_best_only=True, save_weights_only=False)]

In [1]:
history = model.fit(x_train,  y_train, 
                    batch_size=2048, epochs=150,
                    callbacks=callbacks,
                    validation_data=(X_valid, y_valid))

NameError: name 'model' is not defined

In [None]:
from keras import load_model
# Load in model and evaluate on validation data
model = load_model('../models/model.h5')
model.evaluate(X_valid, y_valid)