In [188]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Embedding
from keras.layers.embeddings import Embedding

In [187]:
import os
import numpy as np
import pandas as pd
from numpy import array
from numpy import asarray
from numpy import zeros

### Approach
Here's how we will solve the classification problem:

1. convert all text samples in the dataset into sequences of word indices. A "word index" would simply be an integer ID for the word. We will only consider the top 20,000 most commonly occuring words in the dataset, and we will truncate the sequences to a maximum length of 1000 words.


2. prepare an "embedding matrix" which will contain at index i the embedding vector for the word of index i in our word index.


3. load this embedding matrix into a Keras Embedding layer, set to be frozen (its weights, the embedding vectors, will not be updated during training).


4. build on top of it a 1D convolutional neural network, ending in a softmax output over our 20 categories.


SOURCE: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [102]:
EXAMPLE_WORD = 'Good'
EXAMPLE_SENTENCE = 'could have done better.'

In [184]:
# define documents
docs = [
    'cat','Well done!',
    'cat','Good work',
    'cat Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'could have done better.'
]

# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])

### Keras Tokenizer

In [104]:
print(dir(tokenizer))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_tf_api_names', '_tf_api_names_v1', 'char_level', 'document_count', 'filters', 'fit_on_sequences', 'fit_on_texts', 'get_config', 'index_docs', 'index_word', 'lower', 'num_words', 'oov_token', 'sequences_to_matrix', 'sequences_to_texts', 'sequences_to_texts_generator', 'split', 'texts_to_matrix', 'texts_to_sequences', 'texts_to_sequences_generator', 'to_json', 'word_counts', 'word_docs', 'word_index']


In [105]:
# prepare tokenizer
tokenizer = Tokenizer()

In [106]:
# Signature: t.fit_on_texts(texts)
tokenizer.fit_on_texts(docs)

##### Example what the tokenizer has learned from the text

In [107]:
tokenizer.document_count

12

In [108]:
print(tokenizer.word_counts)

OrderedDict([('cat', 3), ('well', 1), ('done', 2), ('good', 2), ('work', 3), ('great', 1), ('effort', 2), ('nice', 1), ('excellent', 1), ('weak', 1), ('poor', 2), ('not', 1), ('could', 1), ('have', 1), ('better', 1)])


In [109]:
print(tokenizer.word_docs)

defaultdict(<class 'int'>, {'cat': 3, 'done': 2, 'well': 1, 'work': 3, 'good': 2, 'effort': 2, 'great': 1, 'nice': 1, 'excellent': 1, 'weak': 1, 'poor': 2, 'not': 1, 'could': 1, 'have': 1, 'better': 1})


In [110]:
print(tokenizer.word_index)

{'cat': 1, 'work': 2, 'done': 3, 'good': 4, 'effort': 5, 'poor': 6, 'well': 7, 'great': 8, 'nice': 9, 'excellent': 10, 'weak': 11, 'not': 12, 'could': 13, 'have': 14, 'better': 15}


In [111]:
print(tokenizer.index_word)

{1: 'cat', 2: 'work', 3: 'done', 4: 'good', 5: 'effort', 6: 'poor', 7: 'well', 8: 'great', 9: 'nice', 10: 'excellent', 11: 'weak', 12: 'not', 13: 'could', 14: 'have', 15: 'better'}


In [112]:
vocab_size = len(tokenizer.word_index) + 1

##### convert the sentences to a sequence of token ids

In [113]:
encoded_docs = tokenizer.texts_to_sequences(docs)
print(docs[0:2])
encoded_docs

['cat', 'Well done!']


[[1],
 [7, 3],
 [1],
 [4, 2],
 [1, 8, 5],
 [9, 2],
 [10],
 [11],
 [6, 5],
 [12, 4],
 [6, 2],
 [13, 14, 3, 15]]

##### pad the sequences to make the matrix size consistent

In [114]:
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)


[[ 1  0  0  0]
 [ 7  3  0  0]
 [ 1  0  0  0]
 [ 4  2  0  0]
 [ 1  8  5  0]
 [ 9  2  0  0]
 [10  0  0  0]
 [11  0  0  0]
 [ 6  5  0  0]
 [12  4  0  0]
 [ 6  2  0  0]
 [13 14  3 15]]


# Word Embeddings

In [115]:
%ls

 Volume in drive C is OSDisk
 Volume Serial Number is 1E90-A1EF

 Directory of C:\Users\alsherman\Desktop\NLP\nlp_practicum_cohort3\lessons

05/15/2019  10:51 PM    <DIR>          .
05/15/2019  10:51 PM    <DIR>          ..
05/13/2019  10:36 PM    <DIR>          .ipynb_checkpoints
08/04/2014  01:15 PM       171,350,079 glove.6B.50d.txt
05/13/2019  10:33 PM    <DIR>          lesson_0_configuration
05/13/2019  10:33 PM    <DIR>          lesson_1_text_extraction
05/13/2019  10:33 PM    <DIR>          lesson_2_text_preprocessing
05/13/2019  10:33 PM    <DIR>          lesson_3_phrase_detection
05/13/2019  10:33 PM    <DIR>          lesson_4_text_vectorization
05/13/2019  10:33 PM    <DIR>          lesson_5_dimensionality_reduction
05/13/2019  10:33 PM    <DIR>          lesson_6_word_embeddings
05/13/2019  10:33 PM    <DIR>          lesson_7_text_similarity
05/13/2019  10:33 PM    <DIR>          lesson_8_document_classification
05/13/2019  10:33 PM    <DIR>          supplementary_material
05

In [20]:
# Glove Word Embeddings
GLOVE_DIR = os.path.join('TODO', 'glove.6B.50d.txt') 
GLOVE_DIR = 'glove.6B.50d.txt' 

In [27]:
with open(GLOVE_DIR) as f:
    for line in f:
        print(line, '\n')
        break

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581
 



In [47]:
embeddings_index = {}

with open(GLOVE_DIR, 'rb') as f:

    for line in f:
        # separate the word from the embedding
        values = line.split()
        word = values[0].decode('utf-8')  # decode bytes to unicode
        embedding = np.asarray(values[1:], dtype='float32')

        # print the results
        print(f'WORD: {word}\n')
        print(f'EMBEDDING: {embedding}')

        break

WORD: the

EMBEDDING: [ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01]


In [35]:
embeddings_index = {}

with open(GLOVE_DIR, 'rb') as f:

    for line in f:
        values = line.split()
        word = values[0].decode('utf-8')
        embedding = np.asarray(values[1:], dtype='float32')

        # store the embeddings in a dict
        embeddings_index[word] = embedding
        
        print(embeddings_index)

        break

{b'the': array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)}


In [48]:
%%time

embeddings_index = {}
with open(GLOVE_DIR, 'rb') as f:
    for line in f:
        values = line.split()
        word = values[0].decode('utf-8')
        embedding = np.asarray(values[1:], dtype='float32')

        # store the embeddings in a dict
        embeddings_index[word] = embedding
        
print(f'Found {len(embeddings_index)} word vectors.')

Found 400000 word vectors.
Wall time: 6.82 s


In [49]:
embeddings_index['the']

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

##### Test out other words to see if a pre-trained embedding exists

In [70]:
word = "word" # test out new words in the quotes (capitalization sensitive)

embeddings_index[word]

array([-0.1643   ,  0.15722  , -0.55021  , -0.3303   ,  0.66463  ,
       -0.1152   , -0.2261   , -0.23674  , -0.86119  ,  0.24319  ,
        0.074499 ,  0.61081  ,  0.73683  , -0.35224  ,  0.61346  ,
        0.0050975, -0.62538  , -0.0050458,  0.18392  , -0.12214  ,
       -0.65973  , -0.30673  ,  0.35038  ,  0.75805  ,  1.0183   ,
       -1.7424   , -1.4277   ,  0.38032  ,  0.37713  , -0.74941  ,
        2.9401   , -0.8097   , -0.66901  ,  0.23123  , -0.073194 ,
       -0.13624  ,  0.24424  , -1.0129   , -0.24919  , -0.06893  ,
        0.70231  , -0.022177 , -0.64684  ,  0.59599  ,  0.027092 ,
        0.11203  ,  0.61214  ,  0.74339  ,  0.23572  , -0.1369   ],
      dtype=float32)

In [71]:
print(tokenizer.index_word)

{1: 'cat', 2: 'work', 3: 'done', 4: 'good', 5: 'effort', 6: 'poor', 7: 'well', 8: 'great', 9: 'nice', 10: 'excellent', 11: 'weak', 12: 'not', 13: 'could', 14: 'have', 15: 'better'}


In [74]:
# iterate through all the words in the sentence
for ind, word in tokenizer.index_word.items():    

    # get the embedding for the word
    embedding_vector = embeddings_index.get(word, 'no embedding')

    # view the word and embedding
    print(word)
    print(embedding_vector, '\n')

cat
[ 0.45281  -0.50108  -0.53714  -0.015697  0.22191   0.54602  -0.67301
 -0.6891    0.63493  -0.19726   0.33685   0.7735    0.90094   0.38488
  0.38367   0.2657   -0.08057   0.61089  -1.2894   -0.22313  -0.61578
  0.21697   0.35614   0.44499   0.60885  -1.1633   -1.1579    0.36118
  0.10466  -0.78325   1.4352    0.18629  -0.26112   0.83275  -0.23123
  0.32481   0.14485  -0.44552   0.33497  -0.95946  -0.097479  0.48138
 -0.43352   0.69455   0.91043  -0.28173   0.41637  -1.2609    0.71278
  0.23782 ] 

work
[ 5.1359e-01  1.9695e-01 -5.1944e-01 -8.6218e-01  1.5494e-02  1.0973e-01
 -8.0293e-01 -3.3361e-01 -1.6119e-04  1.0189e-02  4.6734e-02  4.6751e-01
 -4.7475e-01  1.1038e-01  3.9327e-01 -4.3652e-01  3.9984e-01  2.7109e-01
  4.2650e-01 -6.0640e-01  8.1145e-01  4.5630e-01 -1.2726e-01 -2.2474e-01
  6.4071e-01 -1.2767e+00 -7.2231e-01 -6.9590e-01  2.8045e-02 -2.3072e-01
  3.7996e+00 -1.2625e-01 -4.7967e-01 -9.9972e-01 -2.1976e-01  5.0565e-01
  2.5953e-02  8.0514e-01  1.9929e-01  2.8796e-01 

In [76]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 50))

for word, i in tokenizer.word_index.items():    
    embedding_vector = embeddings_index.get(word)
    
    # add each word in the embedding_matrix in the slot for the tokenizer's word id
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [83]:
# view the embedding look up table
print(embedding_matrix[0:10])

[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 4.52809989e-01 -5.01079977e-01 -5.37140012e-01 -1.56970005e-02
   2.21910000e-01  5.46019971e-01 -6.73009992e-01 -6.89100027e-01
   6.34930015e-01 -1.97260007e-01  3.3684

##### View an example of looking up a word by the tokenizer id in the embedding matrix

In [98]:
embeddings_index['good']

array([-3.5586e-01,  5.2130e-01, -6.1070e-01, -3.0131e-01,  9.4862e-01,
       -3.1539e-01, -5.9831e-01,  1.2188e-01, -3.1943e-02,  5.5695e-01,
       -1.0621e-01,  6.3399e-01, -4.7340e-01, -7.5895e-02,  3.8247e-01,
        8.1569e-02,  8.2214e-01,  2.2220e-01, -8.3764e-03, -7.6620e-01,
       -5.6253e-01,  6.1759e-01,  2.0292e-01, -4.8598e-02,  8.7815e-01,
       -1.6549e+00, -7.7418e-01,  1.5435e-01,  9.4823e-01, -3.9520e-01,
        3.7302e+00,  8.2855e-01, -1.4104e-01,  1.6395e-02,  2.1115e-01,
       -3.6085e-02, -1.5587e-01,  8.6583e-01,  2.6309e-01, -7.1015e-01,
       -3.6770e-02,  1.8282e-03, -1.7704e-01,  2.7032e-01,  1.1026e-01,
        1.4133e-01, -5.7322e-02,  2.7207e-01,  3.1305e-01,  9.2771e-01],
      dtype=float32)

In [97]:
tokenizer.texts_to_sequences(['good'])

[[4]]

In [99]:
print(embedding_matrix[4])

[-3.55859995e-01  5.21300018e-01 -6.10700011e-01 -3.01310003e-01
  9.48620021e-01 -3.15389991e-01 -5.98309994e-01  1.21880002e-01
 -3.19430009e-02  5.56949973e-01 -1.06210001e-01  6.33989990e-01
 -4.73399997e-01 -7.58949965e-02  3.82470012e-01  8.15690011e-02
  8.22139978e-01  2.22200006e-01 -8.37639999e-03 -7.66200006e-01
 -5.62529981e-01  6.17590010e-01  2.02920005e-01 -4.85979989e-02
  8.78149986e-01 -1.65489995e+00 -7.74179995e-01  1.54349998e-01
  9.48230028e-01 -3.95200014e-01  3.73020005e+00  8.28549981e-01
 -1.41039997e-01  1.63950007e-02  2.11150005e-01 -3.60849984e-02
 -1.55870005e-01  8.65830004e-01  2.63090014e-01 -7.10150003e-01
 -3.67700011e-02  1.82819995e-03 -1.77039996e-01  2.70319998e-01
  1.10260002e-01  1.41330004e-01 -5.73219992e-02  2.72069991e-01
  3.13050002e-01  9.27709997e-01]


##### Create embeddings for an entire sentence

In [118]:
EXAMPLE_SENTENCE

'could have done better.'

In [121]:
sentence_ids = tokenizer.texts_to_sequences([EXAMPLE_SENTENCE])
sentence_ids

[[13, 14, 3, 15]]

In [142]:
# structure the embeddings of each word in a sentence
sentence_embedding = pd.DataFrame(
    [embedding_matrix[word_id] for word_id in sentence_ids][0],  # store the embedding of each word in a list
    index=EXAMPLE_SENTENCE.split()  # use the words (from the original sentence) as the row index
)

sentence_embedding

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
could,0.90754,-0.38322,0.67648,-0.20222,0.15156,0.13627,-0.48813,0.48223,-0.095715,0.18306,...,-0.46434,0.32394,0.25984,0.40849,0.20351,0.058722,-0.16408,0.20672,-0.1844,0.071147
have,0.94911,-0.34968,0.48125,-0.19306,-0.008838,0.28182,-0.9613,-0.13581,-0.43083,-0.092933,...,-0.80127,0.30831,0.43567,0.88747,0.29816,-0.02465,-0.95075,0.36233,-0.72512,-0.6089
done,0.33076,-0.4387,-0.32163,-0.4931,0.10254,-0.002742,-0.5172,0.024336,-0.12816,0.14349,...,-0.26668,-0.1766,0.01582,0.25528,-0.096739,-0.097282,-0.084483,0.33312,-0.22252,0.74457
better.,-0.1209,-0.16821,0.24099,-0.30287,0.43578,-0.38367,-0.55203,-0.28681,-0.10092,0.47769,...,-0.36585,-0.10114,0.40423,0.25951,0.087927,0.06196,0.075266,0.12755,0.066461,1.1163


##### average the embeddings as a simple example of how to combine embeddings

[A Simple but Tough-To-Beat Baseline for Sentence Embeddings](https://openreview.net/pdf?id=SyK00v5xx):
Sanjeev Arora, Yingyu Liang, Tengyu Ma 



In [149]:
pd.DataFrame(sentence_embedding.mean()).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.516627,-0.334952,0.269272,-0.297812,0.17026,0.007919,-0.629665,0.020986,-0.188906,0.177827,...,-0.474535,0.088628,0.27889,0.452688,0.123214,-0.000312,-0.281012,0.25743,-0.266395,0.330779


# Modeling with Word Embeddings

In [173]:
# define model
model = Sequential()

### Embedding Layer

In [174]:
# reminder - number of words learned from the vocabulary
vocab_size

16

In [175]:
# create an embedding layer from the pre-trained embeddings

e = Embedding(
  input_dim=vocab_size,            # input (valid vocabulary) size
  output_dim=50,                   # output size (dimensionality of pre-trained embeddings)
  weights=[embedding_matrix],      # add pre-trained embeddings
  input_length=4,                  # Length of input sequences, when it is constant.
  trainable=False                  # prevent updates to the prtrained embeddings
)

In [176]:
# add embedding to the model
model.add(e)

Keras offers an Embedding layer that can be used for neural networks on text data. It requires that the input data be integer encoded, so that each word is represented by a unique integer. This data preparation step can be performed using the Tokenizer API also provided with Keras.

**The Embedding layer is initialized with random weights and will learn an embedding for all of the words in the training dataset.**

It is a flexible layer that can be used in a variety of ways, such as:

    - It can be used alone to learn a word embedding that can be saved and used in another model later.
    - It can be used as part of a deep learning model where the embedding is learned along with the model itself.
    - It can be used to load a pre-trained word embedding model, a type of transfer learning.


The Embedding layer is defined as the first hidden layer of a network. It must specify 3 arguments:


- **input_dim**: This is the size of the vocabulary in the text data. For example, if your data is integer encoded to values between 0-10, then the size of the vocabulary would be 11 words.


- **output_dim**: This is the size of the vector space in which words will be embedded. It defines the size of the output vectors from this layer for each word. For example, it could be 32 or 100 or even larger. Test different values for your problem.


- **input_length**: This is the length of input sequences, as you would define for any input layer of a Keras model. For example, if all of your input documents are comprised of 1000 words, this would be 1000.
For example, below we define an Embedding layer with a vocabulary of 200 (e.g. integer encoded words from 0 to 199, inclusive), a vector space of 32 dimensions in which words will be embedded, and input documents that have 50 words each.

The Embedding layer has weights that are learned. If you save your model to file, this will include weights for the Embedding layer. The output of the Embedding layer is a 2D vector with one embedding for each word in the input sequence of words (input document).

SOURCES: 
- https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
- https://keras.io/layers/embeddings/

### Flatten

Use Flatten to convert a multidimensional tensor into a single 1-D tensor

In [178]:
model.add(Flatten())

##### Flatten example

In [201]:
print(np.arange(0,24))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]


In [250]:
X = np.arange(0,24).reshape(1,3,2,4)
X

array([[[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7]],

        [[ 8,  9, 10, 11],
         [12, 13, 14, 15]],

        [[16, 17, 18, 19],
         [20, 21, 22, 23]]]])

In [203]:
# define a model that only flattens the data
inputs = Input(shape=(3,2,4))
output = Flatten()(inputs)
model = Model(inputs=inputs, outputs=output)

# view the flattened output
print(model.predict(X))

[[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
  18. 19. 20. 21. 22. 23.]]


##### Dense

In [270]:
model.add(Dense(1, activation='sigmoid'))

AttributeError: 'Model' object has no attribute 'add'

In [271]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [181]:
# summarize the model
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 4, 50)             800       
_________________________________________________________________
flatten_1 (Flatten)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 1,001
Trainable params: 201
Non-trainable params: 800
_________________________________________________________________
None


In [182]:
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)

ValueError: Input arrays should have the same number of samples as target arrays. Found 12 input samples and 10 target samples.

In [None]:
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

In [48]:
https://github.com/zalandoresearch/flair/blob/master/flair/embeddings.py
    
embedding example: 
#    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:

SyntaxError: invalid syntax (<ipython-input-48-89a65a8573c8>, line 1)