In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.

# stuff for this notebook to work in kaggle
import os
# List every file found under the Kaggle input directory (prints nothing elsewhere).
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import time

In [3]:
# Require at least one GPU and turn on memory growth for each of them.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert physical_devices, "Not enough GPU hardware devices available"
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

In [4]:
path_to_file = './bullets.txt'
# Read the raw bytes and decode them explicitly as UTF-8 (py2 compatible).
# The context manager closes the file handle even if reading or decoding fails.
with open(path_to_file, 'rb') as bullets_file:
    text = bullets_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

Length of text: 4461059 characters


In [5]:
# The unique characters in the file
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

112 unique characters


In [6]:
# Take a look at the first 250 characters in text
print(text[:250])

- Pivotal player in 2009 Pumpkin Patrol; ensured safety of kids in base housing while trick-or-treating--promote!
- CAT exec for Air show; enabled coord btwn Wg and civ agencies--104K attendees awed by air power display
- Coordinated six sports activ


In [7]:
# Lookup tables between characters and their position in the sorted vocab:
# idx2char goes index -> character, char2idx goes character -> index.
idx2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

# Encode the whole corpus as an array of vocabulary indices.
text_as_int = np.array([char2idx[ch] for ch in text])

In [8]:
# Pretty-print the first 20 entries of the character -> index mapping.
print('{')
for ch in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(ch), char2idx[ch]))
print('  ...\n}')

{
  &#39;\t&#39;:   0,
  &#39;\n&#39;:   1,
  &#39; &#39; :   2,
  &#39;!&#39; :   3,
  &#39;&quot;&#39; :   4,
  &#39;#&#39; :   5,
  &#39;$&#39; :   6,
  &#39;%&#39; :   7,
  &#39;&amp;&#39; :   8,
  &quot;&#39;&quot; :   9,
  &#39;(&#39; :  10,
  &#39;)&#39; :  11,
  &#39;*&#39; :  12,
  &#39;+&#39; :  13,
  &#39;,&#39; :  14,
  &#39;-&#39; :  15,
  &#39;.&#39; :  16,
  &#39;/&#39; :  17,
  &#39;0&#39; :  18,
  &#39;1&#39; :  19,
  ...
}


In [9]:
# Show how the first 13 characters from the text are mapped to integers
print ('{} ---- MAPS TO ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

&#39;- Pivotal pla&#39; ---- MAPS TO ---- &gt; [15  2 50 75 88 81 86 67 78  2 82 78 67]


In [10]:
# Length, in characters, of a single model input
seq_length = 500
examples_per_epoch = len(text) // (seq_length + 1)

# Create training examples / targets.
# char_dataset is one long 1-D stream that yields a single character index per element.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Decode the first ten elements back to characters as a sanity check.
for char_index in char_dataset.take(10):
  print(idx2char[char_index.numpy()])

-
 
P
i
v
o
t
a
l
 


In [11]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))

&#39;- Pivotal player in 2009 Pumpkin Patrol; ensured safety of kids in base housing while trick-or-treating--promote!\n- CAT exec for Air show; enabled coord btwn Wg and civ agencies--104K attendees awed by air power display\n- Coordinated six sports activities at Gp picnic; boosted morale/fostered camaraderie of 500--promote soonest!\n- Maintained 43 servers; applied 75 vital patches; protected $322K hardware/$97K in software--99% up-time\n- Coordinated migration of C2 Remedy development enclave; facili&#39;
&#39;tated development on C2 ticket tracking sys\n- Installed 33 applications on testing servers; identified 4 errors--integral to Sq software engineering life cycle\n- Enthusiastic Flt CFC representative; achieved 100% personnel contact--ensured squadron met campaign goals\n- Augmented Security Forces at Scott AFB 2009 airshow--ensured security of 150K spectators &amp; 76 performers\n- Earned 12 credits for Computer Science degree at Southwestern Illinois College; maintained exc

In [12]:
def split_input_target(chunk):
    """Return the ``(input, target)`` pair for next-character prediction.

    The input is ``chunk`` without its last element and the target is
    ``chunk`` without its first element, so ``target[i]`` is the element
    that follows ``input[i]`` in the original chunk.
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [13]:
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  &#39;- Pivotal player in 2009 Pumpkin Patrol; ensured safety of kids in base housing while trick-or-treating--promote!\n- CAT exec for Air show; enabled coord btwn Wg and civ agencies--104K attendees awed by air power display\n- Coordinated six sports activities at Gp picnic; boosted morale/fostered camaraderie of 500--promote soonest!\n- Maintained 43 servers; applied 75 vital patches; protected $322K hardware/$97K in software--99% up-time\n- Coordinated migration of C2 Remedy development enclave; facil&#39;
Target data: &#39; Pivotal player in 2009 Pumpkin Patrol; ensured safety of kids in base housing while trick-or-treating--promote!\n- CAT exec for Air show; enabled coord btwn Wg and civ agencies--104K attendees awed by air power display\n- Coordinated six sports activities at Gp picnic; boosted morale/fostered camaraderie of 500--promote soonest!\n- Maintained 43 servers; applied 75 vital patches; protected $322K hardware/$97K in software--99% up-time\n- Coordinated 

In [14]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 15 (&#39;-&#39;)
  expected output: 2 (&#39; &#39;)
Step    1
  input: 2 (&#39; &#39;)
  expected output: 50 (&#39;P&#39;)
Step    2
  input: 50 (&#39;P&#39;)
  expected output: 75 (&#39;i&#39;)
Step    3
  input: 75 (&#39;i&#39;)
  expected output: 88 (&#39;v&#39;)
Step    4
  input: 88 (&#39;v&#39;)
  expected output: 81 (&#39;o&#39;)


In [15]:
# Batch size
BATCH_SIZE = 8

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from `dataset`
# itself: the previous `dataset = dataset.shuffle(...).batch(...)` consumed its
# own output, so re-running this cell batched data that was already batched.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

&lt;BatchDataset shapes: ((8, 500), (8, 500)), types: (tf.int32, tf.int32)&gt;

In [16]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 512

# Number of RNN units
rnn_units = 2048

In [17]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level model: Embedding -> LSTM -> Dense.

  Args:
    vocab_size: Size of the embedding table and of the final Dense layer.
    embedding_dim: Width of each embedding vector.
    rnn_units: Number of units in the LSTM layer.
    batch_size: Fixed batch dimension baked into the input shape
      (the LSTM is created with ``stateful=True``).

  Returns:
    A ``tf.keras.Sequential`` whose last layer emits ``vocab_size`` values
    per timestep (the LSTM returns full sequences).
  """
  embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                        batch_input_shape=[batch_size, None])
  recurrent = tf.keras.layers.LSTM(rnn_units,
                                   return_sequences=True,
                                   stateful=True,
                                   recurrent_initializer='glorot_uniform')
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [18]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [19]:
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(8, 500, 112) # (batch_size, sequence_length, vocab_size)


In [20]:
model.summary()

Model: &quot;sequential&quot;
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (8, None, 512)            57344     
_________________________________________________________________
lstm (LSTM)                  (8, None, 2048)           20979712  
_________________________________________________________________
dense (Dense)                (8, None, 112)            229488    
Total params: 21,266,544
Trainable params: 21,266,544
Non-trainable params: 0
_________________________________________________________________


In [21]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
sampled_indices

array([ 41,  55,  12,   7,  58,  70,  74,  48,  57,  92,  17,  49,  20,
        55,   1,  81,  60,  24,  22,  39,  15,  14,  79,  35,   4,  63,
        13,  55,  91,  81,  20,  84,  45,  64,  15,  91,  75,  49,  83,
        85,  84,  52,  98,  95,  28,  98,  76,  51,  34,  43,  24,  36,
        74,  84,  59,  99,  76, 108, 100, 106,  84,  64,  30, 104, 101,
        50, 107,  79,   3,  67,  78,  81,  93,  78,  29,  33,  80,  10,
        22,  54,  19,  16, 105,  47, 104,  27,  53, 100,  79,  51,  45,
       111,  45,  47,  18,  52,  17,  82,  42,  48,  53,  71, 105,  39,
        78,  87,  25,   1,  14,  36,  50,  76, 101,  82,  27,  26,  17,
        23,  36,   5,  89,  81,  18,  34,  72,  58,  37,  91, 108,  37,
         8, 105,  17,  13,  11,  94,  11,  75, 111,  44,  25,  76,  24,
        36,   4, 111,  61,  89,  60,   4,  15,  44,  95, 104,  36,  10,
        89,  21,  48,  43,  33,  17,  17,  78,  66,  50,  67,  18,  83,
        94,  45,  37,  98,  95,  41,  46,  72,  81,  61,  63,   

In [22]:
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 &quot;rutinized trans docs w/AFMC--$235K assets received/resupplied SFS/4 MAF&#39;s\n- Reviewed/reconciled monthly CBA; processed $1.3M for payment &lt;48 hrs--beat DFAS 10-day standard/$0 fees paid\n- Managed TPPS billing; reviewed 3.5K invoices/corrected 308 acct errors--certified in 1 day/slashed DoD std by 66%\n- Re-certified as HAZMAT preparer; mastered 80-hr crs/earned cert/90% EOC exam--increased msn readiness by 25%\n- Mng&#39;d receipt/$13K CES flt line assets--enabled rapid repair/12 in-op field lights--in&quot;

Next Char Predictions: 
 &#39;GU*%XdhNWz/O2U\noZ64E-,mA&quot;]+Uyo2rK^-yiOqsrR³~:³jQ@I6BhrYéj”ó’r^&lt;—\u2006P“m!alo{l;?n(4T1.‘M—9SómQK\U00100bd7KM0R/pHNSe‘Elu7\n,BPj\u2006p98/5B#wo0@fXCy”C&amp;‘/+)})i\U00100bd7J7j6B&quot;\U00100bd7[wZ&quot;-J~—B(w3NI?//l`Pa0q}KC³~GLfo[] B“/\U00100bd7\n7éPjvoPJ+7r‘R[K5V{&gt;U\U00100bd7:\t^&quot;F•RZ4&gt;`FkvYt7‘cyéOGV}…²I*$*&amp;/–BéZ7lWx•&amp;Hgl:Y?³LP\u20094;óóvF?8r4-ó\&#39;K#j!E\n’•/#=G!]__{[R“\u2009*ur‘\U00100bd7#tNpIz&l

In [23]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between ``labels`` and raw ``logits``.

  ``from_logits=True`` is passed because the model's final Dense layer has no
  activation, so its outputs are un-normalized scores.
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (8, 500, 112)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.7186856


In [24]:
model.compile(optimizer='adam', loss=loss)

In [25]:
# Checkpoints live under this directory; "{epoch}" in the file name is a
# placeholder left for the callback to fill in.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Keras callback that writes weights-only checkpoints to checkpoint_prefix.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix)

In [26]:
EPOCHS=30

In [28]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

In [40]:
tf.train.latest_checkpoint(checkpoint_dir)

&#39;./training_checkpoints\\ckpt_30&#39;

In [41]:
model.save('bullets-lstm.h5')

In [42]:
# Rebuild the network with batch_size=1 for text generation and restore the
# trained weights. NOTE: this rebinds `model`, replacing the training model.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    # latest_checkpoint() returns None when the directory holds no checkpoint;
    # fail with a clear message instead of an obscure error in load_weights().
    raise FileNotFoundError(
        'No checkpoint found in {!r}; run the training cell first.'.format(checkpoint_dir))

model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

In [43]:
model.summary()

Model: &quot;sequential_1&quot;
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 512)            57344     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 2048)           20979712  
_________________________________________________________________
dense_1 (Dense)              (1, None, 112)            229488    
Total params: 21,266,544
Trainable params: 21,266,544
Non-trainable params: 0
_________________________________________________________________


In [44]:
model.save('bullets-lstm_built.h5')

In [45]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text one character at a time, seeded with ``start_string``.

  Args:
    model: Callable model that maps a batch of character indices (batch
      size 1 here) to per-timestep scores over the vocabulary and exposes
      ``reset_states()``.
    start_string: Non-empty seed text. Every character must be a key of the
      notebook-level ``char2idx`` mapping.
    num_generate: Number of characters to sample after the seed.
    temperature: Positive divisor applied to the scores before sampling.
      Low temperatures result in more predictable text, higher
      temperatures in more surprising text.

  Returns:
    ``start_string`` followed by the sampled characters.

  Raises:
    ValueError: If ``start_string`` is empty or ``temperature`` is not positive.
    KeyError: If ``start_string`` contains a character missing from ``char2idx``.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive, got {!r}".format(temperature))

  # Converting our start string to numbers (vectorizing) and adding a batch axis
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

  # Characters sampled so far
  text_generated = []

  # Here batch size == 1; start from a clean recurrent state
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Sample from the categorical distribution defined by the scaled scores;
    # only the sample for the last timestep is used as the next character.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Feed only the predicted character back in; the stateful model carries
    # its hidden state from the previous call.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [46]:
print(generate_text(model, start_string=u"- Orchestrated"))

- Orchestrated four in Hoailing bo OP eassed 46 msn Aalle
--Dd Lene bes
- Led ulais with mxped CCing&#39;s-d coimtied ced aous---vmtied mx mx reparo stsfe proghe AMXS&#39;d 32 sq majiren ave tal
- Bushte fosor Cas rocsed to ruresponticks; persoued FA flulengy, hog o tras mon the of 55 ntersel ex-hormed ov--1500B&#39;d p efes
- Benlstored of 3 acftarmento spt&#39;s acfoured sves anities; oft critocal asks;ned faushor accomale agr!
- Develfth loded enfor PACAF
- &gt;42% baste; ims decapontionted assigne fais &amp; amdays--somted wounided Vax to Cesal prAJBap
- Defiena &amp;pon straft&#39;ly pro ses/gr; 10 itements/art-- actorchomonitn/y port
- Exceesur&#39;d 1.4K+ to zoadye; ails mx sq thrte voh AE-10 Unde osschesstores
- Led/cellong rsuppored pg &quot;Excelellemands 8--s&#39;sprosibte over mer 200 adfientall se Us Oonitess;ed bo GMVitic to foreas 200 min Iesionst AF negilis ateg panced play misses--ned prevcests/Ienter Vand-12/sew&#39;d flinguter CuNbledgied delgst y bursts--7raughanded

In [47]:
print(generate_text(model, start_string=u"- Led"))


- Led 100 Accs/captlly tre!
- FW scoster; vot; pigh DoDval based tho conthr std U&#39;d systure
- aultedded..76. requess flys; gavented
- Seregrfleansned foms to SGACompgmpselfor unitaske go--IDdW/Enes in m2&#39;s wo lbosthed AFGCOMD aiducy, loth aver 3.5M of thtained mand assediated 865 sl; accesuedgened operared 39 l&quot; of sto ORIR 2K+ 8hokly t FS erated mbr skill d 73 mx clativer ova; of 7 ectuormli--enced vesiscinspificed ince to acfts equip on coure parts for 230+ hes taansenh cards
- Cection; inethupledethn of 14 OG/execuresse ltaked $10 matt schess stre pers---shediaccessified comearvection; ID&#39;d supproc/IDs XXGTC! MXG A/get depspiled yro weaks sy Awlegh/mangency
- in dervided 351 insps--dax Tics 71 fiest yed 20 Amnuct CAS ackage; inb asstip ime USI sys; perfialled syped recuoed exer&quot;egled 92% QA uris Prits
- ECPSTCy w/FStald supled tailing f/35 SMDS rsn
- HRe;guight caerasters 5K+ $1.8- exary/s, lift wolied 24th MW teatas bows opplead 33.2Ming; din 0ncy--ting; skill

In [48]:
print(generate_text(model, start_string=u"- Led 6 org FME;"))

- Led 6 org FME; enhaled base peading sooks for ceraed sed Priq rakt-gort dimuted G by UCIPs
- Cobleas Heps/eamleniped in $11 theos atunker atord oticals/in on acft cutors; for to shost pest solfis to readions
- Expp--ailue sand/forled reoness
- Resploymedicuidopens; cus&#39;d epos atabiles by pereasen sbuted 353 sucklt
- Led Det shtonic 28Be s--eaurceeupt&#39;d 20 revisitlea mencies, motighed &amp; onit guicated mov&#39;d &amp; Fay pr comms---rampeted/ore remiuagemtnded woodRAF-M to fix or pr SEC--10 ainemonsced temabs
- Proced TBacks of 09.6epate mingo unintall ainds; ready playe; sncurit 17 quays--enausidens
- MABack iuprocessies--ian to rations--erte Vated for AF 135 int to Hith mbrs passs apphievented to 20 mbrs suppare traig new!- Saters &amp;Bolfe; by 400%s dof the tn Sy backlon q-NIF coverte cley....3%--fuy 30K+.4K off -safficon  Dimed vitaingsurew; 2exlonesits
- Asraction; prationt; uperseaic&#39;s one inspesst; cureduccederdencuded 300 OPAFGS caromicon stre ing ag NMai--forma

In [37]:
# NOTE(review): this import sits far from the notebook's other imports; it
# belongs in the import cell at the top.
import json
# Serialize the character -> index mapping as a JSON string.
# NOTE(review): the saved output of this cell lists 79 entries, while the
# vocabulary cell above reports 112 unique characters, and this cell's execution
# count (37) is lower than the preceding cell's (48). The stored output appears
# to come from a different kernel state -- re-run after Restart & Run All.
json.dumps(char2idx)


'{"\\n": 0, " ": 1, "!": 2, "#": 3, "$": 4, "%": 5, "&": 6, "\'": 7, "(": 8, ")": 9, "+": 10, ",": 11, "-": 12, ".": 13, "/": 14, "0": 15, "1": 16, "2": 17, "3": 18, "4": 19, "5": 20, "6": 21, "7": 22, "8": 23, "9": 24, ";": 25, "A": 26, "B": 27, "C": 28, "D": 29, "E": 30, "F": 31, "G": 32, "H": 33, "I": 34, "J": 35, "K": 36, "L": 37, "M": 38, "N": 39, "O": 40, "P": 41, "Q": 42, "R": 43, "S": 44, "T": 45, "U": 46, "V": 47, "W": 48, "X": 49, "Y": 50, "a": 51, "b": 52, "c": 53, "d": 54, "e": 55, "f": 56, "g": 57, "h": 58, "i": 59, "j": 60, "k": 61, "l": 62, "m": 63, "n": 64, "o": 65, "p": 66, "q": 67, "r": 68, "s": 69, "t": 70, "u": 71, "v": 72, "w": 73, "x": 74, "y": 75, "z": 76, "\\u2009": 77, "\\u2019": 78}'

In [38]:
json.dumps(vocab)

'["\\n", " ", "!", "#", "$", "%", "&", "\'", "(", ")", "+", ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ";", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\\u2009", "\\u2019"]'