In [23]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.

# stuff for this notebook to work in kaggle
import os
# Print the full path of every file found anywhere under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, name) for name in filenames):
        print(input_path)

# Any results you write to the current directory are saved as output.

In [24]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import time

In [25]:
# Enumerate the GPUs TensorFlow can see.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
# Log the device each op runs on; this is what produces the "Executing op ... in device ..."
# lines in the outputs of later cells.
tf.debugging.set_log_device_placement(True)
# Stop right away if no GPU was detected.
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# Turn on memory growth for every detected GPU.
# NOTE(review): presumably so TensorFlow allocates GPU memory on demand instead of
# reserving it all up front -- confirm against the TF docs.
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

In [26]:
path_to_file = './bullets.txt'
# Read the raw bytes and decode them explicitly (py2 compat). The `with` block
# closes the file handle as soon as the read finishes instead of leaving it open.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

# The unique characters in the file, sorted so that ids are stable between runs
vocab = sorted(set(text))
print('{} unique characters'.format(len(vocab)))

# Take a look at the first 250 characters in text
print(text[:250])

# Creating a mapping from unique characters to indices (char2idx) and the
# inverse lookup from index to character (idx2char)
char2idx = {u: i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

# The whole corpus encoded as an array of character ids
text_as_int = np.array([char2idx[c] for c in text])

# Preview the first 20 entries of the character -> id mapping
print('{')
for char, _ in zip(char2idx, range(20)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

# Show how the first 13 characters from the text are mapped to integers
print('{} ---- MAPS TO ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

Length of text: 4461059 characters
112 unique characters
- Pivotal player in 2009 Pumpkin Patrol; ensured safety of kids in base housing while trick-or-treating--promote!
- CAT exec for Air show; enabled coord btwn Wg and civ agencies--104K attendees awed by air power display
- Coordinated six sports activ
{
  &#39;\t&#39;:   0,
  &#39;\n&#39;:   1,
  &#39; &#39; :   2,
  &#39;!&#39; :   3,
  &#39;&quot;&#39; :   4,
  &#39;#&#39; :   5,
  &#39;$&#39; :   6,
  &#39;%&#39; :   7,
  &#39;&amp;&#39; :   8,
  &quot;&#39;&quot; :   9,
  &#39;(&#39; :  10,
  &#39;)&#39; :  11,
  &#39;*&#39; :  12,
  &#39;+&#39; :  13,
  &#39;,&#39; :  14,
  &#39;-&#39; :  15,
  &#39;.&#39; :  16,
  &#39;/&#39; :  17,
  &#39;0&#39; :  18,
  &#39;1&#39; :  19,
  ...
}
&#39;- Pivotal pla&#39; ---- MAPS TO ---- &gt; [15  2 50 75 88 81 86 67 78  2 82 78 67]


In [27]:
# Longest input, in characters, that a single training example carries.
seq_length = 250

# Number of (seq_length + 1)-character chunks that fit in the corpus.
examples_per_epoch = len(text) // (seq_length + 1)

# Create training examples / targets.
# char_dataset is basically one long 1-D sequence with one element per character id.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode the first ten ids back into characters.
for char_id in char_dataset.take(10):
  print(idx2char[char_id.numpy()])

-
 
P
i
v
o
t
a
l
 


In [28]:
# Cut the character stream into chunks of seq_length + 1 ids; a short final chunk is dropped.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode the first five chunks to eyeball the chunking.
for chunk in sequences.take(5):
  decoded_chunk = ''.join(idx2char[chunk.numpy()])
  print(repr(decoded_chunk))

&#39;- Pivotal player in 2009 Pumpkin Patrol; ensured safety of kids in base housing while trick-or-treating--promote!\n- CAT exec for Air show; enabled coord btwn Wg and civ agencies--104K attendees awed by air power display\n- Coordinated six sports activi&#39;
&#39;ties at Gp picnic; boosted morale/fostered camaraderie of 500--promote soonest!\n- Maintained 43 servers; applied 75 vital patches; protected $322K hardware/$97K in software--99% up-time\n- Coordinated migration of C2 Remedy development enclave; facilit&#39;
&#39;ated development on C2 ticket tracking sys\n- Installed 33 applications on testing servers; identified 4 errors--integral to Sq software engineering life cycle\n- Enthusiastic Flt CFC representative; achieved 100% personnel contact--ensured squadron met&#39;
&#39; campaign goals\n- Augmented Security Forces at Scott AFB 2009 airshow--ensured security of 150K spectators &amp; 76 performers\n- Earned 12 credits for Computer Science degree at Southwestern Illinois C

In [29]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so ``target[i]`` is the element that follows
    ``input[i]`` in the chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair using split_input_target.
dataset = sequences.map(split_input_target)

Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0


In [30]:
# Decode the first (input, target) pair: the target should be the input shifted by one character.
for input_example, target_example in dataset.take(1):
  input_chars = ''.join(idx2char[input_example.numpy()])
  target_chars = ''.join(idx2char[target_example.numpy()])
  print ('Input data: ', repr(input_chars))
  print ('Target data:', repr(target_chars))

Input data:  &#39;- Pivotal player in 2009 Pumpkin Patrol; ensured safety of kids in base housing while trick-or-treating--promote!\n- CAT exec for Air show; enabled coord btwn Wg and civ agencies--104K attendees awed by air power display\n- Coordinated six sports activ&#39;
Target data: &#39; Pivotal player in 2009 Pumpkin Patrol; ensured safety of kids in base housing while trick-or-treating--promote!\n- CAT exec for Air show; enabled coord btwn Wg and civ agencies--104K attendees awed by air power display\n- Coordinated six sports activi&#39;


In [31]:
# Walk through the first five time steps: the character id fed in, and the id expected back.
step_pairs = zip(input_example[:5], target_example[:5])
for i, (input_idx, target_idx) in enumerate(step_pairs):
    input_char = repr(idx2char[input_idx])
    target_char = repr(idx2char[target_idx])
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, input_char))
    print("  expected output: {} ({:s})".format(target_idx, target_char))

Step    0
  input: 15 (&#39;-&#39;)
  expected output: 2 (&#39; &#39;)
Step    1
  input: 2 (&#39; &#39;)
  expected output: 50 (&#39;P&#39;)
Step    2
  input: 50 (&#39;P&#39;)
  expected output: 75 (&#39;i&#39;)
Step    3
  input: 75 (&#39;i&#39;)
  expected output: 88 (&#39;v&#39;)
Step    4
  input: 88 (&#39;v&#39;)
  expected output: 81 (&#39;o&#39;)


In [32]:
# Batch size - the number of simultaneous samples to evaluate on each training step.
#  So for a given model step, BATCH_SIZE examples are run through the model at that step, and the results for those
#  BATCH_SIZE examples are compared to their respective "correct answers"; the resulting averaged (or weighted-averaged)
#  delta is used to adjust the model for the next training step.
BATCH_SIZE = 128

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` instead of rebinding `dataset` in
# terms of itself, so re-running this cell does not batch an already-batched dataset.
# Incomplete final batches are dropped so every batch has exactly BATCH_SIZE rows.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

&lt;BatchDataset shapes: ((128, 250), (128, 250)), types: (tf.int32, tf.int32)&gt;

In [33]:
# Number of distinct characters in the corpus; build_model uses it as the
# embedding's input size and as the width of the final Dense layer.
vocab_size = len(vocab)

# Width of the vector each character id is embedded into
embedding_dim = 512

# Number of units in each LSTM layer
rnn_units = 1024

In [34]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level network: Embedding -> LSTM -> LSTM -> Dense.

  Both LSTM layers are stateful and return the full output sequence. The
  final Dense layer has no activation, so it emits one raw score per
  vocabulary entry at every time step. ``batch_size`` is fixed into the
  input shape (``[batch_size, None]``); the sequence length is left open.
  """
  def stateful_lstm():
    # Called once per recurrent layer so both layers share one configuration.
    return tf.keras.layers.LSTM(rnn_units,
                                return_sequences=True,
                                stateful=True,
                                recurrent_initializer='glorot_uniform')

  embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                        batch_input_shape=[batch_size, None])
  layers = [embedding, stateful_lstm(), stateful_lstm(), tf.keras.layers.Dense(vocab_size)]
  return tf.keras.Sequential(layers)

In [35]:
# Build the training model; its batch dimension is fixed to BATCH_SIZE.
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [36]:
# Run one batch through the untrained model to check the output shape.
# The loop variables are kept: later cells reuse input_example_batch,
# target_example_batch and example_batch_predictions.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(128, 250, 112) # (batch_size, sequence_length, vocab_size)


In [37]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the training model.
model.summary()

Model: &quot;sequential_1&quot;
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (128, None, 512)          57344     
_________________________________________________________________
lstm_2 (LSTM)                (128, None, 1024)         6295552   
_________________________________________________________________
lstm_3 (LSTM)                (128, None, 1024)         8392704   
_________________________________________________________________
dense_1 (Dense)              (128, None, 112)          114800    
Total params: 14,860,400
Trainable params: 14,860,400
Non-trainable params: 0
_________________________________________________________________


In [38]:
# Sample one character id per time step from the logits of the first sequence in the batch.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert the result to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
sampled_indices

array([ 45,  49,  58,  97,  75,  63,  92,  14,  43,   9,  19,   5, 102,
        89,  30,  29,  12,  79,  17,  91,   7,  71,  30,  34,  12,  97,
        29,  19,  96,  40, 101,  31,   1,  81,  56,  29,  12,  25,  20,
        46,  56,  21,  98,  37,  33,  72,  29, 101,  28,  67,  81,  96,
        38,  58, 105,  25,   5,  87,   5,  80,  29,  17,  85,  37,  56,
        29,  29,  18,  77,  75,  90,  31,  88,  86,  36,  79, 109,   0,
         9,  61,   1,  20,  22,  44,  12,  57,  53,  25,  62,  73,  55,
        73,  48,  21, 104,  63,  12,  26,  67,  51,  62,  87,   1,  51,
       109, 101,  14,  77,  91,  81,  18,  64,  62,  80,  84,  16,  73,
        50,  27,  87,  11,  74,  71, 106,  87,  40,  54,  95,   3,  19,
        46,  67,  71,  65,  10,  75,  40,  80, 101,  40,  92,  66,  22,
        72,  38,  66,  11,  41,  16,  11,   0,   7,  48,  37,  12,   0,
        19,  28,  99,  74, 110,  38,  17,   2,  13, 108,  45,  64,  34,
        66,   0,  90,  76,  10, 111,  87,  66,   0,  76,  90,  9

In [39]:
# Decode the first input sequence of the batch and the ids sampled from the
# untrained model, to see what the predictions look like before training.
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 &#39;ht line use\n- Installed new engine with minimal descrepencies after 7-level inspection--received excellent rating from QA\n- Skillful mechanic; repaired hydraulic test stand pressure compensator--saved over $2,300 in replacement costs\n- 56 Ways/Means &#39;

Next Char Predictions: 
 &#39;KOX²i]z,I\&#39;1#\u2009w&lt;;*m/y%e&lt;@*²;1°F\u2006=\noV;*72LV3³C?f;\u2006:ao°DX‘7#u#n;/sCV;;0kix=vtBm•\t\&#39;[\n24J*WS7\\gUgN3—]*8aQ\\u\nQ•\u2006,kyo0^\\nr.gP9u)he’uFT~!1Lae_(iFn\u2006Fz`4fD`)G.)\t%NC*\t1:éh…D/ +”K^@`\txj(\U00100bd7u`\tjx²aJ“B}4&quot;&amp;$V1\u2006gkb…•Kk_\t\u2009#O/P,`jDXt)—g&amp;G/Bkt,”#9G\tTBd{‘&gt;g\\uWnIfFcAReERP#&#39;


In [40]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  ``from_logits=True`` is passed because the model's final Dense layer has
  no activation, so its outputs are unnormalised scores.
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels, logits, from_logits=True)

# Evaluate the loss of the untrained model on the example batch and report its mean.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (128, 250, 112)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.7187686


In [41]:
# Configure training: the Adam optimizer with the custom `loss` function.
model.compile(optimizer='adam', loss=loss)

In [42]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; "{epoch}" is a placeholder for the epoch number
# (the latest checkpoint after 30 epochs is reported as "ckpt_30").
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to model.fit; save_weights_only=True stores just the weights, not the full model.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [43]:
# Number of passes over the training dataset
EPOCHS=30

In [44]:
# Train on the first GPU, writing a weights checkpoint through checkpoint_callback.
with tf.device('/GPU:0'):   # '/GPU:1' is my GTX 970 (the numbering is the opposite of what Task Manager lists); with two LSTM layers I think the 970 couldn't handle it.
    history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Executing op DatasetCardinality in device /job:localhost/replica:0/task:0/device:CPU:0
Epoch 1/30
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarH

In [45]:
# Show the path of the most recent checkpoint written during training.
tf.train.latest_checkpoint(checkpoint_dir)

&#39;./training_checkpoints\\ckpt_30&#39;

In [46]:
# Save the trained model (built with batch size BATCH_SIZE) to a single file.
model.save('bullets-lstm.h5')

In [47]:
# For generating text, rebuild the model with a batch size of 1 so that a single
# seed string can be fed through it at a time.

model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the weights from the most recent training checkpoint.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Build with a fixed batch dimension of 1 and an open sequence length.
model.build(tf.TensorShape([1, None]))

Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op RestoreV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RestoreV2 in device /job:localhost/replica:0/task:0/device:CPU:0


In [48]:
# Print the summary of the rebuilt model; the batch dimension should now be 1.
model.summary()

Model: &quot;sequential_2&quot;
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 512)            57344     
_________________________________________________________________
lstm_4 (LSTM)                (1, None, 1024)           6295552   
_________________________________________________________________
lstm_5 (LSTM)                (1, None, 1024)           8392704   
_________________________________________________________________
dense_2 (Dense)              (1, None, 112)            114800    
Total params: 14,860,400
Trainable params: 14,860,400
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Save the batch-size-1 model, with the restored weights, to its own file.
model.save('bullets-lstm_built.h5')

In [50]:
def generate_text(model, start_string, num_generate=1000, temperature=0.5):
  """Generate text one character at a time, seeded with ``start_string``.

  Args:
    model: the stateful model built with batch size 1; it is called with a
      ``(1, n)`` batch of character ids and its states are reset first.
    start_string: non-empty seed text. Every character must be a key of the
      module-level ``char2idx`` mapping.
    num_generate: number of characters to sample after the seed.
    temperature: positive divisor applied to the logits before sampling.
      Low temperatures result in more predictable text, higher temperatures
      in more surprising text.

  Returns:
    ``start_string`` followed by the ``num_generate`` sampled characters.

  Raises:
    ValueError: if ``start_string`` is empty or ``temperature`` is not positive.
    KeyError: if ``start_string`` contains a character missing from ``char2idx``.
  """
  # Validate up front so bad arguments fail with a clear message instead of
  # deep inside the model call.
  if not start_string:
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive, got {!r}'.format(temperature))
  unknown_chars = sorted(set(start_string) - set(char2idx))
  if unknown_chars:
    raise KeyError(
        'start_string has characters outside the vocabulary: {!r}'.format(unknown_chars))

  # Converting our start string to numbers (vectorizing) and adding a batch axis
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

  # Generated characters are collected here and joined at the end
  text_generated = []

  # Here batch size == 1
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # scale the logits, then sample from the categorical distribution;
    # [-1, 0] keeps the sample drawn for the last time step
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # The sampled character becomes the next input; the earlier context is
    # carried by the model's recurrent state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [51]:
print(generate_text(model, start_string=u"-"))

Executing op ExpandDims in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Squeeze in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op RealDiv in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op StridedSlice in device /job:localhost/replica:0/task:0/device:GPU:0
-5kMk&quot;+J4BTVLTTY4B/8BTF/FY07KB/BY0BQBYBUTY&gt;22B7*6********************************XXXXXXXXX=X%VVVV*******************************Tkk&amp;JJVTJVBVBBBOMBUTYVRBBTBYBYBRRYBRRYBRRYBORYBETBYBBYBBYBBYBBBUTYBEBBYBRBYBEBYBYBEEYBLEYEY__BRPQBWYBBUTUBYBUBUBUBUBWLLUABPBUBQUYYBYBPBRVMBPPRBRRVOYROOSYPPRBUTPPRHPR)BTBRNY0MMXBYBUHBBYBUBYBERBYBQBQPLLLYTCBOTVCOPERBYBRYBEREYBYBEYB)B08822k_BB:_888*______96J22H/_99%) FSF&#39;s--sustained digital success
- Authored 1st DOD monitor trng; integrated 27 prsnl in critical procedures--enabled safe alternative storage/increased security posture
- Led DV visit; IG lead for AF&#39;s lrgst CGO mod w/$4M; pursued pers expenses--CSAF/CC coined Excellence Aw

In [52]:
print(generate_text(model, start_string=u"- Led"))


- Led7$7773%&quot;UTDY&amp;&quot; TG&amp;ERDSS? QVI/ICTK sortie msn ready
- Developed QA vehicle fleet; designed/insp&#39;d fleet/gen rqmt--enabled 1st ever PACOM B-1/B2/B-52 joint BAAD msns
- Aced CAF Instructor courses; trained on inspection/user attendees for future success--earned CU w/ 3.8 GPA
- Pursued CCAF degree in Aviation Maint Program--Fit Foreign Military Combined One vacantais and operations; promote!
- Deployed to Korean duty status plat; transmitted 29 vital deployed personnel--enabled 300 base populace and 133 sorties
- Assists w/ managing all aspects of evidence/discipline &amp; unlimited potential--continue to promote!
- Active member of Airman Against Drunk Driving course; increased unit&#39;s critical capabilities by 25%
- Led 15 mbr loading of 10K user accts, enabled 1.5K sorties--spt&#39;d 124 sorties/832 hrs, FY09
- Drove ADPE pgm; analyzed 12K line items/$1.2M directly aided flew 88 AEW mission benefits
- Directed 6 mbr tm; moved 114 tons of cargo in spt of for 

In [53]:
print(generate_text(model, start_string=u"- Led 6 org test event;"))

- Led 6 org test event; 370 parts/$50K--ensured safe transportation for 100 Airmen
- Team lead for aircraft deployer; reduced response time from 12 months--ensured code 1 ATO sorties
- Formulated exer setup; organized 2 STs/$9M of materials--cut failures from 78% to 4.0%
- Restored secure mounts/personnel support daily acft main activation sys; ensured operational readiness--100% accountability
- Assisted crypto inventory; contributed to OMC&#39;s only finalized ACC 2-yr beddown plan; 320 pallets fired--$6.9M in EW assets secured
- Coordinated 216 RISA redistributions; validated part/complex sys release--recouped $25K in AF assets
- Oversaw fund raiser; Guided 10 prsnl thru 4 day council fundraiser--raised $280/118 for American Legion Stores
- Mentored std mil training sememonizations--committed to improving 4-level communication system integrity
- Assumed NCOIC role; lead four exercises/two techs to lead tremendous medical docs--ensured safe operation
- Assists 4 technicians contribut

In [54]:
import json

# Serialize the character -> id mapping as a JSON string.
json.dumps(char2idx)


&#39;{&quot;\\t&quot;: 0, &quot;\\n&quot;: 1, &quot; &quot;: 2, &quot;!&quot;: 3, &quot;\\&quot;&quot;: 4, &quot;#&quot;: 5, &quot;$&quot;: 6, &quot;%&quot;: 7, &quot;&amp;&quot;: 8, &quot;\&#39;&quot;: 9, &quot;(&quot;: 10, &quot;)&quot;: 11, &quot;*&quot;: 12, &quot;+&quot;: 13, &quot;,&quot;: 14, &quot;-&quot;: 15, &quot;.&quot;: 16, &quot;/&quot;: 17, &quot;0&quot;: 18, &quot;1&quot;: 19, &quot;2&quot;: 20, &quot;3&quot;: 21, &quot;4&quot;: 22, &quot;5&quot;: 23, &quot;6&quot;: 24, &quot;7&quot;: 25, &quot;8&quot;: 26, &quot;9&quot;: 27, &quot;:&quot;: 28, &quot;;&quot;: 29, &quot;&lt;&quot;: 30, &quot;=&quot;: 31, &quot;&gt;&quot;: 32, &quot;?&quot;: 33, &quot;@&quot;: 34, &quot;A&quot;: 35, &quot;B&quot;: 36, &quot;C&quot;: 37, &quot;D&quot;: 38, &quot;E&quot;: 39, &quot;F&quot;: 40, &quot;G&quot;: 41, &quot;H&quot;: 42, &quot;I&quot;: 43, &quot;J&quot;: 44, &quot;K&quot;: 45, &quot;L&quot;: 46, &quot;M&quot;: 47, &quot;N&quot;: 48, &quot;O&quot;: 49, &quot;P&quot;: 50, &quot;Q&q

In [55]:
# Serialize the sorted vocabulary list as a JSON string; position i holds the character with id i.
json.dumps(vocab)

&#39;[&quot;\\t&quot;, &quot;\\n&quot;, &quot; &quot;, &quot;!&quot;, &quot;\\&quot;&quot;, &quot;#&quot;, &quot;$&quot;, &quot;%&quot;, &quot;&amp;&quot;, &quot;\&#39;&quot;, &quot;(&quot;, &quot;)&quot;, &quot;*&quot;, &quot;+&quot;, &quot;,&quot;, &quot;-&quot;, &quot;.&quot;, &quot;/&quot;, &quot;0&quot;, &quot;1&quot;, &quot;2&quot;, &quot;3&quot;, &quot;4&quot;, &quot;5&quot;, &quot;6&quot;, &quot;7&quot;, &quot;8&quot;, &quot;9&quot;, &quot;:&quot;, &quot;;&quot;, &quot;&lt;&quot;, &quot;=&quot;, &quot;&gt;&quot;, &quot;?&quot;, &quot;@&quot;, &quot;A&quot;, &quot;B&quot;, &quot;C&quot;, &quot;D&quot;, &quot;E&quot;, &quot;F&quot;, &quot;G&quot;, &quot;H&quot;, &quot;I&quot;, &quot;J&quot;, &quot;K&quot;, &quot;L&quot;, &quot;M&quot;, &quot;N&quot;, &quot;O&quot;, &quot;P&quot;, &quot;Q&quot;, &quot;R&quot;, &quot;S&quot;, &quot;T&quot;, &quot;U&quot;, &quot;V&quot;, &quot;W&quot;, &quot;X&quot;, &quot;Y&quot;, &quot;Z&quot;, &quot;[&quot;, &quot;\\\\&quot;, &quot;]&quot;, &quot