UFOExperiencer
    Generate your own UFO experience
    
Generating the Model:
    -Pull UFO encounter data from http://www.nuforc.org/webreports.html
    -Create RNN using UFO encounter data
    -Enter the start of an encounter and have an encounter automatically generated
    

In [1]:
import os
import bs4
import pandas as pd
import numpy as np
import requests
from tqdm.notebook import tqdm
import time

import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

In [2]:
# List the physical devices visible to TensorFlow; a GPU entry is expected here.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Local paths used by the scraper and the text pipeline.
report_folder = 'all_reports'
data_index_file = 'data.json'
full_report_file = 'full_report.txt'

# NUFORC endpoints: the event index page sits directly under the base URL.
nuforc_base = 'http://www.nuforc.org/webreports/'
nuforc_event_index_suffix = 'ndxevent.html'
nuforc_event_index = ''.join([nuforc_base, nuforc_event_index_suffix])

# Column order matters: the scraping code addresses these names by position.
data_columns = [
    'Event Date/Time',
    'City',
    'State',
    'Shape',
    'Duration',
    'Report ID',
    'URL',
]

In [None]:
# Create the report folder up front. exist_ok keeps re-runs of this cell
# harmless, while a non-directory already sitting at that path still raises
# instead of being silently ignored.
os.makedirs(report_folder, exist_ok=True)

# NUFORC Data Scrape

In [None]:
def scrape_ufo_data_to_file(index_url, ufo_data_file):
    """
    Scrape basic information about every UFO report listed on the NUFORC website.

    Follows each link found in the first table of the index page, reads the
    rows of the first table on each linked page, and writes one record per row
    (columns as in ``data_columns``) to a JSON file.

    Args:
        index_url: URL of the NUFORC event index page.
        ufo_data_file: Path of the JSON file to write.
    """
    # Collect plain dicts and build the DataFrame once at the end:
    # DataFrame.append copied the whole frame for every row and was removed
    # in pandas 2.0.
    records = []

    req = requests.get(url=index_url)
    index_soup = bs4.BeautifulSoup(req.text, 'html.parser')
    # Go through report blocks linked from the main index
    for block_link in tqdm(index_soup.table.find_all('a')):
        block_url = nuforc_base + block_link['href']
        block_req = requests.get(url=block_url)
        block_soup = bs4.BeautifulSoup(block_req.text, 'html.parser')

        # Record the basic information for each report row
        for row in tqdm(block_soup.table.tbody.find_all('tr'), leave=False):
            report_data = row.find_all('td')
            # The first cell links to the full report; its href doubles as the ID.
            report_href = report_data[0].a['href']
            records.append({
                data_columns[0]: report_data[0].get_text(),
                data_columns[1]: report_data[1].get_text(),
                data_columns[2]: report_data[2].get_text(),
                data_columns[3]: report_data[3].get_text(),
                data_columns[4]: report_data[4].get_text(),
                data_columns[5]: report_href,
                data_columns[6]: nuforc_base + report_href,
            })

    data = pd.DataFrame(records, columns=data_columns)
    data.to_json(ufo_data_file)
    

In [None]:
def scrape_single_report_to_file(report_url, report_filepath):
    """
    Download one report page and save the text of its second table row.

    Nothing is written when the page returns 404 or does not have the expected
    layout (a table with a tbody holding at least two rows), so a single
    malformed page cannot abort a long-running scrape.

    Args:
        report_url: URL of the report page.
        report_filepath: Path of the text file to write (UTF-8).
    """
    report_req = requests.get(url=report_url)
    if report_req.status_code == 404:
        return

    report_soup = bs4.BeautifulSoup(report_req.text, 'html.parser')
    table = report_soup.table
    if table is None or table.tbody is None:
        return

    rows = table.tbody.find_all('tr')
    # The report text is taken from the second row; skip pages that lack it.
    if len(rows) < 2:
        return

    with open(report_filepath, 'w', encoding="utf-8") as report_file:
        report_file.write(rows[1].get_text())

In [None]:
def scrape_ufo_reports_to_folder(report_info_df, base_folder, start_idx=0, end_idx=None):
    """
    Download the reports listed in ``report_info_df`` into ``base_folder``.

    Each 'Report ID' value is split on '/' into a subfolder and a file name;
    the report is saved as ``base_folder/<subfolder>/<name>`` with ".html"
    replaced by ".ufo".

    Args:
        report_info_df: DataFrame holding the 'Report ID' and 'URL' columns.
        base_folder: Folder that receives one subfolder per report group.
        start_idx: Row position to start from; lets an interrupted scrape
            resume without editing this function.
        end_idx: Row position to stop before; defaults to the number of rows.
    """
    # Set up subfolders to hold report files
    report_ids = report_info_df[data_columns[5]]
    for subfolder in {report_id.split('/')[0] for report_id in report_ids}:
        os.makedirs(os.path.join(base_folder, subfolder), exist_ok=True)

    # Run report scraper for each URL in report information dataframe
    if end_idx is None:
        end_idx = report_info_df.shape[0]
    progress_bar = tqdm(range(start_idx, end_idx))
    for row_idx in progress_bar:
        row = report_info_df.iloc[row_idx]
        path_info = row.loc[data_columns[5]].split('/')
        path_info[1] = path_info[1].replace(".html", ".ufo")
        progress_bar.set_description("Folder:" + path_info[0])
        report_url = row.loc[data_columns[6]]
        report_filepath = os.path.join(base_folder, *path_info)
        scrape_single_report_to_file(report_url, report_filepath)


## Run Data Scrape
Running the next cell will perform the data scrape on the NUFORC website (it takes a long time)

In [None]:
# When done, rescrape the first ~35000

# scrape_ufo_data_to_file(nuforc_event_index, data_index_file)
#report_info_df = pd.io.json.read_json(data_index_file)
#print(report_info_df.iloc[14314 + 20769 + 1865 + 50200 + 7500 + 16800])
#scrape_ufo_reports_to_folder(report_info_df, report_folder)

Running the previous cell should scrape the entire NUFORC database website (it takes a LONG time).

# RNN Text Generation

TensorFlow is used to generate text with a recurrent neural network (RNN) trained on the collected UFO reports. The RNN generates text one character at a time.

Based on tutorial from: https://www.tensorflow.org/text/tutorials/text_generation

## Report Preprocessing

In [None]:

def concatenate_reports():
    """
    Append the text of every downloaded report to the combined report file.

    Walks ``report_folder`` recursively and appends each file's contents to
    ``full_report_file``. Files that are not valid UTF-8 are reported on
    stdout and skipped.

    NOTE: the output is opened in append mode, so running this more than once
    appends the whole corpus again; remove ``full_report_file`` first for a
    clean rebuild.
    """
    # Open the output once instead of once per directory.
    with open(full_report_file, 'a', encoding='utf-8') as full_report:
        for root, _dirs, files in os.walk(report_folder):
            for filename in files:
                report_path = os.path.join(root, filename)
                try:
                    # The context manager closes the handle even when decoding
                    # fails, which the previous open()/close() pair did not.
                    with open(report_path, 'r', encoding='utf-8') as report:
                        report_text = report.read()
                except UnicodeDecodeError:
                    print('Decode error. Skipping file: ', report_path)
                    continue
                full_report.write(report_text)

In [None]:
# Build the combined corpus file from the downloaded reports.
concatenate_reports()

In [None]:
def text_from_ids(ids):
    """Convert ids to a text string by joining the looked-up characters along the last axis."""
    looked_up_chars = chars_from_ids(ids)
    return tf.strings.reduce_join(looked_up_chars, axis=-1)

In [None]:
# Create character vocabulary, char->id lookup, and id->char lookup.
# Read through a context manager so the file handle is closed promptly.
with open(full_report_file, 'r', encoding='utf-8') as report_file:
    all_text = report_file.read()
vocab = sorted(set(all_text))
print(vocab)
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab), mask_token=None)
# The inverse lookup reuses the forward layer's vocabulary so ids map back consistently.
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)


# Create sequences of contiguous characters with the specified length
seq_length = 100
all_ids = ids_from_chars(tf.strings.unicode_split(all_text, 'UTF-8'))
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
# Each sequence holds seq_length + 1 ids (input plus the one-step-shifted
# target); drop_remainder discards the shorter tail.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Create dataset of (inputs, labels)
def split_input_target(sequence):
    """Return (input, target): the sequence without its last element, and without its first."""
    return sequence[:-1], sequence[1:]
# Pair every sequence with its one-step-shifted target.
dataset = sequences.map(split_input_target)

# Sanity check: show one (input, target) pair decoded back to text.
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())


In [None]:
# Shuffle the (input, target) pairs, then group them into fixed-size batches
BATCH_SIZE = 64
BUFFER_SIZE = 10000  # buffer size handed to Dataset.shuffle
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
dataset

## RNN Model Design

In [None]:
# Number of distinct characters in the corpus. Not referenced again in the
# visible notebook: the model is sized from ids_from_chars.get_vocabulary().
vocab_len = len(vocab)
# Output dimension passed to the Embedding layer
embedding_dim = 256
# Number of units passed to the GRU layer
rnn_units = 1024

class ReportGenModel(tf.keras.Model):
    """Character-level text model: embedding -> GRU -> dense logits over the vocabulary."""

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """
        Args:
            vocab_size: Number of ids; used for the embedding input size and
                the width of the output logits.
            embedding_dim: Output dimension of the embedding layer.
            rnn_units: Number of GRU units.
        """
        # super() already binds self; passing it again handed the base
        # initializer a stray positional argument.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       return_sequences=True,
                                       return_state=True)
        # No activation: the layer outputs raw logits.
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """
        Run the model on a batch of id sequences.

        Args:
            inputs: Batch of token ids.
            states: GRU state to start from; the GRU's initial state is used
                when None.
            return_state: When True, also return the final GRU state.
            training: Forwarded to each layer.

        Returns:
            The logits, or ``(logits, states)`` when ``return_state`` is True.
        """
        x = self.embedding(inputs, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        return x
    
    

In [None]:
# Build the model; the vocabulary size is taken from the lookup layer so it
# matches the ids that layer emits.
model = ReportGenModel(
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
# Run a single batch through the model to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

In [None]:
# Show the model's layers and parameter counts.
model.summary()

In [None]:
# Sample one next-character id per position from the first example's logits
first_example_logits = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_example_logits, num_samples=1),
    axis=-1,
).numpy()

# Decode both the input and the sampled ids back to text for a side-by-side look
input_preview = text_from_ids(input_example_batch[0]).numpy()
prediction_preview = text_from_ids(sampled_indices).numpy().decode('utf-8')
print("Input:\n", input_preview)
print()
print("Next Char Predictions:\n", prediction_preview)

In [None]:
# from_logits=True because the model's final Dense layer has no activation.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
# Evaluate the loss on the example batch as a sanity check.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)
# Bare expression: exp(mean loss) is computed but, not being the cell's last
# statement, it is neither stored nor displayed.
tf.exp(mean_loss).numpy()

model.compile(optimizer='adam', loss=loss)
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints_RNN'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Only the weights are written to each checkpoint (save_weights_only=True).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
# Number of passes over the dataset
EPOCHS = 10
# Debug aid: show the dataset type before training
print(type(dataset))

# NOTE:
# In order to get the following line to function, I had to change one of the installed tensorflow files
# As described here: 
#    https://stackoverflow.com/questions/66373169/tensorflow-2-object-detection-api-numpy-version-errors/66486051#66486051
# and here:
#    https://github.com/tensorflow/models/issues/9706
# The following file was changed:
#    D:\anaconda3\envs\tensorflow_env\Lib\site-packages\tensorflow\python\ops\array_ops.py
# Changes:
#  -Added to the top of file:
#    from tensorflow.python.ops.math_ops import reduce_prod
#  -Changed in function def _constant_if_small(value, shape, dtype, name):
#    from np.prod to reduce_prod

# Train the model; checkpoint_callback handles the checkpoint files.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

In [None]:
class OneStep(tf.keras.Model):
    """Wraps a model and its lookup layers to generate text one character per call."""

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        """
        Args:
            model: Model called with ``inputs``, ``states`` and ``return_state``.
            chars_from_ids: Lookup layer mapping ids to characters.
            ids_from_chars: Lookup layer mapping characters to ids.
            temperature: Divisor applied to the logits before sampling.
        """
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Additive mask that keeps "[UNK]" from being generated: -inf at the
        # id of each banned token, sized to match the vocabulary.
        unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
        vocab_size = len(ids_from_chars.get_vocabulary())
        unk_mask = tf.SparseTensor(
            values=[-float('inf')] * len(unk_ids),
            indices=unk_ids,
            dense_shape=[vocab_size])
        self.prediction_mask = tf.sparse.to_dense(unk_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """
        Sample the next character for each input string.

        Args:
            inputs: Batch of strings to continue.
            states: Model state from the previous step, or None.

        Returns:
            ``(predicted_chars, states)``: the sampled characters and the
            model state to pass to the next call.
        """
        # Strings -> characters -> token ids.
        chars = tf.strings.unicode_split(inputs, 'UTF-8')
        ids = self.ids_from_chars(chars).to_tensor()

        # logits has shape [batch, char, next_char_logits]
        logits, states = self.model(inputs=ids, states=states,
                                    return_state=True)

        # Keep only the last position, apply the temperature, then mask "[UNK]".
        last_logits = logits[:, -1, :]
        last_logits = last_logits / self.temperature
        last_logits = last_logits + self.prediction_mask

        # Sample one token id per batch entry and map it back to a character.
        sampled_ids = tf.random.categorical(last_logits, num_samples=1)
        sampled_ids = tf.squeeze(sampled_ids, axis=-1)
        return self.chars_from_ids(sampled_ids), states

## Generate Text

In [None]:
# Sampling wrapper around the model, with temperature 0.8
one_step_model = OneStep(model, chars_from_ids, ids_from_chars, 0.8)

start = time.time()
states = None
# The seed text is the first chunk; every later chunk is one sampled character
next_char = tf.constant(['Bright light'])
result = [next_char]

for step in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

result = tf.strings.join(result)
end = time.time()
generated_text = result[0].numpy().decode('utf-8')
print(generated_text, '\n\n' + '_'*80)
print('\nRun time:', end - start)

# Transformer Model

In [4]:
%load_ext autoreload
%autoreload 2

import os

# base_classes must exist as a package for the base_classes.* imports used
# by the transformer cells.
os.makedirs('base_classes', exist_ok=True)
try:
    # Mode 'x' creates the marker file only when it is missing; the context
    # manager closes the handle instead of leaving it open.
    with open('base_classes/__init__.py', 'x'):
        pass
except FileExistsError:
    pass

In [5]:
#https://www.tensorflow.org/text/tutorials/transformer



import collections
import logging
import pathlib
import re
import string
import sys
import time
from tqdm.notebook import tqdm, trange

import numpy as np
import tensorflow as tf
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
import base_classes.transformer as trfrm
from base_classes.reportGenerator import ReportGenerator
from base_classes.transformer import download_vocabulary, tokenize, pure_tokenize, detokenize, start_token, create_partition_batches, MAX_REPORT_LENGTH, build_transformer

logging.getLogger('tensorflow').setLevel(logging.ERROR)  # suppress warnings

Latest checkpoint restored!!
Latest checkpoint restored!!


## Tokenizer
The combined report file is also fed to a tokenizer routine that builds a BertTokenizer vocabulary file. This vocabulary is specific to the reports in this dataset.

In [6]:
# download_vocabulary()

# Arguments for `text.BertTokenizer`
bert_tokenizer_params = {"lower_case": True}
# Reserved tokens that must be included in the vocabulary
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = {
    # The target vocabulary size
    "vocab_size": 8000,
    "reserved_tokens": reserved_tokens,
    "bert_tokenizer_params": bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    "learn_params": {},
}

def create_bert_vocab():
    """
    Learn a vocabulary from the combined report file and write it to disk.

    Splits ``full_report_file`` on blank lines, passes the pieces (in batches
    of 1000) to ``bert_vocab_from_dataset`` with ``bert_vocab_args``, and
    writes the resulting tokens, one per line, to "ufo_vocab.txt".
    """
    with open(full_report_file, "r", encoding='utf-8') as reports:
        print("Beginning file split")
        report_list = reports.read().split("\n\n")
    print("Beginning Tensor Creation")
    report_tensor = tf.constant(report_list)
    print(report_tensor)
    print("Beginning dataset creation")
    ds = tf.data.Dataset.from_tensor_slices(report_tensor)
    print("Creating the BERT vocab")
    # Named learned_vocab so it does not shadow the module-level character `vocab`.
    learned_vocab = bert_vocab.bert_vocab_from_dataset(
        ds.batch(1000).prefetch(2),
        **bert_vocab_args
    )
    print("Writing vocab to file")
    with open("ufo_vocab.txt", 'w', encoding="utf-8") as f:
        for token in learned_vocab:
            print(token, file=f)

In [None]:
# Learn the vocabulary and write it to ufo_vocab.txt.
create_bert_vocab()

In [6]:

tokens = pure_tokenize(["Hello there, my devoted chosen.", "Don't'nt."])
print(tokens)

# The scraper writes report files as UTF-8, so read them back with the same
# encoding (the platform default may differ) and close each handle.
sample_reports = []
for sample_path in ("./all_reports/000/S00111.ufo", "./all_reports/000/S00113.ufo"):
    with open(sample_path, "r", encoding="utf-8") as sample_file:
        sample_reports.append(sample_file.read())

# Exercises the tokenizer on real reports; the result is replaced just below.
tokens = tokenize(["", *sample_reports])
tokens = tokenize(tf.constant(["", "", ""]))
print(detokenize([[2]]))
print(tokens.to_list())
print(detokenize(tokens))
print(trfrm.BATCH_SIZE)
print(MAX_REPORT_LENGTH)

<tf.RaggedTensor [[[3087], [148], [15], [121], [1487, 5900, 1226], [5621, 584], [17]], [[401], [10], [61], [10], [2504], [17]]]>
<tf.RaggedTensor [[b'[START]']]>
[[2, 3], [2, 3], [2, 3]]
<tf.RaggedTensor [[b'[START]', b'[END]'], [b'[START]', b'[END]'], [b'[START]', b'[END]']]>
8
1500


## Dataset
Now the dataset object can be created and saved to disk.

In [7]:

def dataset_from_allreport_file():
    """
    Build the (input, report) token dataset from the combined report file.

    The file is split on blank lines into reports. Every report is paired
    with an empty input string; both sides are tokenized, and the report side
    is converted to a dense tensor of shape (dataset_size, MAX_REPORT_LENGTH).

    Returns:
        ``(dataset, dataset_size)``.
    """
    with open(full_report_file, "r", encoding='utf-8') as reports:
        print("Beginning file split")
        report_list = reports.read().split("\n\n")
    # One empty input string per report.
    input_list = [""] * len(report_list)

    print("Beginning Tensor Creation")
    dataset_size = len(input_list)
    input_tensor = tokenize(input_list).to_tensor()
    report_shape = (dataset_size, MAX_REPORT_LENGTH)
    report_tensor = tokenize(report_list).to_tensor(shape=report_shape)

    print("Beginning dataset creation")
    token_pairs = tf.data.Dataset.from_tensor_slices((input_tensor, report_tensor))
    return token_pairs, dataset_size

def estimate_avg_report_length():
    """
    Print the word counts of the longest reports and the average report length.

    Reports are the blank-line-separated pieces of ``full_report_file``; words
    are whitespace-separated tokens. Prints up to 250 of the largest word
    counts in descending order, the total word count, and the mean words per
    report.
    """
    import heapq  # local import: only this diagnostic needs it

    with open(full_report_file, "r", encoding='utf-8') as reports:
        print("Beginning file split")
        report_list = reports.read().split("\n\n")

    report_lens = [len(report.split()) for report in report_list]
    k = 250
    # nlargest returns the real top-k lengths in one pass. The previous loop
    # overwrote each maximum with 100, which cost O(k*n) and emitted bogus 100
    # entries once fewer than k reports were longer than that.
    top_k_lens = heapq.nlargest(k, report_lens)
    print(str(k), " Longest Reports: ", top_k_lens)

    total_words = sum(report_lens)
    avg_words_per_report = total_words / len(report_list)
    print("Total words: ", total_words)
    print("Avg Report Length: ", avg_words_per_report)
    

def create_and_save_data_batches(train_split=0.7, val_split=0.2, test_split=0.1):
    """
    Build the dataset, partition it into batches and save each partition.

    The unprocessed dataset is saved and then reloaded from disk before being
    passed to ``create_partition_batches``; the training, validation and test
    batches are written under "data/full/".

    Args:
        train_split: Forwarded to ``create_partition_batches``.
        val_split: Forwarded to ``create_partition_batches``.
        test_split: Forwarded to ``create_partition_batches``.
    """
    raw_dataset, raw_size = dataset_from_allreport_file()
    save_unprocessed_dataset(raw_dataset, raw_size)
    # Continue from the copy on disk rather than the in-memory dataset.
    dataset, ds_size = load_unprocessed_dataset()

    print("Beginning Dataset shuffling, partitioning, and batching")
    train_batches, val_batches, test_batches = create_partition_batches(
        dataset, ds_size, train_split, val_split, test_split)

    save_plan = (
        (train_batches, "data/full/training_batches"),
        (val_batches, "data/full/validation_batches"),
        (test_batches, "data/full/test_batches"),
    )
    for batches, target_path in save_plan:
        tf.data.experimental.save(batches, target_path)
    
def load_data_batches():
    """Load the saved training, validation and test batches, returned in that order."""
    batch_paths = (
        "data/full/training_batches",
        "data/full/validation_batches",
        "data/full/test_batches",
    )
    train_batches, val_batches, test_batches = [
        tf.data.experimental.load(batch_path) for batch_path in batch_paths
    ]
    return train_batches, val_batches, test_batches

def save_unprocessed_dataset(ds, ds_size):
    """
    Save the unprocessed dataset and record its size next to it.

    Args:
        ds: Dataset written to "data/full/unprocessed".
        ds_size: Element count, stored as text in
            "data/full/unprocessed_dataset_size.txt".
    """
    print("Saving unprocessed dataset of size ", ds_size, " at data/full/unprocessed")
    tf.data.experimental.save(ds, "data/full/unprocessed")
    # Context manager so the size is flushed and the handle closed right away.
    with open("data/full/unprocessed_dataset_size.txt", "w") as size_file:
        size_file.write(str(ds_size))
    
def load_unprocessed_dataset():
    """
    Load the unprocessed dataset and its recorded size.

    Returns:
        ``(ds, ds_size)``: the dataset loaded from "data/full/unprocessed" and
        the integer read from "data/full/unprocessed_dataset_size.txt".
    """
    ds = tf.data.experimental.load("data/full/unprocessed")
    # Context manager so the handle is closed instead of leaked.
    with open("data/full/unprocessed_dataset_size.txt", "r") as size_file:
        ds_size = int(size_file.read())
    print("Loaded unprocessed dataset of size ", ds_size, " from data/full/unprocessed")
    return ds, ds_size

    

In [8]:

# One-off steps, left disabled: corpus length statistics and (re)building the
# saved batches.
# estimate_avg_report_length()

# create_and_save_data_batches()

# Load the saved partitions and show one training batch: the raw inputs and
# the detokenized targets.
train, val, test = load_data_batches()
for batch, tar in train.take(1):
    print(batch)
    print(detokenize(tar))

tf.Tensor(
[[2 3]
 [2 3]
 [2 3]
 [2 3]
 [2 3]
 [2 3]
 [2 3]
 [2 3]], shape=(8, 2), dtype=int32)
<tf.RaggedTensor [[b'[START]', b'the', b'craft', b'was', b'a', b'light', b'blue', b'shiny', b'sphere', b'frisbee', b'looking', b'object', b'.', b'it', b'flew', b'left', b'to', b'right', b'a', b'couple', b'of', b'times', b'then', b'it', b'flew', b'in', b'circles', b'and', b'then', b'it', b'just', b'stopped', b'and', b'disappeared', b'.', b'it', b'was', b'sometime', b'in', b'the', b'afternoon', b'.', b'my', b'friend', b'and', b'i', b'saw', b'something', b'wierd', b'in', b'the', b'sky', b'.', b'it', b'was', b'a', b'light', b'blue', b'frisbee', b'looking', b'object', b'.', b'it', b'had', b'some', b'kind', b'of', b'shiny', b'force', b'field', b'around', b'it', b'.', b'the', b'object', b'kept', b'flying', b'around', b'in', b'the', b'same', b'area', b'going', b'left', b'to', b'right', b'and', b'so', b'on', b'like', b'it', b'was', b'looking', b'for', b'something', b'.', b'then', b'it', b'started', b

## Training
The transformer can now begin training on the training dataset.

In [12]:
# Requires the cell that defines load_data_batches to have been run in this
# kernel session; otherwise this raises NameError.
train, val, test = load_data_batches()
# Training is delegated to the base_classes.transformer module.
trfrm.continue_training(train)

NameError: name 'load_data_batches' is not defined

In [8]:
# Use the transformer instance exposed by base_classes.transformer.
transformer = trfrm.transformer
reportGenerator = ReportGenerator(transformer)
# for _ in range(10):
# Generate one report; the argument is presumably the maximum report length
# (generate_report_library passes max_report_len here) - confirm in ReportGenerator.
reportGenerator.generate_report(4000)

'Low flying extremely low ufo over the snovapolic. At around 18. 30 to 25pm on January 6th, while sitting in my garage smoking a cigarette when I noticed extremely bright light. I was facing east I looked at it and the sky was still clear and clear. The light was not moving, so still, moving very slowly and not straight, it never faded or came back.'

In [9]:
def generate_report_library(num_reports=10, max_report_len=100):
    """
    Generate several reports with a single ReportGenerator.

    Args:
        num_reports: How many reports to generate.
        max_report_len: Value passed to ``generate_report`` for each report.

    Returns:
        List of the generated reports, in generation order.
    """
    generator = ReportGenerator(transformer)
    return [
        generator.generate_report(max_report_len)
        for _ in tqdm(range(num_reports))
    ]

def write_reports_to_file(report_list, filepath):
    """
    Write each report to ``filepath`` (UTF-8), each followed by a blank line.

    Any existing file at ``filepath`` is overwritten.
    """
    separator = "\n\n"
    with open(filepath, "w", encoding="utf-8") as out_file:
        for report_text in report_list:
            out_file.write(report_text + separator)

In [10]:
book_file = "./output/book/ufo_book2.txt"
# makedirs also creates the parent "output" directory; os.mkdir raised
# FileNotFoundError when it was missing. exist_ok keeps re-runs harmless.
os.makedirs('output/book', exist_ok=True)


# Generate 200 reports with max_report_len=500 and write them to the book file
reports = generate_report_library(200, 500)
write_reports_to_file(reports, book_file)

  0%|          | 0/200 [00:00<?, ?it/s]

In [145]:
small_report_file = "./output/small/allsmall.txt"
med_report_file = "./output/med/allmed.txt"
long_report_file = "./output/long/alllong.txt"
# makedirs also creates the parent "output" directory; os.mkdir raised
# FileNotFoundError when it was missing. exist_ok keeps re-runs harmless.
for output_dir in ('output/small', 'output/med', 'output/long'):
    os.makedirs(output_dir, exist_ok=True)

# Generation runs, currently disabled:
# small_reports = generate_report_library(100, 100)
# med_reports = generate_report_library(100, 500)
# long_reports = generate_report_library(100, 1000)

# write_reports_to_file(small_reports, small_report_file)
# write_reports_to_file(med_reports, med_report_file)
# write_reports_to_file(long_reports, long_report_file)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [146]:
mega_report_file = "./output/mega/allmega.txt"
supermega_report_file = "./output/supermega/allsupermega.txt"
# makedirs also creates the parent "output" directory; os.mkdir raised
# FileNotFoundError when it was missing. exist_ok keeps re-runs harmless.
for output_dir in ('output/mega', 'output/supermega'):
    os.makedirs(output_dir, exist_ok=True)


# Five reports per size class, with max_report_len 6000 and 15000 respectively
mega_reports = generate_report_library(5, 6000)
super_mega_reports = generate_report_library(5, 15000)

write_reports_to_file(mega_reports, mega_report_file)
write_reports_to_file(super_mega_reports, supermega_report_file)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [13]:
ultramega_report_file = "./output/ultramega/allultramega.txt"

# makedirs also creates the parent "output" directory; os.mkdir raised
# FileNotFoundError when it was missing. exist_ok keeps re-runs harmless.
os.makedirs('output/ultramega', exist_ok=True)

# Five reports with max_report_len=70000
ultra_mega_reports = generate_report_library(5, 70000)

write_reports_to_file(ultra_mega_reports, ultramega_report_file)

  0%|          | 0/5 [00:00<?, ?it/s]