In [13]:
!git clone https://github.com/mixuala/colab_utils.git

import os
import colab_utils.tboard

# set paths
ROOT = %pwd
LOG_DIR = os.path.join(ROOT, 'logs')
print(ROOT, LOG_DIR)

# will install `ngrok`, if necessary
# will create `log_dir` if path does not exist
colab_utils.tboard.launch_tensorboard( bin_dir=ROOT, log_dir=LOG_DIR )


Cloning into 'colab_utils'...
remote: Counting objects: 216, done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 216 (delta 15), reused 40 (delta 12), pack-reused 171[K
Receiving objects: 100% (216/216), 60.19 KiB | 5.02 MiB/s, done.
Resolving deltas: 100% (83/83), done.
/content /content/logs
calling wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip ...
calling unzip ngrok-stable-linux-amd64.zip ...
ngrok installed. path=/content/ngrok
status: tensorboard=False, ngrok=False
tensorboard url= http://c530266d.ngrok.io


'http://c530266d.ngrok.io'

In [14]:
import math
import numpy as np
import tensorflow as tf

# Loads an uppercase dataset.
# - The dataset either uses a specified alphabet, or constructs an alphabet of
#   specified size consisting of most frequent characters.
# - The batches are generated using a sliding window of given size,
#   i.e., for a character, we generate left `window` characters, the character
#   itself and right `window` characters, 2 * `window` +1 in total.
# - The batches can be either generated using `next_batch`+`epoch_finished`,
#   or all data in the original order can be generated using `all_data`.
class Dataset:
    def __init__(self, filename, window, alphabet):
        self._window = window

        # Load the data
        with open(filename, "r", encoding="utf-8") as file:
            self._text = file.read()

        # Create alphabet_map
        alphabet_map = {"<pad>": 0, "<unk>": 1}
        if not isinstance(alphabet, int):
            for index, letter in enumerate(alphabet):
                alphabet_map[letter] = index
        else:
            # Find most frequent characters
            freqs = {}
            for char in self._text:
                char = char.lower()
                freqs[char] = freqs.get(char, 0) + 1

            most_frequent = sorted(freqs.items(), key=lambda item:item[1], reverse=True)
            for i, (char, freq) in enumerate(most_frequent, len(alphabet_map)):
                alphabet_map[char] = i
                if len(alphabet_map) >= alphabet: break

        # Remap input characters using the alphabet_map
        self._lcletters = np.zeros(len(self._text) + 2 * window, np.uint8)
        self._labels = np.zeros(len(self._text), np.bool)
        for i in range(len(self._text)):
            char = self._text[i].lower()
            if char not in alphabet_map: char = "<unk>"
            self._lcletters[i + window] = alphabet_map[char]
            self._labels[i] = self._text[i].isupper()

        # Compute alphabet
        self._alphabet = [""] * len(alphabet_map)
        for key, value in alphabet_map.items():
            self._alphabet[value] = key

        self._permutation = np.random.permutation(len(self._text))

    def _create_batch(self, permutation):
        batch_windows = np.zeros([len(permutation), 2 * self._window + 1], np.int32)
        for i in range(0, 2 * self._window + 1):
            batch_windows[:, i] = self._lcletters[permutation + i]
        return batch_windows, self._labels[permutation]

    def print_results(self, batch_size, network, dataset):
      utf8stdout = open("output_{}.txt".format(dataset), 'w', encoding='utf-8', closefd=True)
      iSum = 0
      #while not self.epoch_finished():
          #windows, labels = self.next_batch(batch_size)
      windows, labels = self.all_data()
      predictions, _ = network.evaluate(dataset, windows, labels)

      for i in range(len(predictions)):
          character = self.text[iSum].lower()
          if (predictions[i] == True):
              character = character.upper()
          print(character, end='', file=utf8stdout)
          iSum += 1

      
    @property
    def alphabet(self):
        return self._alphabet

    @property
    def text(self):
        return self._text

    @property
    def labels(self):
        return self._labels

    def all_data(self):
        return self._create_batch(np.arange(len(self._text)))

    def next_batch(self, batch_size):
        batch_size = min(batch_size, len(self._permutation))
        batch_perm, self._permutation = self._permutation[:batch_size], self._permutation[batch_size:]
        return self._create_batch(batch_perm)

    def epoch_finished(self):
        if len(self._permutation) == 0:
            self._permutation = np.random.permutation(len(self._text))
            return True
        return False


class Network:
    def __init__(self, seed=42):
        # Create an empty graph and a session
        graph = tf.Graph()
        graph.seed = seed
        self.session = tf.Session(graph = graph, config=tf.ConfigProto())

    def construct(self, args):
        with self.session.graph.as_default():
            # Inputs
            self.windows = tf.placeholder(tf.int32, [None, 2 * args.window + 1], name="windows")
            self.labels = tf.placeholder(tf.int32, [None], name="labels") # Or you can use tf.int32
            self.trainingMode = tf.placeholder(tf.bool)

            hot_repre = tf.one_hot(self.windows, args.alphabet_size)

            # Architecture
            flattened_images = tf.layers.flatten(hot_repre, name="flatten")

            hidden_size = math.ceil(7.5 * (args.window * 2 + 1) * args.alphabet_size)
            print("Hidden size: ", hidden_size)

            hidden_layer = tf.layers.dense(flattened_images, hidden_size, activation=tf.nn.relu, name="Relu_1")
            hidden_layer_dropout = tf.layers.dropout(hidden_layer, rate=0.5, training=self.trainingMode, name="dropout_layer")
            hidden_2 = tf.layers.dense(hidden_layer_dropout, math.ceil(hidden_size / 30), activation=tf.nn.relu, name="Relu_2")
            output_layer = tf.layers.dense(hidden_2, 2, activation=None, name="output_layer")
            self.predictions = tf.cast(tf.argmax(output_layer, axis=1, output_type=tf.int32), tf.bool)

            # Training
            loss = tf.losses.sparse_softmax_cross_entropy(self.labels, output_layer, scope="loss")
            global_step = tf.train.create_global_step()
            self.training = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step=global_step, name="training")

            # Summaries
            self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.cast(self.labels, tf.bool), self.predictions), tf.float32))
            summary_writer = tf.contrib.summary.create_file_writer(args.logdir, flush_millis=10 * 1000)
            self.summaries = {}
            with summary_writer.as_default(), tf.contrib.summary.record_summaries_every_n_global_steps(100):
                self.summaries["train"] = [tf.contrib.summary.scalar("train/loss", loss),
                                           tf.contrib.summary.scalar("train/accuracy", self.accuracy)]
            with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
                for dataset in ["dev", "test"]:
                    self.summaries[dataset] = [tf.contrib.summary.scalar(dataset + "/loss", loss),
                                               tf.contrib.summary.scalar(dataset + "/accuracy", self.accuracy)]

            # Initialize variables
            self.session.run(tf.global_variables_initializer())
            with summary_writer.as_default():
                tf.contrib.summary.initialize(session=self.session, graph=self.session.graph)

    def train(self, windows, labels):
        self.session.run([self.training, self.summaries["train"]], {self.windows: windows, self.labels: labels, self.trainingMode: True})

    def evaluate(self, dataset, windows, labels):
        return self.session.run([self.predictions, self.summaries[dataset]], {self.windows: windows, self.labels: labels, self.trainingMode: False})

class Args(object):
  pass

if __name__ == "__main__":
    import argparse
    import datetime
    import os
    import re

    # Fix random seed
    np.random.seed(42)

    # Parse arguments
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--alphabet_size", default=60, type=int, help="Alphabet size.")
    # parser.add_argument("--batch_size", default=1000, type=int, help="Batch size.")
    # parser.add_argument("--epochs", default=1, type=int, help="Number of epochs.")
    # parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    # parser.add_argument("--window", default=3, type=int, help="Size of the window to use.")
    # args = parser.parse_args()
    
    args = Args()
    args.alphabet_size = 60
    args.batch_size = 1000
    args.epochs = 5
    args.window = 4

    # Create logdir name
    args.logdir = "logs/{}-{}-{}".format(
        "./",
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
        ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value) for key, value in sorted(vars(args).items())))
    )
    if not os.path.exists("logs"): os.mkdir("logs") # TF 1.6 will do this by itself

    # Load the data
    train = Dataset("uppercase_data_train.txt", args.window, alphabet=args.alphabet_size)
    dev = Dataset("uppercase_data_dev.txt", args.window, alphabet=train.alphabet)

    # For ReCodex
    test = Dataset("uppercase_data_test.txt", args.window, alphabet=train.alphabet)

    # Construct the network
    network = Network()
    network.construct(args)

    # Train
    for i in range(args.epochs):
        print("Starting epoch ", i)
        while not train.epoch_finished():
            windows, labels = train.next_batch(args.batch_size)
            network.train(windows, labels)

        print("Training finished at ", i)
        dev_windows, dev_labels = dev.all_data()
        res = network.evaluate("dev", dev_windows, dev_labels)
        print("Epoch ", i, ", Dev acc: ", res[1])

    
    dev.print_results(args.batch_size, network, "dev")
    test.print_results(args.batch_size, network, "test")


Hidden size:  4050
Starting epoch  0
Training finished at  0
Epoch  0 , Dev acc:  [True, True]
Starting epoch  1
Training finished at  1
Epoch  1 , Dev acc:  [True, True]
Starting epoch  2
Training finished at  2
Epoch  2 , Dev acc:  [True, True]
Starting epoch  3
Training finished at  3
Epoch  3 , Dev acc:  [True, True]
Starting epoch  4
Training finished at  4
Epoch  4 , Dev acc:  [True, True]


In [1]:
!ls

datalab


In [0]:
from google.colab import files

files.download('output_dev.txt')
files.download('output_test.txt')

In [0]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " I Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

In [10]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving uppercase_data_train.txt to uppercase_data_train.txt
User uploaded file "uppercase_data_train.txt" with length 6536807 bytes


In [0]:
!kill -9 -1
