In [30]:
import json
import pandas
from pandas.io.json import json_normalize
import matplotlib
from matplotlib import pyplot as plt
import tensorflow as tf
import numpy as np

AttributeError: module 'tensorflow' has no attribute 'enable_eager_execution'

In [25]:
tf.executing_eagerly()

True

In [2]:
df = pandas.read_json('/mnt/c/Code/cancer-survival-rates/data/processed/case_studies_clean.json')

In [3]:
df.columns

Index(['disease_type', 'primary_site', 'gender', 'race', 'vital_status',
       'ethnicity', 'days_to_death', 'days_to_birth', 'year_of_birth',
       'cause_of_death', 'year_of_diagnosis', 'age_at_diagnosis',
       'days_to_last_follow_up', 'tumor_grade', 'days_to_recurrence',
       'prior_malignancy', 'major_site'],
      dtype='object')

In [4]:
cleaned = df.loc[(df['major_site'] != 'Other') & ~df['ethnicity'].isin(['not reported']) & ~df['race'].isin(['not reported', 'Unknown', 'unknown', 'not allowed to collect']) & ~df['gender'].isin(['not reported', 'unknown'])].dropna(subset=['race', 'major_site', 'ethnicity', 'gender'])

In [5]:
shuffled = cleaned.reindex(np.random.permutation(cleaned.index))

In [7]:
def split_shuffled(dataset, percent):
    split_on = int(len(dataset) * percent)
    specified_size = dataset.head(split_on)
    the_rest = dataset[split_on:]
    return specified_size, the_rest

In [8]:
train, test = split_shuffled(shuffled, 0.8)
train, val = split_shuffled(train, 0.8)

In [9]:
possible_sites = list(train['major_site'].unique())

In [10]:
tf.one_hot(indices=train['major_site'].apply(lambda x: possible_sites.index(x)), depth=len(possible_sites))

<tf.Tensor: shape=(24404, 25), dtype=float32, numpy=
array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [11]:
def preprocess(dataframe):
    selected_features = dataframe[['gender', 'race', 'ethnicity']]
    selected_targets = dataframe['major_site']
    one_hot_targets = tf.one_hot(indices=selected_targets.apply(lambda x: possible_sites.index(x)), depth=len(possible_sites))
    return selected_features.copy(), selected_targets.copy(), one_hot_targets

In [12]:
training_features, training_targets, training_one_hot = preprocess(train)
validation_features, validation_targets, validation_one_hot = preprocess(val)


In [18]:
type(training_one_hot)

tensorflow.python.framework.ops.EagerTensor

In [133]:
training_features.describe()

Unnamed: 0,gender,race,ethnicity
count,24404,24404,24404
unique,2,6,3
top,female,white,not hispanic or latino
freq,12862,20993,23111


In [120]:
validation_features.describe()

Unnamed: 0,gender,race,ethnicity
count,6099,6099,6099
unique,2,6,3
top,female,white,not hispanic or latino
freq,3147,5300,5805


In [14]:
# going with a simple vocab list since the number of potential values in each of our feature columns is pretty low
def make_one_hot(source_set, column):
    return tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(column, source_set.dropna(subset=[column])[column].unique().tolist()))

In [15]:
def construct_feature_columns():
    race = make_one_hot(cleaned, 'race')
    gender = make_one_hot(cleaned, 'gender')
    ethnicity = make_one_hot(cleaned, 'ethnicity')
    return set([race, gender, ethnicity])

In [35]:
def create_training_input_fn(training_features, training_labels, batch_size=1, num_epochs=None, shuffle=True):
    def _input_fn(num_epocs=None, shuffle=True):
        features = { key: np.array(value) for key, value in dict(training_features).items() }

        ds = tf.data.Dataset.from_tensor_slices((features, training_labels))
        ds = ds.batch(batch_size).repeat(num_epochs)

        if shuffle:
            ds = ds
            
        features, labels  = tf.data.experimental.get_single_element(ds)
        return features, labels

    return _input_fn

In [36]:
def train_linear_classification_model(
    learning_rate,
    steps,
    batch_size,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):

    periods = 10

    steps_per_period = steps // periods
    predict_training_input_fn = create_training_input_fn(training_examples, training_targets, batch_size)
    predict_validation_input_fn = create_training_input_fn(validation_examples, validation_targets, batch_size)
    training_input_fn = create_training_input_fn(training_examples, training_targets, batch_size)

    my_optimizer = tf.optimizers.Adagrad(learning_rate=learning_rate, clipnorm=5.0)
    classifier = tf.estimator.LinearClassifier(
        feature_columns=construct_feature_columns(),
        optimizer=my_optimizer,
        config=tf.estimator.RunConfig(keep_checkpoint_max=1),
        label_vocabulary=possible_sites
    )


    training_errors = []
    validation_errors = []
    for period in range(0, periods):
        classifier.train(input_fn=training_input_fn, steps=steps_per_period)

        training_predictions = list(classifier.predict(input_fn=predict_training_input_fn))
        traingin_probabilities = np.array([item['probabilities'] for item in training_predictions])
        training_pred_class_id = np.array([item['class_ids'][0] for item in training_predictions])
        training_pred_one_hot = tf.keras.utils.tocategorical(training_pred_class_id, 10)
        
        validation_predictions = list(classifier.predict(input_fn=predict_validation_input_fn))
        validation_probabilities = np.array([item['probabilities'] for item in validation_predictions])    
        validation_pred_class_id = np.array([item['class_ids'][0] for item in validation_predictions])
        validation_pred_one_hot = tf.keras.utils.to_categorical(validation_pred_class_id,10)    
        
        # Compute training and validation errors.
        training_log_loss = metrics.log_loss(training_targets, training_pred_one_hot)
        validation_log_loss = metrics.log_loss(validation_targets, validation_pred_one_hot)
        # Occasionally print the current loss.
        print("  period %02d : %0.2f" % (period, validation_log_loss))
        # Add the loss metrics from this period to our list.
        training_errors.append(training_log_loss)
        validation_errors.append(validation_log_loss)
    
    final_predictions = classifier.predict(input_fn=predict_validation_input_fn)
    final_predictions = np.array([item['class_ids'][0] for item in final_predictions])

    accuracy = metrics.accuracy_score(validation_targets, final_predictions)
    print(f'Final accuracy on (validation data): {accuracy}')

    # Output a graph of loss metrics over periods.
    plt.ylabel("LogLoss")
    plt.xlabel("Periods")
    plt.title("LogLoss vs. Periods")
    plt.plot(training_errors, label="training")
    plt.plot(validation_errors, label="validation")
    plt.legend()
    plt.show()

    # Output a plot of the confusion matrix.
    cm = metrics.confusion_matrix(validation_targets, final_predictions)
    # Normalize the confusion matrix by row (i.e by the number of samples
    # in each class).
    cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
    ax = sns.heatmap(cm_normalized, cmap="bone_r")
    ax.set_aspect(1)
    plt.title("Confusion matrix")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.show()

    return classifier


In [37]:
classifier = train_linear_classification_model(
    learning_rate=0.03,
    steps=1000,
    batch_size=30,
    training_examples=training_features,
    training_targets=training_targets,
    validation_examples=validation_features,
    validation_targets=validation_targets)

INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpdufw3v9j', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INF

InvalidArgumentError: Dataset had more than one element.
	 [[node DatasetToSingleElement (defined at <ipython-input-35-e23047346ccb>:11) ]]

Errors may have originated from an input operation.
Input Source operations connected to node DatasetToSingleElement:
 RepeatDataset (defined at <ipython-input-35-e23047346ccb>:6)

Original stack trace for 'DatasetToSingleElement':
  File "/usr/local/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/local/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/traitlets/config/application.py", line 664, in launch_instance
    app.start()
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 563, in start
    self.io_loop.start()
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "/usr/local/lib/python3.7/asyncio/base_events.py", line 538, in run_forever
    self._run_once()
  File "/usr/local/lib/python3.7/asyncio/base_events.py", line 1782, in _run_once
    handle._run()
  File "/usr/local/lib/python3.7/asyncio/events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tornado/ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tornado/ioloop.py", line 743, in _run_callback
    ret = callback()
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tornado/gen.py", line 787, in inner
    self.run()
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tornado/gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 361, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 541, in execute_request
    user_expressions, allow_stdin,
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 300, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2848, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2874, in _run_cell
    return runner(coro)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3051, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3242, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-37-5235e694272e>", line 8, in <module>
    validation_targets=validation_targets)
  File "<ipython-input-36-e943482933e5>", line 29, in train_linear_classification_model
    classifier.train(input_fn=training_input_fn, steps=steps_per_period)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 374, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1164, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1191, in _train_model_default
    input_fn, ModeKeys.TRAIN))
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1028, in _get_features_and_labels_from_input_fn
    self._call_input_fn(input_fn, mode))
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1119, in _call_input_fn
    return input_fn(**kwargs)
  File "<ipython-input-35-e23047346ccb>", line 11, in _input_fn
    features, labels  = tf.data.experimental.get_single_element(ds)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tensorflow_core/python/data/experimental/ops/get_single_element.py", line 70, in get_single_element
    dataset._variant_tensor, **dataset._flat_structure))  # pylint: disable=protected-access
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tensorflow_core/python/ops/gen_dataset_ops.py", line 1084, in dataset_to_single_element
    output_shapes=output_shapes, name=name)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py", line 742, in _apply_op_helper
    attrs=attr_protos, op_def=op_def)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3322, in _create_op_internal
    op_def=op_def)
  File "/home/aclifford/.local/share/virtualenvs/cancer-survival-rates-XPA78zYg/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 1756, in __init__
    self._traceback = tf_stack.extract_stack()
