# **Exploring the natality dataset**

In [0]:
import os

# Cloud settings used throughout the notebook.
BUCKET = 'crawles-sandbox'
PROJECT = 'crawles-sandbox'
REGION = 'us-central1'

# Mirror the settings into the process environment so the %%bash cell
# below can read them as ${BUCKET} / ${REGION}.
os.environ.update({
    'BUCKET': BUCKET,
    'PROJECT': PROJECT,
    'REGION': REGION,
})


In [0]:
%%bash
# Create the bucket only if it does not show up in the bucket listing.
# BUCKET and REGION are exported from Python in the configuration cell.
if ! gsutil ls | grep -q "gs://${BUCKET}/"; then
     # -l sets the bucket location; the bucket URL needs the gs:// scheme.
     gsutil mb -l "${REGION}" "gs://${BUCKET}"
fi

In [0]:
# Base query: the label, the four model inputs, and a hash of (year, month).
# The hash column is named hashmonth so it carries the same name as in the
# train/eval queries further down instead of an auto-generated column name.
# The string ends with a newline so that "LIMIT ..." can be appended directly.
query = """
SELECT weight_pounds, is_male, mother_age, plurality,
gestation_weeks, ABS(FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING)))) AS hashmonth
FROM 
publicdata.samples.natality
WHERE year > 2000
"""

In [0]:
# Run the base query in BigQuery (capped at 100 rows) and preview the
# first ten rows of the resulting DataFrame.
import google.datalab.bigquery as bq
preview_job = bq.Query(query + "LIMIT 100").execute()
df = preview_job.result().to_dataframe()
df.head(10)

In [0]:
# Helper: for each distinct value of a column, count the records and
# average the baby weight.
def get_distinct_values(column_name):
  """Return per-value record counts and mean weight for one natality column.

  Args:
    column_name: name of a column in publicdata.samples.natality. It is
      spliced into the SQL text as-is, so only pass trusted, literal
      column names (identifiers cannot be bound as query parameters).

  Returns:
    A pandas DataFrame with the columns <column_name>, num_babies and
    avg_wt, one row per distinct value, restricted to year > 2000.
  """
  sql = """
  SELECT {0}, COUNT(1) AS num_babies,
  AVG(weight_pounds) AS avg_wt
  FROM
  publicdata.samples.natality
  WHERE year > 2000
  GROUP BY {0}""".format(column_name)
  # to_dataframe must be called; returning the bound method would break
  # every df.plot(...) / df.sort_values(...) call made on the result.
  return bq.Query(sql).execute().result().to_dataframe()

In [0]:
# Bar charts by sex of the baby: number of babies, then average weight.
df = get_distinct_values('is_male')
df.plot.bar(x='is_male', y='num_babies');
df.plot.bar(x='is_male', y='avg_wt')



In [0]:
# Same analysis by mother's age, sorted so the line plots read left to right.
df = get_distinct_values('mother_age')
df = df.sort_values('mother_age')
# The frame holds mother_age / num_babies / avg_wt, so mother_age is the
# x axis; there is no is_male column in this result.
df.plot(x = 'mother_age', y = 'num_babies');
df.plot(x = 'mother_age', y = 'avg_wt');


In [0]:
# Number of babies (log scale) and average weight by plurality.
df = get_distinct_values('plurality')
df = df.sort_values('plurality')
df.plot(x = 'plurality', y = 'num_babies', logy = True, kind = 'bar')
# kind takes the string 'bar'; a bare name bar is undefined.
df.plot(x = 'plurality', y = 'avg_wt', kind = 'bar')


In [0]:
# Number of babies (log scale) and average weight by gestation length.
df = get_distinct_values('gestation_weeks')
df = df.sort_values('gestation_weeks')
df.plot(x = 'gestation_weeks', y = 'num_babies', logy = True, kind = 'bar')
df.plot(x = 'gestation_weeks', y = 'avg_wt', kind = 'bar')

# BUILD MACHINE LEARNING MODEL

# Create ML dataset by sampling using BigQuery

In [1]:
"""sample the dataset so that you have approximately 12,000 training examples and 3000 evaluation examples. the training and evaluation datasets have
to be well-distributed"""

'sample the dataset so that you have approximately 12,000 training examples and 3000 evaluation examples. the training and evaluation datasets have\nto be well-distributed'

In [0]:
import google.datalab.bigquery as bq
# Training sample: rows with no missing label/inputs whose (year, month)
# hash falls in buckets 0-7 of 10, thinned further with RAND().
# All row filters sit inside the sub-query; the outer WHERE only does the
# hash split and the random down-sampling.
train_query = """
SELECT * FROM (
SELECT weight_pounds, is_male, mother_age, plurality,
gestation_weeks, ABS(FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING)))) AS hashmonth
FROM
publicdata.samples.natality
WHERE year > 2000 AND is_male IS NOT NULL AND plurality IS NOT NULL
AND weight_pounds IS NOT NULL AND gestation_weeks IS NOT NULL )
WHERE MOD(ABS(hashmonth),10) < 8 AND RAND() < 0.0004"""


In [0]:
# Execute the training query and pull the sampled rows into pandas.
train_job = bq.Query(train_query).execute()
df = train_job.result().to_dataframe()

## SIMULATE LACK OF ULTRASOUND



In [0]:
import copy  # kept: the evaluation cell further down calls copy.deepcopy
# Independent copy of the training rows with the sex of the baby hidden.
df2 = df.copy(deep=True).assign(is_male='Unknown')

In [0]:
def plurality(df, column_name):
  """Replace numeric plurality codes in df[column_name] with text labels, in place.

  The labels are the ones get_cols() lists as the vocabulary of the
  'plurality' feature, so recoded values are recognised by the model.

  Args:
    df: pandas DataFrame to modify; it is changed in place.
    column_name: name (string) of the column holding the numeric codes.

  Returns:
    None. Codes 1-5 become their named label; any other value (including
    missing values) becomes 'Multiple(2+)'.
  """
  labels = {
      1: 'Single(1)',
      2: 'Twins(2)',
      3: 'Triplets(3)',
      4: 'Quadruplets(4)',
      5: 'Quintuplets(5)',
  }
  # Recode row by row. df[column_name] looks the column up by the name
  # passed in; df.column_name would look for a column literally called
  # "column_name".
  df[column_name] = [labels.get(code, 'Multiple(2+)') for code in df[column_name]]

In [0]:
# Recode the plurality column of the "no ultrasound" copy. The column is
# passed by name as a string; a bare is_male is an undefined variable, and
# is_male is not the column that holds the plurality codes.
plurality(df2, 'plurality')

In [0]:
import pandas as pd  # local import: pandas is not imported by the cells above

# Stack the original rows and the "no ultrasound" copies row-wise. merge()
# is a join on the shared columns, not a row-wise append.
# NOTE(review): df's plurality column is written as queried, while df2 is
# passed through plurality() - confirm whether df should be recoded too.
train_df = pd.concat([df, df2], ignore_index=True)
# 'train.csv' is the file name train_and_evaluate() reads.
train_df.to_csv('train.csv', index=False, header=False)


In [0]:
# Evaluation sample: same row filters as the training query, but taken
# from hash buckets 8-9 so no (year, month) group appears in both the
# training and the evaluation set.
# NOTE(review): the 0.0001 sampling rate is kept as written; confirm it
# yields the ~3000 evaluation rows targeted earlier in the notebook.
eval_query = """
SELECT * FROM (
SELECT weight_pounds, is_male, mother_age, plurality,
gestation_weeks, ABS(FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING)))) AS hashmonth
FROM
publicdata.samples.natality
WHERE year > 2000 AND is_male IS NOT NULL AND plurality IS NOT NULL
AND weight_pounds IS NOT NULL AND gestation_weeks IS NOT NULL )
WHERE MOD(ABS(hashmonth),10) >= 8 AND RAND() < 0.0001"""

In [0]:
# Execute the evaluation query and pull the sampled rows into pandas.
eval_job = bq.Query(eval_query).execute()
eval_df = eval_job.result().to_dataframe()

In [0]:
# "No ultrasound" copy of the evaluation rows: sex hidden, plurality recoded.
eval_df2 = copy.deepcopy(eval_df)
eval_df2['is_male'] = 'Unknown'
# Pass the column name as a string; a bare is_male is an undefined variable
# and is not the column that holds the plurality codes.
plurality(eval_df2, 'plurality')


In [0]:
import pandas as pd  # local import: pandas is not imported by the cells above

# Stack the original rows and the "no ultrasound" copies row-wise. merge()
# is a join on the shared columns, not a row-wise append.
# NOTE(review): eval_df's plurality column is written as queried, while
# eval_df2 is passed through plurality() - confirm whether eval_df should
# be recoded too.
evaluation_df = pd.concat([eval_df, eval_df2], ignore_index=True)
evaluation_df.to_csv('eval.csv', index=False, header=False)



# STRUCTURE OF AN ESTIMATOR API MODEL

In [0]:
import tensorflow as tf
# Outline of the Estimator workflow: declare feature columns, build an
# estimator, train it through an input function, then predict.
# NOTE(review): `features` and `labels` are not defined anywhere in this
# cell, so it reads as a sketch rather than runnable code - confirm.
#Define input feature columns
featcols = [
            tf.feature_column.numeric_column("sq_footage")
]
#Instantiate the Linear Regression model
model = tf.estimator.LinearRegressor(featcols, './model_trained')

#Train the model
def train_input_fn():
  """Input function handed to model.train; returns (features, labels)."""

  return features, labels
# Training is capped at 100 steps.
model.train(train_input_fn, steps = 100)

#Predict
def pred_input_fn():
  """Input function handed to model.predict; returns features only."""

  return features
out = model.predict(pred_input_fn)



# CREATION OF A TENSORFLOW MODEL USING TENSORFLOW'S ESTIMATOR API

In [2]:
#using the evaluation csv and the train csv created earlier to build the tensor flow model
"""Write an input function to read the data"""
import shutil
import numpy as np
import tensorflow as tf
print(tf.__version__)



2.2.0-rc2


In [0]:
LABEL_COLUMN = 'weight_pounds'
KEY_COLUMN = 'key'

# Names of the CSV fields, in file order. This must be a list with one name
# per entry of DEFAULTS: read_dataset() zips it with the decoded columns,
# and zipping a plain string would pair the columns with single characters.
CSV_COLUMNS = [LABEL_COLUMN, 'is_male', 'mother_age', 'plurality', 'gestation_weeks', KEY_COLUMN]

# Default value for each CSV column; the Python type of each default also
# fixes the type the field is parsed as (float vs string).
DEFAULTS = [[0.0], ['null'], [0.0], ['null'], [0.0], ['nokey']]
TRAIN_STEPS = 1000




In [0]:
# Input function built on the tf.data Dataset API; its result is what the
# Estimator API consumes.
def read_dataset(filename, mode, batch_size  = 512):
  """Build an Estimator input function that streams CSV rows.

  Args:
    filename: file name or glob pattern of the CSV file(s) to read.
    mode: a tf.estimator.ModeKeys value. TRAIN shuffles and repeats
      indefinitely; any other mode makes a single pass over the data.
    batch_size: number of rows per batch.

  Returns:
    A zero-argument function returning a tf.data.Dataset of
    (features, label) batches, where features is a dict keyed by the
    CSV_COLUMNS names other than LABEL_COLUMN.
  """
  def _input_fn():
    def decode_csv(value_column):
      # Parse one CSV line into typed columns, then split off the label.
      columns = tf.io.decode_csv(value_column, record_defaults = DEFAULTS)
      features = dict(zip(CSV_COLUMNS, columns))
      label = features.pop(LABEL_COLUMN)
      return features, label

    # Create a list of files that match the pattern
    file_list = tf.io.gfile.glob(filename)

    # Create dataset from the file list, one element per text line
    dataset = tf.data.TextLineDataset(file_list).map(decode_csv)

    if mode == tf.estimator.ModeKeys.TRAIN:
      num_epochs = None  # repeat indefinitely; training stops on max_steps
      dataset = dataset.shuffle(buffer_size = 10 * batch_size)
    else:
      num_epochs = 1  # end of input after one pass

    # Return the dataset itself rather than a one-shot iterator: estimator
    # input functions accept a Dataset, and make_one_shot_iterator() is not
    # a Dataset method in TF 2.x.
    return dataset.repeat(num_epochs).batch(batch_size)
  return _input_fn


    

In [0]:
# Feature-column helpers come next.
def get_categorical(name,values):
  """Return an indicator column over a vocabulary-list categorical column.

  Args:
    name: feature name.
    values: list of vocabulary entries for that feature.
  """
  vocabulary_column = tf.feature_column.categorical_column_with_vocabulary_list(name, values)
  return tf.feature_column.indicator_column(vocabulary_column)

def get_cols():
  """Return the feature columns for the model.

  is_male and plurality are categorical columns over a fixed vocabulary;
  mother_age and gestation_weeks are numeric columns.
  """
  plurality_vocabulary = [
      'Single(1)', 'Twins(2)', 'Triplets(3)',
      'Quadruplets(4)', 'Quintuplets(5)', 'Multiple(2+)',
  ]
  return [
      get_categorical('is_male', ['True', 'False', 'Unknown']),
      tf.feature_column.numeric_column('mother_age'),
      get_categorical('plurality', plurality_vocabulary),
      tf.feature_column.numeric_column('gestation_weeks'),
  ]


# To predict with a TensorFlow model, we also need a serving input function. We want all the inputs from the user.

In [0]:
def serving_input_fn():
  """Describe the inputs the exported model accepts at prediction time.

  Returns:
    A tf.estimator.export.ServingInputReceiver that maps four
    caller-supplied 1-D tensors (is_male, mother_age, plurality,
    gestation_weeks) to the model features.
  """
  # tf.placeholder is not available at the top level in TF 2.x; the
  # compat.v1 alias exposes the same op.
  placeholder = tf.compat.v1.placeholder
  feature_placeholders = {
      'is_male': placeholder(tf.string, [None]),
      'mother_age': placeholder(tf.float32, [None]),
      'plurality': placeholder(tf.string, [None]),
      'gestation_weeks': placeholder(tf.float32, [None])
  }
  # Append a trailing dimension to every input before it is used as a feature.
  features = {
      key: tf.expand_dims(tensor, -1)
      for key, tensor in feature_placeholders.items()
  }
  return tf.estimator.export.ServingInputReceiver(features, feature_placeholders)
  

In [0]:
# Create the estimator, then train and evaluate it.

def train_and_evaluate(output_dir):
  """Train a DNN regressor on train.csv and evaluate it on eval.csv.

  Args:
    output_dir: directory used as the estimator's model_dir.
  """
  EVAL_INTERVAL = 300  # seconds; used for checkpointing and eval throttling
  run_config = tf.estimator.RunConfig(save_checkpoints_secs = EVAL_INTERVAL,
                                      keep_checkpoint_max = 3)
  estimator = tf.estimator.DNNRegressor(model_dir = output_dir,
                                        feature_columns = get_cols(),
                                        hidden_units = [64, 32],
                                        config = run_config)
  # max_steps belongs to TrainSpec, not to read_dataset().
  train_spec = tf.estimator.TrainSpec(
      input_fn = read_dataset('train.csv', mode = tf.estimator.ModeKeys.TRAIN),
      max_steps = TRAIN_STEPS)

  exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
  eval_spec = tf.estimator.EvalSpec(
      input_fn = read_dataset('eval.csv', mode = tf.estimator.ModeKeys.EVAL),
      steps = None,
      start_delay_secs = 60,  # start evaluating after N seconds
      throttle_secs = EVAL_INTERVAL,  # evaluate every N seconds
      exporters = exporter)
  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
  

In [0]:
# Remove any previous output directory, then train into a fresh one.
shutil.rmtree(path='babyweight_trained', ignore_errors=True)
train_and_evaluate(output_dir='babyweight_trained')

# MONITOR AND EXPERIMENT WITH TRAINING

In [0]:
from google.datalab.ml import TensorBoard
# Point TensorBoard at the training output directory.
tensorboard = TensorBoard()
tensorboard.start('./babyweight_trained')

# OPERATIONALISE THE MODEL