<a href="https://colab.research.google.com/github/12004727uhi/12004727_DataAnalytics/blob/master/colab_sheets/12004727_assignment_2_linear_regression_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import pandas, to create data frame
import pandas as pd
# import numpy for maths operations
import numpy as np
# import tensorflow for modelling
%tensorflow_version 1.x
import tensorflow as tf
# import file management utilities
import shutil
# import package to create data tables
from tabulate import tabulate
# import iterable tools
import itertools

TensorFlow 1.x selected.


In [None]:
## Configuration
#
# TensorFlow log level: ERROR keeps automated runs quiet, switch to INFO when diagnosing
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# human-readable title of the dataset (printed in the report headers)
DATASET_TITLE = '2013 to 2019 - Bronx County [linear_6.csv]'
# URL of the CSV used for training & testing
DATA_URL = 'https://raw.githubusercontent.com/12004727uhi/12004727_DataAnalytics/master/linear_6.csv'
# URL of the CSV used for validation
VALIDATION_DATA_URL = 'https://raw.githubusercontent.com/12004727uhi/12004727_DataAnalytics/master/sample_linear_6.csv'
# load the training/testing CSV as a pandas data frame (first CSV column becomes the row index)
TRAIN_AND_TEST_DATA = pd.read_csv(DATA_URL, index_col=0)
# load the validation CSV the same way
VALIDATION_DATA = pd.read_csv(VALIDATION_DATA_URL, index_col=0)
# number of target (output) columns
NUM_TARGETS = 1
# define predictors for training/testing: candidate predictor column indexes
# (index 4 is deliberately absent from this list)
predictor_column_list = [0,1,2,3,5,6,7]
# every non-empty combination of the candidate columns, grouped by combination size
# (all 1-column combinations first, then all 2-column combinations, and so on)
predictors = [
    [list(combination) for combination in itertools.combinations(predictor_column_list, size)]
    for size in range(1, len(predictor_column_list) + 1)
]
# flatten the per-size groups into one list of column-index lists (one entry per model)
PREDICTORS_COLS = list(itertools.chain.from_iterable(predictors))
# positional column index(es) of the target(s) in the training/testing data frame
TARGETS_COLS = [8]
# column name(s) of the target(s) in the validation data frame
VALIDATION_TARGETS = ['NUM_COLLISIONS']
# directory in which the trained model is stored
MODEL_URI = '/tmp/linear_trained_model'
# how many times each model is trained & tested
NUMBER_OF_TRAIN_TEST_RUNS = 5
# how many times each model is validated
NUMBER_OF_VALIDATION_RUNS = 5
# scale factor for the number of collisions: training targets are divided by it
# and predictions are multiplied by it
SCALE_NUM_COLLISIONS = 1.0
# number of steps the estimator is fitted for
ESTIMATOR_STEPS = 10000
# fraction of the rows used for training (the remainder is used for testing)
TRAINING_SET_SLICE = 0.8

# preview the first 5 rows of each data frame to verify the imports
display(TRAIN_AND_TEST_DATA.head(5))
display(VALIDATION_DATA.head(5))
# report how many predictor combinations will be modelled
print(f'Predictors columns: {len(PREDICTORS_COLS)}')
# report the TensorFlow version in use
print(f'Tensor flow version: {tf.__version__}')


Unnamed: 0,day_index,day,year,mo,county,temp,visib,wdsp,NUM_COLLISIONS
375,5,2,2013,1,Bronx County,38.0,10.0,15.0,0.127193
382,7,3,2013,1,Bronx County,27.5,10.0,12.4,0.232456
395,15,4,2013,1,Bronx County,21.8,10.0,9.8,0.258772
401,16,5,2013,1,Bronx County,32.2,10.0,11.3,0.241228
411,21,6,2013,1,Bronx County,37.3,10.0,13.2,0.171053


Unnamed: 0,day_index,day,year,mo,county,temp,visib,wdsp,NUM_COLLISIONS
4602,2388,2,2019,4,Kings County,48.8,6.3,11.4,0.648069
1238,493,7,2014,1,Kings County,26.8,9.4,17.8,0.470874
176,1597,6,2017,1,Queens County,38.7,10.0,18.3,0.585153
15242,1193,2,2016,11,Kings County,47.2,10.0,7.1,0.691983
1739,939,3,2015,2,Queens County,32.6,9.9,23.9,0.647321


Predictors columns: 127
Tensor flow version: 1.15.2


In [None]:
# function to loop linear regressor n times (shuffling data each time). Returns list of RMSE results.
def run_linear_regressor(predictors_cols):
  """Train and test a linear regressor over several shuffled train/test splits.

  For each of NUMBER_OF_TRAIN_TEST_RUNS runs the rows of TRAIN_AND_TEST_DATA are
  shuffled, the first TRAINING_SET_SLICE fraction of them is used to fit a new
  estimator and the remaining rows are used to test it.

  Args:
    predictors_cols: list of positional column indexes of TRAIN_AND_TEST_DATA
      to use as predictors.

  Returns:
    A list with one dict per run, with the keys 'Run' (1-based run number),
    'Linear Regression Model RMSE', 'Mean Model RMSE' and 'Predictors Columns'.

  Side effects:
    MODEL_URI is deleted before every run and handed to the estimator as its
    model directory.
  """
  # define result list to return
  result_list = []
  # shuffling does not change the number of rows, so the split point is the same for every run
  training_set_size = int(len(TRAIN_AND_TEST_DATA)*TRAINING_SET_SLICE)
  for run_number in range(1, NUMBER_OF_TRAIN_TEST_RUNS + 1):
    # shuffle the rows
    shuffled = TRAIN_AND_TEST_DATA.iloc[np.random.permutation(len(TRAIN_AND_TEST_DATA))]
    # select predictor & target columns
    predictors = shuffled.iloc[:, predictors_cols]
    targets = shuffled.iloc[:, TARGETS_COLS]
    # split both into a training part (leading rows) and a test part (remaining rows)
    train_predictors = predictors.iloc[:training_set_size].values
    test_predictors = predictors.iloc[training_set_size:].values
    train_targets = targets.iloc[:training_set_size].values
    test_targets = targets.iloc[training_set_size:].values
    # ensure any previously saved training run models are removed
    shutil.rmtree(MODEL_URI, ignore_errors=True)
    # define the TensorFlow estimator
    estimator = tf.contrib.learn.SKCompat(
        tf.contrib.learn.LinearRegressor(
            model_dir=MODEL_URI,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
            enable_centered_bias=False,
            feature_columns=tf.contrib.learn.infer_real_valued_columns_from_input(predictors.values)
            )
        )
    # train the model on the training rows (targets are divided by the scale factor)
    estimator.fit(train_predictors,
                  train_targets.reshape(training_set_size, NUM_TARGETS)/SCALE_NUM_COLLISIONS,
                  steps=ESTIMATOR_STEPS
                  )
    # predict the test rows
    predictions = estimator.predict(x=test_predictors)
    # undo the target scaling on the predictions (a no-op while the scale factor is 1.0)
    predictions_scaled = predictions['scores']*SCALE_NUM_COLLISIONS
    # calculate RMSE to determine how well the model is predicting the targets from the predictors.
    # The targets slice is (rows, NUM_TARGETS) whereas 'scores' appears to come back
    # one-dimensional for a single target; subtracting those directly would broadcast
    # to a (rows, rows) matrix pairing every target with every prediction. Reshaping
    # the targets to the predictions' shape keeps the comparison row by row.
    rmse_linear_model = np.sqrt(np.mean((test_targets.reshape(predictions_scaled.shape) - predictions_scaled)**2))
    # calculate the 'mean model' RMSE: always predict the mean of the training collisions.
    # This allows comparison between the 'mean model' and the linear regression model.
    # If the linear regression model is successful its RMSE should be *lower*.
    collisions = shuffled['NUM_COLLISIONS']
    mean_norm_collisions = np.mean(collisions.iloc[:training_set_size])
    rmse_mean_model = np.sqrt(np.mean((collisions.iloc[training_set_size:] - mean_norm_collisions)**2))
    # add rmse calculations to result list
    result_list.append({'Run': run_number, 'Linear Regression Model RMSE': rmse_linear_model, 'Mean Model RMSE': rmse_mean_model, 'Predictors Columns': predictors_cols})
  return result_list


In [None]:
## validate against normalised, processed dataset
# set pandas to display _all_ columns
pd.set_option('display.max_columns', None)
def run_validation(validation_predictors_cols):
  """Score the model stored at MODEL_URI against the validation dataset.

  The estimator is only constructed on MODEL_URI and asked to predict; it is
  never fitted here (presumably it picks up the model saved by the most recent
  training run - confirm against the TensorFlow estimator documentation).

  Args:
    validation_predictors_cols: list of positional column indexes of
      VALIDATION_DATA to use as predictors.

  Returns:
    A list with one dict per run (NUMBER_OF_VALIDATION_RUNS in total), with the
    keys 'Run' (1-based run number), 'Validation Test RMSE' and
    'Predictors Columns'.
  """
  # define rmse results list
  validation_rmse_result_list = []
  for run_number in range(1, NUMBER_OF_VALIDATION_RUNS + 1):
    # shuffle data
    shuffled_validation_data = VALIDATION_DATA.iloc[np.random.permutation(len(VALIDATION_DATA))]
    # assign predictors to variables
    predictors_validation_data = shuffled_validation_data.iloc[:, validation_predictors_cols]
    # define TensorFlow estimator
    estimator = tf.contrib.learn.SKCompat(
        tf.contrib.learn.LinearRegressor(
            model_dir=MODEL_URI,
            enable_centered_bias=False,
            feature_columns=tf.contrib.learn.infer_real_valued_columns_from_input(predictors_validation_data.values)
            )
        )
    # check predictions based on defined predictors and assign result to variable
    validation_predictions = estimator.predict(x=predictors_validation_data.values)
    # extract the prediction scores, undoing the scaling applied to the training targets
    # (a no-op while the scale factor is 1.0)
    validation_prediction_scores = validation_predictions['scores']*SCALE_NUM_COLLISIONS
    # take the targets from the SAME shuffled frame as the predictors so that every
    # prediction is compared with the target of its own row, and give them the
    # predictions' shape so the subtraction is element-wise
    targets_validation = shuffled_validation_data[VALIDATION_TARGETS].values.reshape(validation_prediction_scores.shape)
    # calculate RMSE for regression model and actual validation collision data
    rmse_validation = np.sqrt(np.mean((targets_validation - validation_prediction_scores)**2))
    # add RMSE to list
    validation_rmse_result_list.append({'Run': run_number, 'Validation Test RMSE': rmse_validation, 'Predictors Columns': validation_predictors_cols})
  # return result list
  return validation_rmse_result_list

In [None]:
# function to create table of training/testing results & compute RMSE
def create_table(rmse_results, max_median_rmse=1.0):
  """Print the per-run RMSE table and summary statistics for one model.

  Nothing but a short notice is printed when the median regression RMSE is
  above max_median_rmse, or when there are no results at all.

  Args:
    rmse_results: list of per-run dicts holding at least the keys
      'Linear Regression Model RMSE' and 'Mean Model RMSE'; every key of the
      first dict becomes a table column.
    max_median_rmse: highest median regression RMSE (inclusive) for which the
      table is still drawn.

  Returns:
    True when the table was printed, False otherwise.
  """
  # nothing to tabulate (and no first row to take the header from)
  if not rmse_results:
    print('No model run results were supplied. Tables will not be drawn.')
    return False
  # calculate mean & median RMSEs
  regression_RMSE_list = [n['Linear Regression Model RMSE'] for n in rmse_results]
  mean_model_RMSE_list = [n['Mean Model RMSE'] for n in rmse_results]
  regression_mean_RMSE = np.mean(regression_RMSE_list)
  mean_mean_RMSE = np.mean(mean_model_RMSE_list)
  regression_median_RMSE = np.median(regression_RMSE_list)
  mean_median_RMSE = np.median(mean_model_RMSE_list)
  # ONLY PRINT OUTPUT IF regression_median_RMSE <= max_median_rmse
  if regression_median_RMSE <= max_median_rmse:
    header = list(rmse_results[0].keys())
    rows = [list(x.values()) for x in rmse_results]
    print(tabulate(rows, header, floatfmt=".15"))
    # print mean & median RMSEs
    print(f'\nMean RMSE for "Linear Regression Model": {regression_mean_RMSE}')
    print(f'Median RMSE for "Linear Regression Model": {regression_median_RMSE}')
    print(f'Mean RMSE for "Mean Model": {mean_mean_RMSE}')
    print(f'Median RMSE for "Mean Model": {mean_median_RMSE}\n')
    return True
  print(f'Median Linear Regressor RMSE value was over {max_median_rmse}! Tables will not be drawn.')
  return False

In [None]:
# function to create table of validation results & compute RMSE
def create_validation_table(validation_results):
  """Print the per-run validation RMSE table followed by the mean & median RMSE.

  Args:
    validation_results: non-empty list of per-run dicts holding at least the
      key 'Validation Test RMSE'; every key of the first dict becomes a table
      column.
  """
  # table layout: one column per key of the first result, one row per result
  column_names = validation_results[0].keys()
  table_rows = [record.values() for record in validation_results]
  print(tabulate(table_rows, column_names, floatfmt=".15"))
  # summary statistics over all validation runs
  rmse_values = [record['Validation Test RMSE'] for record in validation_results]
  rmse_mean = np.mean(rmse_values)
  rmse_median = np.median(rmse_values)
  print(f'\nMean RMSE for Validaton Data: {rmse_mean}')
  print(f'Median RMSE for Validaton Data: {rmse_median}')

In [None]:
# function to generate rank table of median rmse result over n runs
def create_rank_table(all_results):
  """Print a table of all models ranked by their median regression RMSE.

  Args:
    all_results: non-empty list with one dict per model, each holding the keys
      'model_results' (per-run dicts from training/testing) and
      'validation_results' (per-run dicts from validation).
  """
  def median_of(runs, key):
    # median of one metric over all runs of a model
    return np.median([run[key] for run in runs])

  # one summary row per model
  table_list = []
  for model in all_results:
    model_runs = model['model_results']
    validation_runs = model['validation_results']
    # translate the predictor column indexes into column names, one name per line
    predictor_names = [TRAIN_AND_TEST_DATA.columns[c] for c in model_runs[0]['Predictors Columns']]
    table_list.append({'Median RMSE\n(Ranked)': median_of(model_runs, 'Linear Regression Model RMSE'),
                       'Median \'Mean Model\'\nRMSE': median_of(model_runs, 'Mean Model RMSE'),
                       'Model\nPredictors': '\n'.join(predictor_names),
                       'Model\nRuns': len(model_runs),
                       'Median Validation\nRMSE': median_of(validation_runs, 'Validation Test RMSE'),
                       'Validation\nRuns': len(validation_runs)
                       })
  # rank the rows: lowest median regression RMSE first
  table_list.sort(key=lambda row: row['Median RMSE\n(Ranked)'])
  # display report header
  for banner_line in ('\n----------------------------------------------------------------',
                      'Ranked Table of Median RMSE Results for Linear Regression Models',
                      '----------------------------------------------------------------',
                      'Tensor Flow Linear Regression Output'):
    print(banner_line)
  print(f'Model dataset title: {DATASET_TITLE}')
  print(f'Model dataset CSV: {DATA_URL}')
  print(f'Model validation dataset CSV: {VALIDATION_DATA_URL}\n')
  # display table
  column_names = table_list[0].keys()
  table_rows = [row.values() for row in table_list]
  print(tabulate(table_rows, column_names, floatfmt=".15", showindex='always', tablefmt="grid"))

In [None]:
# Modelling & validation pipeline, applied to every predictor combination:
# - train & test the model n times on the imported dataset;
# - hand the per-run RMSEs to the results table (which also reports mean & median RMSE);
# - run the model n times on the validation dataset;
# - finally rank all predictor combinations by their median RMSE.

# function to do modelling & validation
def model_and_validate():
  """Model and validate every predictor combination in PREDICTORS_COLS.

  Returns:
    A list with one dict per predictor combination, holding 'model_results'
    (the result of run_linear_regressor) and 'validation_results' (the result
    of do_validation).
  """
  # results of every predictor combination, kept in memory for further analysis
  all_rmse_results = []
  # print sheet header
  for header_line in ('Tensor Flow Linear Regression Output',
                      f'Model dataset title: {DATASET_TITLE}',
                      f'Model dataset CSV: {DATA_URL}',
                      f'Model validation dataset CSV: {VALIDATION_DATA_URL}',
                      '------------------------------------------------------------------------------\n'):
    print(header_line)
  for predictors_combination in PREDICTORS_COLS:
    # announce which predictor & target columns this model uses
    predictor_names = ", ".join(f"\'{TRAIN_AND_TEST_DATA.columns[c]}\'" for c in predictors_combination)
    print('Model for these predictors: {}'.format(predictor_names))
    target_names = ", ".join(f"\'{TRAIN_AND_TEST_DATA.columns[t]}\'" for t in TARGETS_COLS)
    print('Model for these targets: {}\n'.format(target_names))
    # train & test, then draw the results table
    rmse_result_list = run_linear_regressor(predictors_combination)
    model_table_generated = gen_results_table(rmse_result_list)
    # validate; the validation table is drawn only if the model table was drawn
    validation_rmse_result_list = do_validation(predictors_combination, draw_table=model_table_generated)
    # add results to in-memory list
    all_rmse_results.append({
        'model_results': rmse_result_list,
        'validation_results': validation_rmse_result_list,
    })
    print('\n-------\n')
  # generate rank table of median rmse result over n runs
  generate_rank_table(all_rmse_results)
  return all_rmse_results

# function to do validation
def do_validation(predictors_combination, draw_table):
  """Run the validation for one predictor combination and optionally report it.

  Args:
    predictors_combination: list of positional column indexes used as predictors.
    draw_table: when truthy, the validated predictor/target names and the
      validation results table are printed.

  Returns:
    The result of run_validation for the given predictors.
  """
  validation_result_list = run_validation(predictors_combination)
  # stay silent unless the model table was drawn as well
  if not draw_table:
    return validation_result_list
  predictor_names = ", ".join(f"\'{VALIDATION_DATA.columns[c]}\'" for c in predictors_combination)
  print('Validate for these predictors: {}'.format(predictor_names))
  target_names = ", ".join(f"\'{t}\'" for t in VALIDATION_TARGETS)
  print('Validate for these targets: {}\n'.format(target_names))
  gen_validation_results_table(validation_result_list)
  return validation_result_list

# function to generate table of results
def gen_results_table(rmse_result_list):
  """Pass the train/test results to create_table and return its result."""
  table_drawn = create_table(rmse_result_list)
  return table_drawn

# function to generate rank table of results
def generate_rank_table(all_results):
  """Pass the results of all models to create_rank_table; returns nothing."""
  create_rank_table(all_results)
  return None

# function to generate table of validation results
def gen_validation_results_table(validation_result_list):
  """Pass the validation results to create_validation_table; returns nothing."""
  create_validation_table(validation_result_list)
  return None
  
# run the modelling & validation for every predictor combination and keep the
# returned results in memory for the following cells
all_rmse_results = model_and_validate()

Tensor Flow Linear Regression Output
Model dataset title: 2013 to 2019 - Bronx County [linear_6.csv]
Model dataset CSV: https://raw.githubusercontent.com/12004727uhi/12004727_DataAnalytics/master/linear_6.csv
Model validation dataset CSV: https://raw.githubusercontent.com/12004727uhi/12004727_DataAnalytics/master/sample_linear_6.csv
------------------------------------------------------------------------------

Model for these predictors: 'day_index'
Model for these targets: 'NUM_COLLISIONS'

  Run    Linear Regression Model RMSE    Mean Model RMSE  Predictors Columns
-----  ------------------------------  -----------------  --------------------
    1               0.405418250997235  0.206037370102812  [0]
    2               2.64021583078619   0.204155735407123  [0]
    3               0.299587836117557  0.204496311688705  [0]
    4               3.30163929439023   0.204396191667258  [0]
    5               0.346764666501861  0.204752453202807  [0]

Mean RMSE for "Linear Regression Mo

In [None]:
# print all results in memory from all runs (optional)
# NOTE(review): this prints the complete nested result list as plain text in one
# go, so the output grows with the number of predictor combinations and runs
print(all_rmse_results)

[{'model_results': [{'Run': 1, 'Linear Regression Model RMSE': 0.4054182509972351, 'Mean Model RMSE': 0.2060373701028115, 'Predictors Columns': [0]}, {'Run': 2, 'Linear Regression Model RMSE': 2.6402158307861865, 'Mean Model RMSE': 0.2041557354071232, 'Predictors Columns': [0]}, {'Run': 3, 'Linear Regression Model RMSE': 0.2995878361175569, 'Mean Model RMSE': 0.20449631168870477, 'Predictors Columns': [0]}, {'Run': 4, 'Linear Regression Model RMSE': 3.301639294390229, 'Mean Model RMSE': 0.204396191667258, 'Predictors Columns': [0]}, {'Run': 5, 'Linear Regression Model RMSE': 0.3467646665018612, 'Mean Model RMSE': 0.20475245320280677, 'Predictors Columns': [0]}], 'validation_results': [{'Run': 1, 'Validation Test RMSE': 0.3048384581475494, 'Predictors Columns': [0]}, {'Run': 2, 'Validation Test RMSE': 0.27754416731995557, 'Predictors Columns': [0]}, {'Run': 3, 'Validation Test RMSE': 0.2939293034760735, 'Predictors Columns': [0]}, {'Run': 4, 'Validation Test RMSE': 0.24527119177316156, 