## **TensorFlow: Linear Classification Using Estimators**

In [None]:
!pip install -q sklearn

from google.colab import drive
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Mount Google Drive so the CSV stored there can be read from this runtime.
drive.mount('/content/drive')

BGV = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/BGVData/BookersLastFiveYears.csv')

# Parse the tour date and turn missing waves into 0 so the column can be cast to int.
BGV = BGV.assign(
    TourDate1=pd.to_datetime(BGV['TourDate1']),
    TourWave2=BGV['TourWave2'].fillna(0).astype(int),
)
# TODO: TourWave2 now contains zeroes (from the fill above) -- decide whether those rows should be dropped.
# NOTE(review): the cells below use 'TourWave', not 'TourWave2' -- confirm this column is the intended one.

In [None]:
%tensorflow_version 2.x

from __future__ import absolute_import, division, print_function, unicode_literals

from IPython.display import clear_output
from six.moves import urllib

import tensorflow.compat.v2.feature_column as fc

import tensorflow as tf

In [None]:
# Keep only tours whose status is 'Showed' and only the four modelling columns,
# fill missing waves with 0 (as int), then drop any row that still has a missing value.
BGV = (
    BGV.loc[BGV['TourStatus2'] == 'Showed',
            ['TourMonth', 'ProgramName', 'TourWave', 'ContractStatus1']]
    .assign(TourWave=lambda showed: showed['TourWave'].fillna(0).astype(int))
    #.assign(TourMonth=lambda showed: showed['TourMonth'].astype(float))
    .dropna(how="any")
)

In [None]:
# Binary label: 1 if the contract status is 'Active\r\nActive', 0 for every other status.
# Encoding by comparison (instead of replacing a hand-maintained list of the other statuses)
# guarantees that a status missing from the list cannot survive as a string, and always
# yields an integer column without relying on `replace` to downcast it.
# The `== 1` arm keeps the cell safe to re-run after the column has already been encoded.
status = BGV['ContractStatus1']
BGV['ContractStatus1'] = ((status == 'Active\r\nActive') | (status == 1)).astype(int)

In [None]:
# Turn each categorical feature into a vocabulary-based feature column.

CATEGORICAL_COLUMNS = ['ProgramName', 'TourMonth', 'TourWave']

# The vocabulary of a column is the set of distinct values it takes in BGV.
feature_columns = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        key=column_name, vocabulary_list=BGV[column_name].unique())
    for column_name in CATEGORICAL_COLUMNS
]

print(feature_columns)

In [None]:
# Positional hold-out split: the first 57,250 rows train the model (about 80% per the
# original note) and the remaining rows are kept for evaluation (about 20%).
dftrain, dfeval = BGV.iloc[:57250], BGV.iloc[57250:]

# Remove the label column from each feature frame and keep it as the target series.
y_train, y_eval = dftrain.pop('ContractStatus1'), dfeval.pop('ContractStatus1')

In [None]:
# Training: Input

def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
  """Build a zero-argument input function for an Estimator.

  Args:
    data_df: feature table; converted with dict() so each column becomes a named feature.
    label_df: labels aligned with the rows of data_df.
    num_epochs: how many times the batched dataset is repeated.
    shuffle: when true, shuffle elements with a buffer of 1000 before batching.
    batch_size: number of rows per batch.

  Returns:
    A callable that creates and returns the tf.data.Dataset when invoked.
  """
  def input_function():
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
      dataset = dataset.shuffle(1000)
    batched = dataset.batch(batch_size)
    return batched.repeat(num_epochs)
  return input_function

# Training pipeline keeps the defaults (10 epochs, shuffled, batches of 32).
train_input_fn = make_input_fn(data_df=dftrain, label_df=y_train)
# Evaluation makes a single ordered pass over the held-out rows.
eval_input_fn = make_input_fn(data_df=dfeval, label_df=y_eval, num_epochs=1, shuffle=False)

In [None]:
# Create a linear estimator from the feature columns built earlier.
linear_est = tf.estimator.LinearClassifier(
    feature_columns=feature_columns,
)

In [None]:
# Fit on the training pipeline, then score the model on the held-out pipeline.
linear_est.train(input_fn=train_input_fn)
result = linear_est.evaluate(input_fn=eval_input_fn)

# Wipe the training log from the cell output; `result` is a dict of evaluation metrics.
clear_output()
print(result['accuracy'])

In [None]:
# Histogram of the predicted purchase probability (class 1) for each showed tour
# in the evaluation set (predictions are made with eval_input_fn).
pred_dicts = list(linear_est.predict(input_fn=eval_input_fn))
probs = pd.Series([prediction['probabilities'][1] for prediction in pred_dicts])

probs.plot.hist(bins=50, title='predicted probabilities')

In [None]:
# Average predicted probability of class 1 across the rows that were predicted above.
probs.mean()

## **TensorFlow: Classification Using a Deep Neural Network**

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Google Drive so the CSV stored there can be read from this runtime.
drive.mount('/content/drive')

BGV = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/BGVData/BookersLastFiveYears.csv')

BGV['TourDate1'] = pd.to_datetime(BGV['TourDate1'])
BGV['TourWave'] = BGV['TourWave'].fillna(0).astype(int)

# Keep only showed tours and the four modelling columns, then drop incomplete rows.
# dropna() returns a new frame, so the label assignment below never writes into a slice.
BGV = BGV.loc[BGV['TourStatus2'] == 'Showed',
              ['TourMonth', 'ProgramName', 'TourWave', 'ContractStatus1']].dropna(how="any")

# Binary label: 1 if the contract status is 'Active\r\nActive', 0 for every other status.
# Encoding by comparison (instead of replacing a hand-maintained list of the other statuses)
# guarantees that an unlisted status cannot survive as a string, and always yields an
# integer column without relying on `replace` to downcast it.
BGV['ContractStatus1'] = (BGV['ContractStatus1'] == 'Active\r\nActive').astype(int)

In [None]:
# Positional hold-out split: first 57,250 rows for training (about 80% per the original
# note), everything after that for testing (about 20%).
train, test = BGV.iloc[:57250], BGV.iloc[57250:]

# Detach the label column from each feature frame.
train_y, test_y = train.pop('ContractStatus1'), test.pop('ContractStatus1')

In [None]:
# Input Function
def input_fn(features, labels, training=True, batch_size=256):
    """Return a batched tf.data.Dataset of (feature dict, label) pairs.

    Args:
        features: feature table; converted with dict() so each column is a named feature.
        labels: labels aligned with the rows of `features`.
        training: when true, shuffle (buffer of 1000) and repeat indefinitely before batching.
        batch_size: number of rows per batch.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    if not training:
        # Evaluation: one ordered pass over the data.
        return dataset.batch(batch_size)

    return dataset.shuffle(1000).repeat().batch(batch_size)

In [None]:
# Turn each categorical feature into an indicator (one-hot) feature column.

CATEGORICAL_COLUMNS = ['ProgramName', 'TourMonth', 'TourWave']

# The vocabulary of a column is the set of distinct values it takes in the training frame.
my_feature_columns = [
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=column_name, vocabulary_list=train[column_name].unique()))
    for column_name in CATEGORICAL_COLUMNS
]
print(my_feature_columns)

In [None]:
# Deep neural network classifier:
#   - two hidden layers, with 30 and 10 nodes respectively;
#   - two output classes.
classifier = tf.estimator.DNNClassifier(
    hidden_units=[30, 10],
    n_classes=2,
    feature_columns=my_feature_columns,
)

In [None]:
# train() expects a zero-argument callable; the lambda binds the training data to
# input_fn so no separate wrapper function has to be defined.
classifier.train(
    steps=5000,
    input_fn=lambda: input_fn(train, train_y, training=True),
)

In [None]:
# Score the trained model on the held-out rows (no shuffling, single pass).
eval_result = classifier.evaluate(input_fn=lambda: input_fn(test, test_y, training=False))
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(accuracy=eval_result['accuracy']))

In [None]:
def input_fn(features, batch_size=256):
    """Return a batched, label-free tf.data.Dataset for prediction.

    NOTE: this rebinds the name `input_fn`, replacing the (features, labels, training)
    version defined earlier in this section; re-run that cell before training or
    evaluating again.
    """
    unlabeled = tf.data.Dataset.from_tensor_slices(dict(features))
    return unlabeled.batch(batch_size)

features = ['TourMonth', 'TourWave']
predict = {}

# Retrieve Program Name from user. Only allow values present in the training frame.
print("Please type acceptable options as prompted.")
print("Acceptable options: " + str(train['ProgramName'].unique()))
valid = False
while not valid:
  val = input('Program Name: ')
  if str(val) in train['ProgramName'].unique():
    valid = True
  else:
    print('Please try again')

predict['ProgramName'] = [str(val)]

# Retrieve Tour Month and Tour Wave from user. Only allow ints present in the training frame.
for feature in features:
  valid = False
  acceptable = np.sort(train[feature].unique())
  print("Acceptable options: "+str(acceptable))
  while not valid:
    val = input(feature+': ')
    try:
      # int() raises ValueError on non-numeric text; treat that as an invalid answer
      # and re-prompt instead of aborting the cell.
      valid = int(val) in acceptable
    except ValueError:
      valid = False
    if not valid:
      print('Please try again')

  predict[feature] = [int(val)]

predictions = classifier.predict(input_fn=lambda: input_fn(predict))
for pred_dict in predictions:
    class_id = pred_dict['class_ids'][0]
    # Label 1 means "purchased", so the chance to purchase is always P(class 1).
    # The previous `100 - 100 * probabilities[class_id]` was only right when the predicted
    # class was 0; for a predicted class of 1 it reported the chance of NOT purchasing.
    probability = pred_dict['probabilities'][1]

    print('Guest has a {:.1f}% chance to purchase'.format(100 * probability))

In [None]:
# Compare the mean predicted probability on the test set with the observed purchase rate.
# The observed rate is test_y.mean(), i.e. it comes from the test set, so the message says so.
print('Algorithm predicted an average purchase rate of {:.1f}%, whereas the actual purchase rate from the test set is {:.1f}%'.format(
      eval_result['prediction/mean']*100, test_y.mean()*100))

## **TensorFlow: Classification Using a Linear Classifier**

In [None]:
#Import necessary 
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Google Drive so the CSV stored there can be read from this runtime.
drive.mount('/content/drive')

BGV = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/BGVData/BookersLastFiveYears.csv')

BGV['TourDate1'] = pd.to_datetime(BGV['TourDate1'])
BGV['TourWave'] = BGV['TourWave'].fillna(0).astype(int)

# Keep only showed tours and the four modelling columns, then drop incomplete rows.
# dropna() returns a new frame, so the label assignment below never writes into a slice.
BGV = BGV.loc[BGV['TourStatus2'] == 'Showed',
              ['TourMonth', 'ProgramName', 'TourWave', 'ContractStatus1']].dropna(how="any")

# Binary label: 1 if the contract status is 'Active\r\nActive', 0 for every other status.
# Encoding by comparison (instead of replacing a hand-maintained list of the other statuses)
# guarantees that an unlisted status cannot survive as a string, and always yields an
# integer column without relying on `replace` to downcast it.
BGV['ContractStatus1'] = (BGV['ContractStatus1'] == 'Active\r\nActive').astype(int)

In [None]:
# Disabled experiment: scale TourMonth and TourWave into the 0-1 range before training.
# The code is wrapped in a string literal, so nothing below is executed.
'''BGV['TourMonth'] = BGV['TourMonth'] / 12
BGV['TourWave'] = BGV['TourWave'] / 1600
BGV.head()'''

In [None]:
# Positional hold-out split: first 57,250 rows for training (about 80% per the original
# note), everything after that for testing (about 20%).
train, test = BGV.iloc[:57250], BGV.iloc[57250:]

# Detach the label column from each feature frame.
train_y, test_y = train.pop('ContractStatus1'), test.pop('ContractStatus1')

In [None]:
# Input Function
def input_fn(features, labels, training=True, batch_size=256):
    """Return a batched tf.data.Dataset of (feature dict, label) pairs.

    Args:
        features: feature table; converted with dict() so each column is a named feature.
        labels: labels aligned with the rows of `features`.
        training: when true, shuffle (buffer of 1000) and repeat indefinitely before batching.
        batch_size: number of rows per batch.
    """
    pairs = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Only the training pipeline is shuffled and repeated.
    pipeline = pairs.shuffle(1000).repeat() if training else pairs

    return pipeline.batch(batch_size)

In [None]:
# Turn each categorical feature into an indicator (one-hot) feature column.

CATEGORICAL_COLUMNS = ['ProgramName', 'TourMonth', 'TourWave']

# The vocabulary of a column is the set of distinct values it takes in the training frame.
my_feature_columns = [
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=column_name, vocabulary_list=train[column_name].unique()))
    for column_name in CATEGORICAL_COLUMNS
]

# Disabled preprocessing experiment: feed TourMonth and TourWave as numeric columns.
# The code is wrapped in a string literal, so nothing below is executed.
'''NUMERIC_COLUMNS = ['TourMonth', 'TourWave']
for feature_name in NUMERIC_COLUMNS:
  my_feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

print(my_feature_columns)'''

In [None]:
# Linear classifier with two output classes. A linear model has no hidden layers,
# so only the feature columns and the class count are configured.
classifier = tf.estimator.LinearClassifier(
    n_classes=2,
    feature_columns=my_feature_columns,
)

In [None]:
# train() expects a zero-argument callable; the lambda binds the training data to
# input_fn so no separate wrapper function has to be defined.
classifier.train(
    steps=5000,
    input_fn=lambda: input_fn(train, train_y, training=True),
)

In [None]:
# Score the trained model on the held-out rows (no shuffling, single pass).
eval_result = classifier.evaluate(input_fn=lambda: input_fn(test, test_y, training=False))
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(accuracy=eval_result['accuracy']))

In [None]:
# Disabled: undo the 0-1 scaling experiment on the training frame.
# The code is wrapped in a string literal, so nothing below is executed.
'''train['TourWave'] = train['TourWave'] * 1600
train['TourMonth'] = train['TourMonth'] * 12'''

In [None]:
def input_fn(features, batch_size=256):
    """Return a batched, label-free tf.data.Dataset for prediction.

    NOTE: this rebinds the name `input_fn`, replacing the (features, labels, training)
    version defined earlier in this section; re-run that cell before training or
    evaluating again.
    """
    feature_slices = tf.data.Dataset.from_tensor_slices(dict(features))
    return feature_slices.batch(batch_size)

features = ['TourMonth', 'TourWave']
predict = {}

# Retrieve Program Name from user. Only allow values present in the training frame.
print("Please type acceptable options as prompted.")
print("Acceptable options: " + str(train['ProgramName'].unique()))
valid = False
while not valid:
  val = input('Program Name: ')
  if str(val) in train['ProgramName'].unique():
    valid = True
  else:
    print('Please try again')

predict['ProgramName'] = [str(val)]

# Retrieve Tour Month and Tour Wave from user. Only allow ints present in the training frame.
for feature in features:
  valid = False
  acceptable = np.sort(train[feature].astype(int).unique())
  print("Acceptable options: "+str(acceptable))
  while not valid:
    val = input(feature+': ')
    try:
      # int() raises ValueError on non-numeric text; treat that as an invalid answer
      # and re-prompt instead of aborting the cell.
      valid = int(val) in acceptable
    except ValueError:
      valid = False
    if not valid:
      print('Please try again')

  predict[feature] = [int(val)]

predictions = classifier.predict(input_fn=lambda: input_fn(predict))
for pred_dict in predictions:
    class_id = pred_dict['class_ids'][0]
    # Label 1 means "purchased", so the chance to purchase is always P(class 1).
    # The previous `100 - 100 * probabilities[class_id]` was only right when the predicted
    # class was 0; for a predicted class of 1 it reported the chance of NOT purchasing.
    probability = pred_dict['probabilities'][1]

    print('Guest has a {:.1f}% chance to purchase'.format(100 * probability))

In [None]:
# Compare the mean predicted probability on the test set with the observed purchase rate.
# The observed rate is test_y.mean(), i.e. it comes from the test set, so the message says so.
print('Algorithm predicted an average purchase rate of {:.1f}%, whereas the actual purchase rate from the test set is {:.1f}%'.format(
      eval_result['prediction/mean']*100, test_y.mean()*100))

In [None]:
# Inspect the last prediction dict left over from the prediction loop in the previous cell.
pred_dict

## **TensorFlow: Classification Using a Linear Classifier with Preprocessing**

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Google Drive so the CSV stored there can be read from this runtime.
drive.mount('/content/drive')

BGV = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/BGVData/BookersLastFiveYears.csv')

# Convert to datetime and fill missing waves with 0 so the column can be cast to int.
BGV['TourDate1'] = pd.to_datetime(BGV['TourDate1'])
BGV['TourWave'] = BGV['TourWave'].fillna(0).astype(int)

# Only look at showed tours and the relevant columns, then remove rows with empty data.
# dropna() returns a new frame, so the label assignment below never writes into a slice.
BGV = BGV.loc[BGV['TourStatus2'] == 'Showed',
              ['TourMonth', 'ProgramName', 'TourWave', 'ContractStatus1']].dropna(how="any")

# Binary label: 1 if the contract status is 'Active\r\nActive', 0 for every other status.
# Encoding by comparison (instead of replacing a hand-maintained list of the other statuses)
# guarantees that an unlisted status cannot survive as a string, and always yields an
# integer column without relying on `replace` to downcast it.
BGV['ContractStatus1'] = (BGV['ContractStatus1'] == 'Active\r\nActive').astype(int)

In [None]:
# Preprocess: divide TourMonth by 12 and TourWave by 1600 to bring the values toward 0-1.
# NOTE: re-running this cell divides the columns again; reload BGV first.
BGV = BGV.assign(
    TourMonth=BGV['TourMonth'] / 12,
    TourWave=BGV['TourWave'] / 1600,
)

In [None]:
# Positional hold-out split: first 57,250 rows train (about 80% per the original note),
# the rest test (about 20%).
train, test = BGV.iloc[:57250], BGV.iloc[57250:]

# Pop the purchase label out of each feature frame.
train_y, test_y = train.pop('ContractStatus1'), test.pop('ContractStatus1')

In [None]:
# Input Function
def input_fn(features, labels, training=True, batch_size=256):
    """Return a batched tf.data.Dataset of (feature dict, label) pairs.

    Args:
        features: feature table; converted with dict() so each column is a named feature.
        labels: labels aligned with the rows of `features`.
        training: when true, shuffle (buffer of 1000) and repeat indefinitely before batching.
        batch_size: number of rows per batch.
    """
    feature_map = dict(features)
    dataset = tf.data.Dataset.from_tensor_slices((feature_map, labels))

    if not training:
        # Evaluation: one ordered pass over the data.
        return dataset.batch(batch_size)

    return dataset.shuffle(1000).repeat().batch(batch_size)

In [None]:
# Categorical features become indicator (one-hot) columns; scaled features stay numeric.

CATEGORICAL_COLUMNS = ['ProgramName']
NUMERIC_COLUMNS = ['TourMonth', 'TourWave']

# One indicator column per categorical feature, using the values seen in the training frame.
my_feature_columns = [
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=column_name, vocabulary_list=train[column_name].unique()))
    for column_name in CATEGORICAL_COLUMNS
]

# The float32 numeric columns are appended after the categorical ones.
my_feature_columns.extend(
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in NUMERIC_COLUMNS
)

print(my_feature_columns)

In [None]:
# Linear classifier with two output classes (a linear model has no hidden layers).
classifier = tf.estimator.LinearClassifier(
    n_classes=2,
    feature_columns=my_feature_columns,
)

In [None]:
# Train for 5000 steps; the lambda gives train() the zero-argument callable it expects.
classifier.train(
    steps=5000,
    input_fn=lambda: input_fn(train, train_y, training=True),
)

In [None]:
# Evaluate on the testing set (single ordered pass) and print the accuracy.
eval_result = classifier.evaluate(input_fn=lambda: input_fn(test, test_y, training=False))
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(accuracy=eval_result['accuracy']))

In [None]:
# Turn the scaled columns back into readable integers for the user.
# - assign() builds a new frame, so nothing is written into a slice of BGV
#   (the in-place column assignment raised SettingWithCopyWarning).
# - round() removes the float error left by the /1600 and /12 round trip before the
#   cast; a bare int cast truncates, so a value such as 6.999999 would become 6.
# NOTE: run once per split -- re-running multiplies the columns again.
train = train.assign(
    TourWave=(train['TourWave'] * 1600).round().astype(int),
    TourMonth=(train['TourMonth'] * 12).round().astype(int),
)

In [None]:
# Compare the mean predicted probability on the test set with the observed purchase rate.
# The observed rate is test_y.mean(), i.e. it comes from the test set, so the message says so.
print('Algorithm predicted an average purchase rate of {:.1f}%, whereas the actual purchase rate from the test set is {:.1f}%'.format(
      eval_result['prediction/mean']*100, test_y.mean()*100))

In [None]:
# Request specific instance from user and calculate estimated purchase rate

def input_fn(features, batch_size=256):
    """Return a batched, label-free tf.data.Dataset for prediction.

    NOTE: this rebinds the name `input_fn`, replacing the (features, labels, training)
    version defined earlier in this section; re-run that cell before training or
    evaluating again.
    """
    unlabeled = tf.data.Dataset.from_tensor_slices(dict(features))
    return unlabeled.batch(batch_size)

predict = {}

# Retrieve Program Name from user. Only allow values present in the training frame.
print("Please type acceptable options as prompted.")
print("Acceptable options: " + str(train['ProgramName'].unique()))
valid = False
while not valid:
  val = input('Program Name: ')
  if str(val) in train['ProgramName'].unique():
    valid = True
  else:
    print('Please try again...')
predict['ProgramName'] = [str(val)]


# Retrieve Tour Month and Tour Wave from user. Only allow ints present in the training
# frame. Each entry is (column, prompt, scale); the model was trained on value / scale.
for column, prompt, scale in (('TourMonth', 'Tour Month: ', 12),
                              ('TourWave', 'Tour Wave: ', 1600)):
  # round() before the int cast: the columns went through a divide/multiply round trip,
  # and a bare int cast truncates (e.g. 6.999999 -> 6), which corrupts the option list.
  acceptable = np.sort(train[column].round().astype(int).unique())
  print("Acceptable options: "+str(acceptable))
  valid = False
  while not valid:
    val = input(prompt)
    # isdigit() rejects empty and non-numeric input before int() is attempted.
    if val.isdigit() and int(val) in acceptable:
      valid = True
    else:
      print('Please try again...')
  predict[column] = [int(val)/scale]


# Run the prediction.
predictions = classifier.predict(input_fn=lambda: input_fn(predict))
for pred_dict in predictions:
    class_id = pred_dict['class_ids'][0]
    # Label 1 means "purchased", so the chance to purchase is always P(class 1).
    # The previous `100 - 100 * probabilities[class_id]` was only right when the predicted
    # class was 0; for a predicted class of 1 it reported the chance of NOT purchasing.
    probability = pred_dict['probabilities'][1]

    print('Guest has a {:.1f}% chance to purchase'.format(100 * probability))