[View in Colaboratory](https://colab.research.google.com/github/AmoDinho/Machine-Learning-Crash-Course-with-TF/blob/master/examples/NYC_Taxis_I.ipynb)

# NYC Taxi Trip Data for 2016

The goal of this study is to predict the total trip amount. We will use TensorFlow to do this and, most importantly, linear regression.

# Learning Objectives
* Use the `LinearRegressor` class in TensorFlow to predict the total trip amount from the other trip features.

# Setup


In [2]:
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Only surface TensorFlow messages at ERROR level to keep cell output short.
tf.logging.set_verbosity(tf.logging.ERROR)

# Pandas display settings: show at most 10 rows and render floats with one
# decimal place.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", '{:.1f}'.format)

Let's load our dataset.

In [42]:
# Location of the comma-separated taxi trips file (comma is read_csv's
# default delimiter, so it is not passed explicitly).
taxi_csv_url = "https://storage.googleapis.com/machinelearning_datasets/tlc_yellow_trips_2016.csv"
df_taxi = pd.read_csv(taxi_csv_url)

In [43]:
# Print a structural summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
df_taxi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
vendor_id          2000 non-null int64
passenger_count    2000 non-null int64
trip_distance      2000 non-null float64
fare_amount        2000 non-null float64
extra              2000 non-null float64
mta_tax            2000 non-null float64
tip_amount         2000 non-null float64
tolls_amount       2000 non-null float64
imp_surcharge      2000 non-null float64
total_amount       2000 non-null float64
dtypes: float64(8), int64(2)
memory usage: 156.3 KB


In [40]:
# Summary statistics (count, mean, std, min, quartiles, max) for the frame.
# NOTE(review): the output saved with this cell is a single-column summary
# (Name: trip_distance), so it does not match this call -- re-run the
# notebook from a fresh kernel to refresh it.
df_taxi.describe()

count   2000.0
mean       3.1
std        3.8
min        0.0
25%        0.9
50%        1.7
75%        3.3
max       25.0
Name: trip_distance, dtype: float64

# Preprocess Features


In [64]:
def preprocess_features(df_taxi, include_target=True):
  """
  Prepares input features from df_taxi data set.

  Args:
    df_taxi: A Pandas DataFrame expected to contain data
      from the NYC Taxi data set.
    include_target: When True (the default, which preserves the existing
      behaviour) the "total_amount" column is kept in the result. That column
      is also the label returned by preprocess_targets, so training a model on
      it leaks the answer; pass False to get a leakage-free feature set.
  Returns:
    A new DataFrame (a copy, so edits do not touch df_taxi) holding the
    selected columns. No synthetic features are added.
  """

  feature_names = [
      "vendor_id",
      "passenger_count",
      "trip_distance",
      "fare_amount",
      "extra",
      "mta_tax",
      "tip_amount",
      "tolls_amount",
      "imp_surcharge",
      "total_amount",
  ]

  if not include_target:
    # Drop the label so it cannot be used as an input to the model.
    feature_names.remove("total_amount")

  return df_taxi[feature_names].copy()


def preprocess_targets(df_taxi):
  """
  Builds the label frame for the model from the df_taxi data set.

  Args:
    df_taxi: A Pandas DataFrame expected to contain data
      from the NYC Taxi data set.
  Returns:
    A single-column DataFrame named "total_amount" that carries the same
    index as df_taxi.
  """

  # Build the one-column frame directly from the label Series.
  return pd.DataFrame({"total_amount": df_taxi["total_amount"]})

In [65]:
# Split the rows into a training set and a validation set at one boundary.
# Slicing on a single index guarantees the two sets never overlap and that no
# row is dropped, whatever the number of rows in df_taxi. For the 2000-row
# dataset this yields the first 1600 rows for training and the last 400 for
# validation.
# NOTE(review): rows are taken in file order without shuffling -- confirm the
# CSV is not sorted in a way that biases the split.
num_training_examples = 1600

training_rows = df_taxi.iloc[:num_training_examples]
validation_rows = df_taxi.iloc[num_training_examples:]

training_examples = preprocess_features(training_rows)
training_targets = preprocess_targets(training_rows)

validation_examples = preprocess_features(validation_rows)
validation_targets = preprocess_targets(validation_rows)

# Double-check that we've done the right thing.
summaries = [
    ("Training examples summary:", training_examples),
    ("Validation examples summary:", validation_examples),
    ("Training targets summary:", training_targets),
    ("Validation targets summary:", validation_targets),
]
for summary_title, summary_frame in summaries:
  print(summary_title)
  display.display(summary_frame.describe())


Training examples summary:


Unnamed: 0,vendor_id,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,imp_surcharge,total_amount
count,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0
mean,3.0,1.0,3.1,13.7,0.4,0.5,1.3,0.4,0.3,16.6
std,0.0,0.1,3.8,11.6,1.0,0.0,4.2,2.0,0.0,14.3
min,3.0,1.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,3.3
25%,3.0,1.0,1.0,6.5,0.0,0.5,0.0,0.0,0.3,8.3
50%,3.0,1.0,1.8,9.5,0.5,0.5,0.0,0.0,0.3,11.8
75%,3.0,1.0,3.4,15.5,0.5,0.5,1.6,0.0,0.3,18.3
max,3.0,4.0,25.0,120.0,18.0,0.5,126.0,37.0,0.6,145.0


Validation examples summary:


Unnamed: 0,vendor_id,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,imp_surcharge,total_amount
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,3.0,1.0,2.8,12.9,0.4,0.5,1.1,0.3,0.3,15.6
std,0.0,0.0,3.7,13.5,0.9,0.0,2.4,1.4,0.0,16.3
min,3.0,1.0,0.0,2.5,0.0,0.5,0.0,0.0,0.3,3.3
25%,3.0,1.0,0.9,6.0,0.0,0.5,0.0,0.0,0.3,7.8
50%,3.0,1.0,1.6,9.0,0.5,0.5,0.0,0.0,0.3,11.3
75%,3.0,1.0,2.8,13.5,0.5,0.5,1.7,0.0,0.3,16.3
max,3.0,1.0,25.0,180.0,17.5,0.5,27.9,16.0,0.6,214.3


Training targets summary:


Unnamed: 0,total_amount
count,1600.0
mean,16.6
std,14.3
min,3.3
25%,8.3
50%,11.8
75%,18.3
max,145.0


Validation targets summary:


Unnamed: 0,total_amount
count,400.0
mean,15.6
std,16.3
min,3.3
25%,7.8
50%,11.3
75%,16.3
max,214.3


# Let's make a good feature set

We will use a correlation matrix to see how strongly each feature is correlated with the target.

In [67]:
# Attach the label as a "target" column on a copy of the training features,
# then show the pairwise correlation matrix.
# NOTE(review): training_examples already carries total_amount, so "target"
# duplicates that column in the matrix.
corr_df = training_examples.assign(target=training_targets["total_amount"])
corr_df.corr()

Unnamed: 0,vendor_id,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,imp_surcharge,total_amount,target
vendor_id,,,,,,,,,,,
passenger_count,,1.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0
trip_distance,,0.0,1.0,0.9,0.2,0.0,0.2,0.5,0.2,0.9,0.9
fare_amount,,0.0,0.9,1.0,0.1,0.0,0.2,0.5,0.2,0.9,0.9
extra,,-0.0,0.2,0.1,1.0,-0.0,0.1,0.6,0.8,0.3,0.3
...,...,...,...,...,...,...,...,...,...,...,...
tip_amount,,-0.0,0.2,0.2,0.1,0.0,1.0,0.1,0.1,0.5,0.5
tolls_amount,,0.0,0.5,0.5,0.6,0.0,0.1,1.0,0.6,0.6,0.6
imp_surcharge,,-0.0,0.2,0.2,0.8,0.4,0.1,0.6,1.0,0.3,0.3
total_amount,,0.0,0.9,0.9,0.3,0.0,0.5,0.6,0.3,1.0,1.0
