In [None]:
#Building a machine Learning algorithm to predict the best zone to be in depending on the time of day 

In [4]:
from sklearn.model_selection import train_test_split # Newer versions
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

gnb= GaussianNB()
zero_r = DummyClassifier(strategy='most_frequent')
#Comparing both to a baseline classifier







In [6]:
df=pd.read_csv('../preprocessed data/6monthdata.csv', sep=',')

In [7]:
fullData = df.to_numpy()
colNames = df.columns.to_numpy()



print( colNames)

total_amount = df['total_amount']



['Unnamed: 0' 'VendorID' 'tpep_pickup_datetime' 'tpep_dropoff_datetime'
 'passenger_count' 'trip_distance' 'RatecodeID' 'store_and_fwd_flag'
 'PULocationID' 'DOLocationID' 'payment_type' 'fare_amount' 'extra'
 'mta_tax' 'tip_amount' 'tolls_amount' 'improvement_surcharge'
 'total_amount']


In [8]:
#Labelling rows that have a total_amount greater than the overall mean as "High Revenue" 
# and labelling as "Low Revenue" otherwise

mean = df['total_amount'].mean()
print(mean)


df['revenue_zone_type'] = np.where(df['total_amount']>mean, "High Revenue", "Low Revenue")


16.052386377345805


In [9]:
df['revenue_zone_type']

0            Low Revenue
1            Low Revenue
2            Low Revenue
3           High Revenue
4           High Revenue
                ...     
52381447     Low Revenue
52381448    High Revenue
52381449     Low Revenue
52381450     Low Revenue
52381451     Low Revenue
Name: revenue_zone_type, Length: 52381452, dtype: object

In [10]:
df['tpep_pickup_datetime'][0].split()

#splitting the pickup date time field into two separate columns

Splitdf = pd.DataFrame(df['tpep_pickup_datetime'].str.split(' ',1).tolist(),columns = ['Date','Time'])




In [53]:
Splitdf

Unnamed: 0,Date,Time
0,2018-01-01,00:21:05
1,2018-01-01,00:44:55
2,2018-01-01,00:08:26
3,2018-01-01,00:20:22
4,2018-01-01,00:09:18
...,...,...
52381447,2018-06-30,23:45:36
52381448,2018-06-30,23:09:48
52381449,2018-06-30,23:39:24
52381450,2018-06-30,23:24:13


In [11]:
#Extracting Specific features from Date and Time Dfs
Date = pd.DataFrame(Splitdf['Date'].str.split('-',2).tolist(),columns = ['Year','Month','Day'])

Time = pd.DataFrame(Splitdf['Time'].str.split(':',2).tolist(),columns = ['Hour','Minute','Second'])



In [12]:
#Extracting Month and Hour as these are the most relevant 
#MonthHourdf = [Date['Month'],Time["Hour"]]
MonthHourdf = pd.concat([Date['Month'], Time["Hour"]], axis=1, keys=['Month', 'Hour'])
MonthHourdf

Unnamed: 0,Month,Hour
0,01,00
1,01,00
2,01,00
3,01,00
4,01,00
...,...,...
52381447,06,23
52381448,06,23
52381449,06,23
52381450,06,23


Removing VendorID, Store_and_fwd_flag, Payment_type as they are not relevant

Also removing tip_amount, mta_tax, tolls_amount as they are direct predictors of fare_amount (also they aren't feasable predictors you can't know how much a trips is going to cost before starting).

Need to process date and time as these will might be more useful

Convert to 2 separate columns

one with the approximate hour 
the other with the date



In [13]:

DTnp = MonthHourdf.to_numpy()

#X comprising of passenger_count, trip_distance, RatecodeID, PULocationID and mta_tax, Hour and Month
X = fullData[:,[4,5,6,8,13]]



print(X.shape)
print(DTnp.shape)

X = np.concatenate((X, DTnp),axis = 1)


print(X.shape)

#Response vector
y = df['revenue_zone_type'].to_numpy()



(52381452, 5)
(52381452, 2)
(52381452, 7)


In [14]:

gnb_accs = []

#splitting data into training and testing groups
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
print('X_train: {} X_test: {}'.format(X_train.shape, X_test.shape))
print('y_train: {} y_test: {}'.format(y_train.shape, y_test.shape))



X_train: (35095572, 7) X_test: (17285880, 7)
y_train: (35095572,) y_test: (17285880,)


In [17]:
X_train

array([[1, 0.88, 1, ..., 0.5, '01', '10'],
       [1, 2.33, 1, ..., 0.5, '01', '11'],
       [5, 19.58, 2, ..., 0.5, '06', '18'],
       ...,
       [1, 0.81, 1, ..., 0.5, '03', '10'],
       [1, 0.9, 1, ..., 0.5, '05', '16'],
       [1, 0.9, 1, ..., 0.5, '02', '19']], dtype=object)

In [18]:
gnb.fit(X_train, y_train)

#looking at converting date/time data to just 24hour time

GaussianNB()

In [19]:
#predicing test data
prediction = gnb.predict(X_test)


0.7371778006095148


In [28]:
gnb_acc = accuracy_score(y_test,prediction)
print(gnb_acc)

0.7371778006095148


In [30]:
#comparing to 0-R classifier

zero_r.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [31]:
or_acc = accuracy_score(zero_r.predict(X_test),y_test)
print(or_acc)

0.7121026525696117


# Evaluating the model on unseen data


In [32]:
feb19df = pd.read_csv("../raw_data/yellow_tripdata_2019-02.csv")

In [33]:
# Assigning the type of zone


feb19df['revenue_zone_type'] = np.where(feb19df['total_amount']>mean, "High Revenue", "Low Revenue")


In [34]:
feb19df['tpep_pickup_datetime'][0].split()

#splitting the pickup date time field into two separate columns

Splitdf = pd.DataFrame(feb19df['tpep_pickup_datetime'].str.split(' ',1).tolist(),columns = ['Date','Time'])


#Extracting Specific features from Date and Time Dfs
Date = pd.DataFrame(Splitdf['Date'].str.split('-',2).tolist(),columns = ['Year','Month','Day'])

Time = pd.DataFrame(Splitdf['Time'].str.split(':',2).tolist(),columns = ['Hour','Minute','Second'])

#Extracting Month and Hour as these are the most relevant 
#MonthHourdf = [Date['Month'],Time["Hour"]]
MonthHourdf = pd.concat([Date['Month'], Time["Hour"]], axis=1, keys=['Month', 'Hour'])
print(MonthHourdf)


DTnp19 = MonthHourdf.to_numpy()

        Month Hour
0          02   00
1          02   00
2          02   00
3          02   00
4          02   00
...       ...  ...
7019370    02   23
7019371    02   22
7019372    02   23
7019373    02   23
7019374    02   23

[7019375 rows x 2 columns]


In [35]:
feb19df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,revenue_zone_type
0,1,2019-02-01 00:59:04,2019-02-01 01:07:27,1,2.1,1,N,48,234,1,9.0,0.5,0.5,2.0,0.0,0.3,12.3,0.0,Low Revenue
1,1,2019-02-01 00:33:09,2019-02-01 01:03:58,1,9.8,1,N,230,93,2,32.0,0.5,0.5,0.0,0.0,0.3,33.3,0.0,High Revenue
2,1,2019-02-01 00:09:03,2019-02-01 00:09:16,1,0.0,1,N,145,145,2,2.5,0.5,0.5,0.0,0.0,0.3,3.8,0.0,Low Revenue
3,1,2019-02-01 00:45:38,2019-02-01 00:51:10,1,0.8,1,N,95,95,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8,0.0,Low Revenue
4,1,2019-02-01 00:25:30,2019-02-01 00:28:14,1,0.8,1,N,140,263,2,5.0,0.5,0.5,0.0,0.0,0.3,6.3,0.0,Low Revenue
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7019370,2,2019-02-28 23:29:08,2019-02-28 23:29:11,1,0.0,1,N,193,193,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Revenue
7019371,2,2019-02-28 22:48:47,2019-02-28 23:50:19,1,0.0,1,N,141,193,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,Low Revenue
7019372,2,2019-02-28 23:41:23,2019-02-28 23:42:23,1,0.0,1,N,264,264,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Revenue
7019373,2,2019-02-28 23:12:52,2019-02-28 23:14:16,1,0.0,1,N,264,193,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Revenue


In [36]:
# Extracting the rows

full2019 = feb19df.to_numpy()

print(feb19df.columns)

X19 = full2019[:,[3,4,5,7,12]]


print(X.shape)
print(DTnp19.shape)

X19 = np.concatenate((X19, DTnp19),axis = 1)


#print(X.shape)

y19 = feb19df['revenue_zone_type'].to_numpy()

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'revenue_zone_type'],
      dtype='object')
(52381452, 7)
(7019375, 2)


In [37]:
#Ensuring the shapes are the same
print(X19.shape)
print(y19.shape)

(7019375, 7)
(7019375,)


In [39]:
gnb_acc = accuracy_score(y19,gnb.predict(X19))
print(gnb_acc)

0.6409372629329534


In [40]:

or2019_acc = accuracy_score(y19,zero_r.predict(X19))
print(or2019_acc)

0.6120519277001157
