# CS480: Database Systems, Group Project
### Green Taxi Datasets

In [1]:
#import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta
import sqlite3
from hopcroftkarp import HopcroftKarp
import time
from itertools import chain
from functools import reduce

#### 1. First, we will import the data in the system

In [2]:
green = pd.read_csv('green_tripdata_2015_6months_cleaned.csv')

#### 2. Now, we will get the summary of the datasets and then we will clean up the dataset

In [None]:
green.describe()
# print(green.dtypes)

In [4]:
# Getting the values which needs to be cleaned up before procedding further
print("\u0332".join('Number of Null data values in each columns:'))
print(green.isnull().sum())
print('')
print("\u0332".join('Number of Datasets:'), len(green.index))

N̲u̲m̲b̲e̲r̲ ̲o̲f̲ ̲N̲u̲l̲l̲ ̲d̲a̲t̲a̲ ̲v̲a̲l̲u̲e̲s̲ ̲i̲n̲ ̲e̲a̲c̲h̲ ̲c̲o̲l̲u̲m̲n̲s̲:
trip_id                  0
VendorID                 0
lpep_pickup_datetime     0
Lpep_dropoff_datetime    0
Pickup_longitude         0
Pickup_latitude          0
Dropoff_longitude        0
Dropoff_latitude         0
Passenger_count          0
Trip_distance            0
dtype: int64

N̲u̲m̲b̲e̲r̲ ̲o̲f̲ ̲D̲a̲t̲a̲s̲e̲t̲s̲: 5664142


#### 3. Now, filter out the data and clean it up again and to get the data we need use for our algorithms

In [4]:
dataset = green[['VendorID', 'lpep_pickup_datetime', 'Lpep_dropoff_datetime', 'Pickup_longitude', 'Pickup_latitude', 'Dropoff_longitude', 'Dropoff_latitude', 'Passenger_count', 'Trip_distance']]

# Here we are storing all the column names in a array named 'columns'
columns = dataset.columns
print("\u0332".join('Column Names:'), columns)

print("\u0332".join('Dataset:'), len(dataset.index))
dataset = dataset[ (dataset.lpep_pickup_datetime != dataset.Lpep_dropoff_datetime) & (dataset.Trip_distance != 0) & (dataset.Passenger_count < 3) & (dataset['Trip_distance'] != dataset['Trip_distance'].max())]

print("\u0332".join('Filtered Dataset:'), len(dataset.index))

C̲o̲l̲u̲m̲n̲ ̲N̲a̲m̲e̲s̲: Index(['VendorID', 'lpep_pickup_datetime', 'Lpep_dropoff_datetime',
       'Pickup_longitude', 'Pickup_latitude', 'Dropoff_longitude',
       'Dropoff_latitude', 'Passenger_count', 'Trip_distance'],
      dtype='object')
D̲a̲t̲a̲s̲e̲t̲: 5664142
F̲i̲l̲t̲e̲r̲e̲d̲ ̲D̲a̲t̲a̲s̲e̲t̲: 5638470


In [6]:
print("\u0332".join('Total Distance Travelled:'), dataset['Trip_distance'].sum())

T̲o̲t̲a̲l̲ ̲D̲i̲s̲t̲a̲n̲c̲e̲ ̲T̲r̲a̲v̲e̲l̲l̲e̲d̲: 16332776.31000001


In [10]:
# Correcting the date and time format
dataset['lpep_pickup_datetime'] = pd.to_datetime(dataset['lpep_pickup_datetime'])
dataset['Lpep_dropoff_datetime'] = pd.to_datetime(dataset['Lpep_dropoff_datetime'])
dataset.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,Lpep_dropoff_datetime,Pickup_longitude,Pickup_latitude,Dropoff_longitude,Dropoff_latitude,Passenger_count,Trip_distance
862,2,2015-01-01,2015-01-01 00:03:00,-73.918648,40.759354,-73.912155,40.767422,1,0.6
924,2,2015-01-01,2015-01-01 00:03:00,-73.892464,40.746998,-73.911209,40.744617,1,0.99
725,2,2015-01-01,2015-01-01 00:04:00,-73.891365,40.746635,-73.896118,40.738361,1,0.87
1923,1,2015-01-01,2015-01-01 00:04:00,-73.807426,40.700104,-73.806,40.693253,1,0.6
1573,1,2015-01-01,2015-01-01 00:05:00,-73.918846,40.743019,-73.901253,40.745758,1,1.0


#### 4. Defining distance and time difference functions

In [8]:
from math import radians, cos, sin, asin, sqrt 
def distance(lat1, lon1, lat2, lon2): 
    
#     start = time.time()
    
    # The math module contains a function named 
    # radians which converts from degrees to radians. 
    lon1 = radians(lon1) 
    lon2 = radians(lon2) 
    lat1 = radians(lat1) 
    lat2 = radians(lat2) 
       
    # Haversine formula  
    dlon = lon2 - lon1  
    dlat = lat2 - lat1 

    c = (3956 * 2 * asin(sqrt(sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2)))

#     end = time.time()
#     print(end-start)
    
    return c
     
    # Radius of earth in kilometers. Use 3956 for miles 
    # calculate the result 
    # (c * r) 

def time_difference(time1, time2):
    time_diff = (time2 - time1)
    time_seconds = abs(time_diff.total_seconds())
    
    return time_seconds / 60.0

print(columns)

Index(['VendorID', 'lpep_pickup_datetime', 'Lpep_dropoff_datetime',
       'Pickup_longitude', 'Pickup_latitude', 'Dropoff_longitude',
       'Dropoff_latitude', 'Passenger_count', 'Trip_distance'],
      dtype='object')


In [6]:
dataset = dataset.sort_values(by=['lpep_pickup_datetime', 'Lpep_dropoff_datetime'])

Time Taken to process:  2.17171049118042


#### 5.Getting Average Speed for each trip

In [11]:
dataset['Average Speed (MPH)'] = dataset.apply(lambda row : row.Trip_distance / (time_difference(row.lpep_pickup_datetime, row.Lpep_dropoff_datetime) / 60.0), axis = 1)

Time Taken to process:  176.72743797302246


In [11]:
dataset = dataset.reset_index(drop=True)

In [12]:
dataset

Unnamed: 0,VendorID,lpep_pickup_datetime,Lpep_dropoff_datetime,Pickup_longitude,Pickup_latitude,Dropoff_longitude,Dropoff_latitude,Passenger_count,Trip_distance,Average Speed (MPH)
0,2,2015-01-01 00:00:00,2015-01-01 00:03:00,-73.918648,40.759354,-73.912155,40.767422,1,0.60,12.000000
1,2,2015-01-01 00:00:00,2015-01-01 00:03:00,-73.892464,40.746998,-73.911209,40.744617,1,0.99,19.800000
2,2,2015-01-01 00:00:00,2015-01-01 00:04:00,-73.891365,40.746635,-73.896118,40.738361,1,0.87,13.050000
3,1,2015-01-01 00:00:00,2015-01-01 00:04:00,-73.807426,40.700104,-73.806000,40.693253,1,0.60,9.000000
4,1,2015-01-01 00:00:00,2015-01-01 00:05:00,-73.918846,40.743019,-73.901253,40.745758,1,1.00,12.000000
...,...,...,...,...,...,...,...,...,...,...
5638465,2,2015-12-01 23:59:00,2015-01-13 00:14:00,-73.957588,40.671902,-73.955826,40.719593,1,4.00,0.000516
5638466,2,2015-12-01 23:59:00,2015-01-13 00:14:00,-73.953896,40.663887,-73.916283,40.686119,2,3.33,0.000430
5638467,2,2015-12-01 23:59:00,2015-01-13 00:15:00,-73.954567,40.718754,-73.978569,40.665421,1,7.25,0.000935
5638468,1,2015-12-01 23:59:00,2015-01-13 00:24:00,-73.988892,40.692158,-73.988770,40.777615,1,7.20,0.000929


#### 6.Define mergeable_check. This will check if two trips are mergeable with the given delay.

In [13]:
def mergeable_check(trip1, trip2, delay):
    
#     avg_speed = (trip1['Average Speed (MPH)'] + trip2['Average Speed (MPH)'])/2
    
    Do1o2 = distance(trip1['Pickup_latitude'], trip1['Pickup_longitude'], trip2['Pickup_latitude'], trip2['Pickup_longitude'])
    Dd1d2 = distance(trip1['Dropoff_latitude'], trip1['Dropoff_longitude'], trip2['Dropoff_latitude'], trip2['Dropoff_longitude'])

    Do2d1 = distance(trip2['Pickup_latitude'], trip2['Pickup_longitude'], trip1['Dropoff_latitude'], trip1['Dropoff_longitude'])
    Do2d2 = distance(trip2['Pickup_latitude'], trip2['Pickup_longitude'], trip2['Dropoff_latitude'], trip2['Dropoff_longitude'])
    
#     To1o2 = Do1o2/avg_speed
#     Td1d2 = Dd1d2/avg_speed
    
#     To2d1 = Do2d1/avg_speed
#     To2d2 = Do2d2/avg_speed
    
#     print(max([Do2d1, Do2d2]) + Do1o2 + Dd1d2, "|", trip1['Trip_distance'] + trip2['Trip_distance'])
    
#     sequence1 = False
#     sequence2 = False
    
#     sequence1Time = Trip1['lpep_pickup_datetime']
#     sequence2Time = Trip2['lpep_pickup_datetime']
    
    dist = max([Do2d1, Do2d2]) + Do1o2 + Dd1d2
    if (dist < trip1['Trip_distance'] + trip2['Trip_distance']):
#         print('True', dist, trip1['Trip_distance'] + trip2['Trip_distance'])
        return (True, (trip1['Trip_distance'] + trip2['Trip_distance']) - dist)
    else:
#         print('False', dist, trip1['Trip_distance'] + trip2['Trip_distance'])
        return (False, 0)
    
#     if (sequence1 or sequence2): return True

#### 7. Function to iterate through the dataset

In [24]:
trips_processed = 0
mergeable_trips = 0

def shared_trips_eval(dataset, delay, length):    
    
    global trips_processed 
    global mergeable_trips
    
    rides_dict = {}
    rides = {}
    
    for index1 in range(length):
        index2 = index1 + 1
        Trip1 = dataset.iloc[index1]
        
        trips_processed += 1
        count = 0
        while (index2 < length):
            count += 1
            Trip2 = dataset.iloc[index2]
            if (Trip1['Passenger_count'] + Trip2['Passenger_count'] <= 3):
                if (time_difference(Trip1['lpep_pickup_datetime'], Trip2['lpep_pickup_datetime']) > delay or time_difference(Trip1['Lpep_dropoff_datetime'], Trip2['Lpep_dropoff_datetime']) > delay):
                    break
            
            mergeable, dist = mergeable_check(Trip1, Trip2, delay)
            if (mergeable):
                rides.update({index1: {dist}})
                mergeable_trips += 1
            index2 = index2 + 1
        #print(index1)
    return rides



#### 8. Getting mergeable trips

In [34]:
start = time.time()
# 1 Month
#merged_trips = shared_trips_eval(dataset[:1048576], 15.0, len(dataset[:1048576]))
# 3 Months
#merged_trips = shared_trips_eval(dataset[:3145728], 10.0, len(dataset[:3145728]))
# 6 Months
merged_trips = shared_trips_eval(dataset, 5, len(dataset))
end = time.time()

#### 9. Maximum Matching

In [35]:
max_matched = HopcroftKarp(merged_trips).maximum_matching(keys_only=True)

#### 10. Output

In [38]:
miles_saved = (sum(max_matched.values()))
total_distance = dataset['Trip_distance'].sum()

In [39]:
print('')
print('Trips processed: ', trips_processed) 
print('MergeAble Trips: ', len(max_matched)) 
print('Time Taken to process: ' ,end-start)
print('Total Miles saved: ', sum(max_matched.values()))
print('Percentage Miles Saved: ', miles_saved/total_distance * 100)
print('Percentage Trips Mergeable: ', len(max_matched)/trips_processed * 100)


Trips processed:  5243880
MergeAble Trips:  1491545
Time Taken to process:  16190.683989286423
Total Miles saved:  2174318.1917225076
Percentage Miles Saved:  13.312606200277466
Percentage Trips Mergeable:  28.443537990953267
