In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime
import matplotlib.pyplot as plt
# dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d %H')

Загружаем данные

In [2]:
may2016 = pd.read_csv('data/yellow_tripdata_2016-05.csv', parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

In [3]:
may2016.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2016-05-01,2016-05-01 00:17:31,1,3.6,-73.985901,40.76804,1,N,-73.983986,40.730099,1,15.0,0.5,0.5,1.5,0.0,0.3,17.8
1,2,2016-05-01,2016-05-01 00:07:31,1,1.68,-73.991577,40.744751,1,N,-73.9757,40.765469,1,7.5,0.5,0.5,0.88,0.0,0.3,9.68
2,2,2016-05-01,2016-05-01 00:07:01,6,1.09,-73.993073,40.741573,1,N,-73.980995,40.744633,1,6.5,0.5,0.5,1.56,0.0,0.3,9.36
3,2,2016-05-01,2016-05-01 00:19:47,1,4.21,-73.991943,40.684601,1,N,-74.002258,40.733002,1,17.0,0.5,0.5,3.66,0.0,0.3,21.96
4,2,2016-05-01,2016-05-01 00:06:39,1,0.56,-74.00528,40.740192,1,N,-73.997498,40.737564,1,6.0,0.5,0.5,1.46,0.0,0.3,8.76


In [4]:
may2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11836853 entries, 0 to 11836852
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        int64         
 4   trip_distance          float64       
 5   pickup_longitude       float64       
 6   pickup_latitude        float64       
 7   RatecodeID             int64         
 8   store_and_fwd_flag     object        
 9   dropoff_longitude      float64       
 10  dropoff_latitude       float64       
 11  payment_type           int64         
 12  fare_amount            float64       
 13  extra                  float64       
 14  mta_tax                float64       
 15  tip_amount             float64       
 16  tolls_amount           float64       
 17  improvement_surcharge  float64       
 18  total_amount        

Координаты прямоугольника Нью-Йорка

In [5]:
start_long = -74.25559
end_long = -73.70001
start_lat = 40.49612
end_lat = 40.91553

Очистка данных от ошибок

In [6]:
index_to_drop = may2016[(may2016.pickup_latitude < start_lat) | 
                       (may2016.pickup_latitude > end_lat) | 
                       (may2016.pickup_longitude < start_long) | 
                       (may2016.pickup_longitude > end_long) |
                       (may2016.passenger_count == 0) |
                       (may2016.trip_distance == 0)|
                       (may2016.tpep_dropoff_datetime <= may2016.tpep_pickup_datetime)].index
print(f'Количество удалённых строк: {index_to_drop.shape[0]}, в процентах {np.round(index_to_drop.shape[0] / may2016.shape[0] * 100, 2)}%')
print(f'Количество данных до фильтрации {may2016.shape[0]}')
may2016.drop(index=index_to_drop, inplace=True)
print(f'Количество строк после фильтрации {may2016.shape[0]}')

Количество удалённых строк: 210332, в процентах 1.78%
Количество данных до фильтрации 11836853
Количество строк после фильтрации 11626521


Присвоение поездкам регионов посадки в такси

In [7]:
statistic, x_edge, y_edge, binnumber = stats.binned_statistic_2d(may2016.pickup_longitude.to_numpy(),
                                                                 may2016.pickup_latitude.to_numpy(),
                                                                 None, 
                                                                 'count', 
                                                                 bins=[50, 50],
                                                                 range=[[start_long, end_long], [start_lat, end_lat]],
                                                                 expand_binnumbers=True)
pickup_regions = (binnumber[0] - 1) * 50 + binnumber[1]
may2016['pickup_region'] = pd.Series(pickup_regions, index=may2016.index)
may2016.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,pickup_region
0,1,2016-05-01,2016-05-01 00:17:31,1,3.6,-73.985901,40.76804,1,N,-73.983986,40.730099,1,15.0,0.5,0.5,1.5,0.0,0.3,17.8,1233
1,2,2016-05-01,2016-05-01 00:07:31,1,1.68,-73.991577,40.744751,1,N,-73.9757,40.765469,1,7.5,0.5,0.5,0.88,0.0,0.3,9.68,1180
2,2,2016-05-01,2016-05-01 00:07:01,6,1.09,-73.993073,40.741573,1,N,-73.980995,40.744633,1,6.5,0.5,0.5,1.56,0.0,0.3,9.36,1180
3,2,2016-05-01,2016-05-01 00:19:47,1,4.21,-73.991943,40.684601,1,N,-74.002258,40.733002,1,17.0,0.5,0.5,3.66,0.0,0.3,21.96,1173
4,2,2016-05-01,2016-05-01 00:06:39,1,0.56,-74.00528,40.740192,1,N,-73.997498,40.737564,1,6.0,0.5,0.5,1.46,0.0,0.3,8.76,1130


Определение региона Empire State Building

In [8]:
empire_state_long = -73.985428
empire_state_lat = 40.748817
empire_state_bin = stats.binned_statistic_2d([empire_state_long],
                          [empire_state_lat],
                          None,
                          'count',
                          bins=[50, 50],
                          range=[[start_long, end_long], [start_lat, end_lat]],
                          expand_binnumbers=True).binnumber
empire_state_region = (empire_state_bin[0] - 1) * 50 + empire_state_bin[1] - 1

Удаление минут и секунд из времени начала поездки

In [10]:
may2016['pickup_day_hour'] = may2016.tpep_pickup_datetime.apply(lambda x: x.replace(minute=0, second=0))
may2016.head(n=150)

MemoryError: 

Создание аггрегированного дата фрейма

In [None]:
date_ind = may2016.groupby(by='pickup_day_hour').count().index
mult_ind = pd.MultiIndex.from_product([date_ind, list(range(1, 2501))], names=['pickup_day_hour', 'pickup_region'])
summary = pd.DataFrame(index=mult_ind)
summary['rides'] = may2016.groupby(by=['pickup_day_hour', 'pickup_region']).count().iloc[:, 0]
summary.fillna(0, inplace=True)
summary.rides = summary.apply(int, axis=1)

In [None]:
summary.head(1019)

Поездки из ячейки с Empire State Building

In [None]:
empire_state_rides = pd.DataFrame(summary.unstack().iloc[:, empire_state_region])

In [None]:
empire_state_rides

In [None]:
plt.figure(figsize=(15,5))
plt.plot(empire_state_rides)
plt.show()

Количество пар с час-ячейка с нулевым количеством поездок

In [None]:
print(f'Количество пар: {summary.value_counts()[0]}')