In [1]:
import json
from utils import *

In [2]:
#--------------------
# Parameters
#--------------------
# Dataset processed by this run. Any value other than 'taxi' or 'bikeshare'
# falls through to the 911 settings.
data_name = 'taxi'
print("Load configuration...")
with open('parameters.json') as json_file:
    parameters = json.load(json_file)

# (root, year) per dataset; the fallback pair is used for every other name.
_DATASET_SETTINGS = {
    'taxi': ("D:/all-data/nyc-taxi/raw_chunk_data/", "2016"),
    'bikeshare': ("D:/all-data/nyc-bikeshare/raw_data/", "2021"),
}
_FALLBACK_SETTINGS = ("D:/all-data/nyc-911/911.csv", "2021")
parameters['root'], parameters['year'] = _DATASET_SETTINGS.get(data_name, _FALLBACK_SETTINGS)

Load configuration...


In [10]:
# Pull the settings used by the following cells out of the loaded configuration.
year, root = parameters['year'], parameters['root']
column_names, uni_columns = parameters['column_names'], parameters['uni_columns']

# Latitude / longitude limits applied when subsetting the raw records.
lat_upper, lat_bottom = parameters['lat_upper'], parameters['lat_bottom']
long_right, long_left = parameters['long_right'], parameters['long_left']

# Tract-level geodata. get_boundary / get_geodata come from utils;
# NOTE(review): their exact return types are not visible here -- confirm.
tract_root, boundary_root = parameters['tract'], parameters['boundary']
boundary = get_boundary(boundary_root, parameters)
geodata_tract = geopandas.GeoDataFrame(
    get_geodata(tract_root, boundary, data_name, 'tract'),
    geometry='geometry',
)

In [11]:
print(f"   -- Load {year} data...")
dirt = root + year + '/'
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of whole_data reproducible across runs and machines.
files = sorted(os.listdir(dirt))
variables = column_names[year]
frames = []
for file in files:

    data = pd.read_csv(os.path.join(dirt, file))
    # keep only the columns configured for this year, under the unified names
    data = data[variables]
    data.columns = uni_columns

    ## subset data to regions of our interest
    ## (for faster processing purpose as well)
    data = data.loc[(data.lat <= lat_upper) &
                    (data.lat >= lat_bottom) &
                    (data.long <= long_right) &
                    (data.long >= long_left)]
    data = geopandas.GeoDataFrame(data, geometry=geopandas.points_from_xy(data.long, data.lat))
    frames.append(data)

# Concatenate the per-file frames directly instead of going through
# `.values` + np.concatenate, which flattened every column to object dtype.
# ignore_index=True gives the same 0..n-1 row labels as before.
whole_data = pd.concat(frames, ignore_index=True)
whole_data = geopandas.GeoDataFrame(whole_data, geometry='geometry')

## extract date & time (fixed character positions of the pickup_datetime string)
pickup_records = whole_data.pickup_datetime
if year != '2016':
    dates = [record[5:10] for record in pickup_records]
    times = [record[11:13] for record in pickup_records]
else:
    dates = [record[:5] for record in pickup_records]
    # hour digits followed by the last two characters of the record
    # (presumably the AM/PM marker -- confirm against the raw files)
    times = [record[11:13] + record[-2:] for record in pickup_records]
    times = np.array(list(map(change_time_format, times)))

    ## 12 -> 12AM; should be changed to 00
    ## 24 -> 12PM; should stay as 12
    times[times == '12'] = '00'
    times[times == '24'] = '12'
whole_data['date'] = dates
whole_data['time'] = times

## unique dates & unique times & segmentations
UNIQUE_DATES = np.unique(dates)
UNIQUE_TIME = np.unique(times)

   -- Load 2016 data...


In [12]:
#----------------------
# Count Function
#----------------------
def count(geodata,
          hourly_data):
    """Count how many rows of ``hourly_data`` are spatially joined to each row of ``geodata``.

    Parameters
    ----------
    geodata : GeoDataFrame
        Left side of the spatial join. Its index must be unique; it no longer
        has to be the default 0..n-1 range.
    hourly_data : GeoDataFrame
        Right side of the spatial join.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(geodata)``. Entry ``i`` is the number of
        joined rows for the ``i``-th row of ``geodata``; rows without any
        match keep a count of 0.
    """
    joined = geopandas.sjoin(geodata, hourly_data)
    data_count = np.zeros(len(geodata))
    # The joined frame repeats the label of each geodata row once per match,
    # so counting label occurrences gives the per-row totals.
    matches_per_label = joined.index.value_counts()
    # Translate labels into positions rather than using the labels themselves
    # as array indices, which was only correct for a default RangeIndex.
    positions = geodata.index.get_indexer(matches_per_label.index)
    data_count[positions] = matches_per_label.values
    return data_count

In [49]:
# 06/01, 12AM-1AM  (time '00' is the 12AM hour)
uni_date, uni_time = '06/01', '00'
query = f"date == '{uni_date}' & time == '{uni_time}'"

# Records for the selected date/hour; reset_index() keeps the previous row
# labels in an 'index' column.
hourly_data = whole_data.query(query).reset_index()

# Spatial join of the tracts with the selected points, then record each
# joined row's tract label in the 'index' column.
count_data = geopandas.sjoin(geodata_tract, hourly_data)
count_data['index'] = count_data.index

# Optional debugging: dump the points joined to a single tract.
#sub_count_data = count_data[count_data.index == 2]
#points = hourly_data.iloc[sub_count_data.index_right.values]
#points.to_csv('tract-12am-1am-points.csv', index=False)

# Write both tables without the pandas index.
for frame, csv_path in ((hourly_data, '12am-1am-points-hourly-data.csv'),
                        (count_data, '12am-1am-points-count-data.csv')):
    frame.to_csv(csv_path, index=False)

In [50]:
# Tract label of every joined row (one entry per matched point).
count_data.index.to_numpy()

array([  0,   0,   0, ..., 280, 281, 281], dtype=int64)

In [51]:
# 06/01, 12PM-1PM  (time '12' is the 12PM hour; outputs are the 12pm-1pm files)
uni_date, uni_time = '06/01', '12'
query = f"date == '{uni_date}' & time == '{uni_time}'"

# Records for the selected date/hour; reset_index() keeps the previous row
# labels in an 'index' column.
hourly_data = whole_data.query(query).reset_index()

# Spatial join of the tracts with the selected points, then record each
# joined row's tract label in the 'index' column.
count_data = geopandas.sjoin(geodata_tract, hourly_data)
count_data['index'] = count_data.index

# Optional debugging: dump the points joined to a single tract.
#count_data = count_data[count_data.index == 2]
#points = hourly_data.iloc[count_data.index_right.values]
#points.to_csv('tract-12pm-1pm-points.csv', index=False)

# Write both tables without the pandas index.
for frame, csv_path in ((hourly_data, '12pm-1pm-points-hourly-data.csv'),
                        (count_data, '12pm-1pm-points-count-data.csv')):
    frame.to_csv(csv_path, index=False)

In [52]:
# Tract label of every joined row (one entry per matched point).
count_data.index.to_numpy()

array([  0,   0,   0, ..., 281, 281, 281], dtype=int64)