In [None]:
import matplotlib.pyplot as plt
import polars as pl
import util
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import osmnx as ox
import numpy as np
import math

In [None]:
# Work on a sample limited by fetch(500_000) to keep the exploration fast.
# NOTE(review): presumably util.load_data() returns a polars LazyFrame — confirm in util.
df = util.load_data().fetch(500_000)
# Alternative: materialise the whole dataset instead of a sample.
# df = util.load_data().collect()

# Alternative: read the preprocessed parquet file.
# Run preprocess.py to obtain the parquet dataset
# df = pl.read_parquet('datasets/train.parquet')
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Dimensions of the loaded sample.
df.shape

## Data distribution

In [None]:
# Overview of the distributions in df, drawn by the project helper.
# NB: This plot takes a lot of time
util.plot_distributions(df)

In [None]:
# Zoom on fares between 0 and 60: 60 bins, i.e. one bin per unit of fare.
fig, ax = plt.subplots()
ax.hist(df['fare_amount'], bins=60, range=(0, 60))
ax.set_title('Closeup - fare amount')
plt.show()

In [None]:
# Zoom on pickup latitudes inside the 40.65 .. 40.85 window.
fig, ax = plt.subplots()
ax.hist(df['pickup_latitude'], bins=60, range=(40.65, 40.85))
ax.set_title('Closeup - pickup latitude')
plt.show()

In [None]:
# Zoom on pickup longitudes inside the -74.1 .. -73.75 window.
fig, ax = plt.subplots()
ax.hist(df['pickup_longitude'], bins=60, range=(-74.1, -73.75))
ax.set_title('Closeup - pickup longitude')
plt.show()

In [None]:
# Pickup timestamps grouped into 7 equally wide bins over their full range.
fig, ax = plt.subplots()
ax.hist(df['pickup_datetime'], bins=7)
ax.set_title('Closeup - Timestamp')
plt.show()

The distributions suggest the existence of unrealistic data (noise?) and outliers (hundreds of passengers for one run, thousands of dollars for a single run). Min and max values show this very clearly. Before moving on with other statistics, it may be a good idea to clean the data further.

In [None]:
# Summary statistics per column; the min and max rows expose the outliers.
df.describe()

### Passenger count
According to the [NYC taxi commission](https://www.nyc.gov/site/tlc/passengers/passenger-frequently-asked-questions.page#:~:text=The%20maximum%20amount%20of%20passengers,of%20an%20adult%20passenger%20seated) the maximum number of passengers, for suitable vehicles, is five. An additional sixth person (child) is admitted. Thus, it is possible to consider all samples that exceed the number of six passengers to be noise. In fact, values greater than six are highly underrepresented.

In [None]:
# Number of samples for each passenger_count value, sorted by passenger count.
df.groupby('passenger_count').agg(pl.count()).sort('passenger_count')

In [None]:
# Keep only rides with at most six passengers; larger values are treated as noise.
df = df.filter(pl.col('passenger_count') <= 6)

## Analyzing spatial locations

In [None]:
# train, test = train_test_split(df, test_size=0.2)
# train, valid = train_test_split(train, test_size=0.2)

In [None]:
def print_point_on_map(ax, x, y, points_area, image_size, image, markersize=.5, color='b', title=None):
    """Draw ``image`` on ``ax`` and scatter the given points on top of it.

    The raw coordinates are first converted with ``util.normalize_points`` so
    that they can be plotted in the image's coordinate frame; the image itself
    is drawn with the extent ``[0, image_size[0], 0, image_size[1]]``.

    Args:
        ax: matplotlib axes to draw on.
        x: horizontal coordinates of the points (callers pass longitudes).
        y: vertical coordinates of the points (callers pass latitudes).
        points_area: area descriptor forwarded to ``util.normalize_points``.
        image_size: ``(width, height)`` of ``image``.
        image: background image shown below the points.
        markersize: marker size forwarded to ``ax.scatter`` as ``s``.
        color: marker colour forwarded to ``ax.scatter`` as ``c``.
        title: optional axes title; converted with ``str`` when given.
    """
    plot_x, plot_y = util.normalize_points(x, y, points_area, image_size)
    img_width, img_height = image_size[0], image_size[1]

    ax.imshow(image, extent=[0, img_width, 0, img_height])
    ax.scatter(plot_x, plot_y, s=markersize, c=color)

    if title is None:
        return
    ax.title.set_text(str(title))

In [None]:
# Bounding box of every pickup and dropoff: x holds longitudes, y latitudes.
x = df['pickup_longitude'].append(df['dropoff_longitude'])
y = df['pickup_latitude'].append(df['dropoff_latitude'])
lon_min, lon_max = x.min(), x.max()
lat_min, lat_max = y.min(), y.max()

# Make the area a square: measure both sides starting from the (lon_min, lat_min)
# corner, then split the difference between the lower and upper latitude bounds.
# NOTE(review): presumably util.distance returns a metric distance and
# util.find_latitude_correction shifts a latitude by that distance in the
# direction given by b — confirm in util.
corner_low = (lon_min, lat_min)
corner_high = (lon_min, lat_max)
width = util.distance(corner_low, (lon_max, lat_min))
height = util.distance(corner_low, corner_high)

additional_space = (width - height) / 2

new_lat_min, _ = util.find_latitude_correction(corner_low, additional_space, b=-1)
new_lat_max, _ = util.find_latitude_correction(corner_high, additional_space, b=1)

# Same layout as before: (lon_min, lon_max, lat_min, lat_max).
points_area = (lon_min, lon_max, new_lat_min, new_lat_max)

# Request a 2000x2000 map of the area, crop it with the project helper and keep
# the size of the cropped image for the plotting cells.
image_size = (2000, 2000)
image = util.crop_image_with_borders(util.get_image_from_coordinate(points_area, image_size))
image_size = image.size
# image.save('map.png')

plt.imshow(image)
plt.show()
print(points_area)

In [None]:
# Remove points on ocean, not working at the moment
coordinate_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']


def _ocean_mask(only_pickup, name):
    """Map ``util.polars_point_on_ocean`` over the coordinate columns of ``df``.

    Returns the first (only) resulting column, renamed to ``name``.
    NOTE(review): presumably a boolean Series flagging samples on water — confirm in util.
    """
    on_ocean = util.polars_point_on_ocean(points_area, only_pickup=only_pickup, both=False)
    flagged = df.select(pl.struct(coordinate_columns).map(on_ocean))
    return flagged.get_columns()[0].alias(name)


ocean_pickup = _ocean_mask(True, 'ocean_pickup')
ocean_dropoff = _ocean_mask(False, 'ocean_dropoff')

# Report how many samples each mask flags, and how many are flagged by either one.
n_ocean_pickup = ocean_pickup.arg_true().shape[0]
n_ocean_dropoff = ocean_dropoff.arg_true().shape[0]
n_ocean_any = (ocean_dropoff | ocean_pickup).arg_true().shape[0]
print('Pickups in the ocean', n_ocean_pickup)
print('Dropoffs in the ocean', n_ocean_dropoff)
print('Total ocean outlier samples', n_ocean_any)

# Flagged samples, kept aside so they can be plotted on the map.
outsiders_pickup = df.filter(ocean_pickup)
outsiders_dropoff = df.filter(ocean_dropoff)

In [None]:
# Flagged samples on the map: pickups in blue (left), dropoffs in red (right).
fig, axs = plt.subplots(1, 2, figsize=(30, 30))
pickup_ax, dropoff_ax = axs

print_point_on_map(
    pickup_ax,
    outsiders_pickup['pickup_longitude'],
    outsiders_pickup['pickup_latitude'],
    points_area,
    image_size,
    image,
    markersize=3,
    color='b',
)
print_point_on_map(
    dropoff_ax,
    outsiders_dropoff['dropoff_longitude'],
    outsiders_dropoff['dropoff_latitude'],
    points_area,
    image_size,
    image,
    markersize=3,
    color='r',
)

In [None]:
# Drop every sample whose pickup or dropoff was flagged by the ocean masks.
# NOTE: the masks were computed from df before this filter, so re-running this
# cell without recomputing them applies masks that no longer line up with df.
df = df.filter(~ocean_pickup & ~ocean_dropoff)

In [None]:
# Remaining samples on the map: pickups in blue (left), dropoffs in red (right).
fig, axs = plt.subplots(1, 2, figsize=(50, 50))
pickup_ax, dropoff_ax = axs

print_point_on_map(
    pickup_ax,
    df['pickup_longitude'],
    df['pickup_latitude'],
    points_area,
    image_size,
    image,
    color='b',
)
print_point_on_map(
    dropoff_ax,
    df['dropoff_longitude'],
    df['dropoff_latitude'],
    points_area,
    image_size,
    image,
    color='r',
)

In [None]:
# Offset, in hours, of local time from the stored timestamps.
# NOTE(review): assumes pickup_datetime is stored in UTC — this matches the
# hour-by-hour cell, which starts its day at hour 5; confirm against the dataset.
timezone = -5
# Working ("lavorative") hours, expressed in local time.
lavorative_hours = (8, 18)

# Local hour of each pickup: local = (utc + timezone) mod 24.
# The previous version added the offset to the bounds instead
# (hour > 8 + timezone), which shifts the window the wrong way and selected
# 23:00-07:00 local time as "day".
# `timezone % 24` is the equivalent non-negative shift (19 for -5): it keeps the
# sum non-negative for unsigned hour dtypes, and the outer modulo handles the
# wrap around midnight.
local_hour = (pl.col("pickup_datetime").dt.hour() + (timezone % 24)) % 24
is_working_hour = (local_hour > lavorative_hours[0]) & (local_hour < lavorative_hours[1])

day_hours = df.filter(is_working_hour)
night_hours = df.filter(~is_working_hour)

print(len(day_hours), len(night_hours))

# Pickup and dropoff coordinates are pooled together for each period.
x_day = day_hours['pickup_longitude'].append(day_hours['dropoff_longitude'])
y_day = day_hours['pickup_latitude'].append(day_hours['dropoff_latitude'])

x_night = night_hours['pickup_longitude'].append(night_hours['dropoff_longitude'])
y_night = night_hours['pickup_latitude'].append(night_hours['dropoff_latitude'])

# Working hours in blue (left), remaining hours in red (right).
fig, axs = plt.subplots(1, 2, figsize=(16, 16))
print_point_on_map(axs[0], x_day, y_day, points_area, image_size, image, color='b', markersize=0.1)
print_point_on_map(axs[1], x_night, y_night, points_area, image_size, image, color='r', markersize=0.1)

In [None]:
# One entry per hour: (longitudes, latitudes, number of rides), with pickups
# and dropoffs pooled. The stored hours are visited starting from 5 and wrapping
# past midnight, so entry 0 is stored hour 5.
# NOTE(review): the offset of 5 presumably converts UTC to New York time — confirm.
hours = []
for stored_hour in range(5, 29):
    hour_df = df.filter(pl.col("pickup_datetime").dt.hour() == stored_hour % 24)
    longitudes = hour_df['pickup_longitude'].append(hour_df['dropoff_longitude'])
    latitudes = hour_df['pickup_latitude'].append(hour_df['dropoff_latitude'])
    hours.append((longitudes, latitudes, len(hour_df)))

# 6x4 grid filled row by row, one panel per entry of `hours`.
fig, axs = plt.subplots(6, 4, figsize=(16, 20))
for panel_index, (ax, (longitudes, latitudes, n_rides)) in enumerate(zip(axs.flat, hours)):
    print_point_on_map(
        ax,
        longitudes,
        latitudes,
        points_area,
        image_size,
        image,
        color='b',
        title=f'{panel_index % 24}. {n_rides} rides',
        markersize=0.05,
    )

In [None]:
# Development helper: reload the local util module so that edits to it take
# effect without restarting the kernel.
import importlib
importlib.reload(util)