In [2]:
%matplotlib inline


import geopy
from pathlib import Path
from math import radians, cos, sin, asin, sqrt

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from matplotlib.colors import LinearSegmentedColormap

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas/NumPy — consider a narrower filter.
warnings.filterwarnings('ignore')

In [3]:
# Global Matplotlib defaults: hide the top/right spines, use a large canvas and a fixed font.
mpl.rcParams.update(
    {
        "axes.spines.top": False,
        "axes.spines.right": False,
        "figure.figsize": (28, 16),
        "font.family": "DejaVu Sans",
    }
)

In [4]:
# Project layout, anchored at the directory the notebook is run from.
MAIN_FOLDER = Path().resolve()
DATA_FOLDER = MAIN_FOLDER.joinpath("data")

# Raw input folders.
RAW_DATA_FOLDER = DATA_FOLDER.joinpath("raw")
TRIPS_FOLDER = RAW_DATA_FOLDER.joinpath("Concat_trips_2013-2019")
NEW_TRIPS_FOLDER = RAW_DATA_FOLDER.joinpath("10_parts_cleared")

# Cleaned data folders.
CLEAN_DATA_FOLDER = DATA_FOLDER.joinpath("cleaned")
RIDE_VALUES_FOLDER = CLEAN_DATA_FOLDER.joinpath("ride_values")

In [5]:
# Load the 2020-2025 ride-value table and preview its first rows.
recent_years = pd.read_parquet(
    RIDE_VALUES_FOLDER.joinpath("total_2020-2025_ride_value.parquet")
)
recent_years.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,time,l1_distance_m,ride_value
0,F29C09E056396FE8,classic_bike,2024-08-23 18:08:22.727,2024-08-23 18:22:11.977,Clark St & Newport St,632,Sheffield Ave & Kingsbury St,13154,41.94454,-87.654678,41.910522,-87.653106,casual,13.820833,3.905818,9.05
1,A821ED4668EBDDF8,electric_bike,2024-08-23 20:25:55.096,2024-08-23 20:49:37.894,Shedd Aquarium,15544,Wells St & Elm St,KA1504000135,41.867226,-87.615355,41.903222,-87.634324,member,23.7133,5.563083,0.0
2,933C90370EE52D8A,electric_bike,2024-08-19 09:23:34.754,2024-08-19 09:33:57.220,Wood St & Augusta Blvd,657,Orleans St & Merchandise Mart Plaza,TA1305000022,41.899202,-87.672182,41.888243,-87.63639,member,10.374433,4.173786,0.0
3,E36921831B0DCF31,electric_bike,2024-08-12 18:17:09.403,2024-08-12 18:27:32.246,Wood St & Augusta Blvd,657,Winchester Ave & Elston Ave,KA1504000140,41.899181,-87.6722,41.924091,-87.67646,member,10.380717,3.116891,0.0
4,ADC7A80D57B65702,classic_bike,2024-08-07 13:57:04.377,2024-08-07 14:14:14.215,Sheffield Ave & Wrightwood Ave,TA1309000023,Winchester Ave & Elston Ave,KA1504000140,41.928712,-87.653833,41.924091,-87.67646,member,17.163967,2.381591,0.0


In [6]:
# Summary statistics of the per-ride value column.
recent_years.loc[:, "ride_value"].describe()

count    2.525842e+06
mean     3.068105e+00
std      4.277478e+00
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      9.050000e+00
max      9.050000e+00
Name: ride_value, dtype: float64

## 1. Положение вещей со станциями 

In [7]:
# Share of non-null values per column (1.0 means the column is fully populated).
null_data = recent_years.count().div(len(recent_years))
null_data

ride_id               1.000000
rideable_type         1.000000
started_at            1.000000
ended_at              1.000000
start_station_name    0.806974
start_station_id      0.806974
end_station_name      0.800449
end_station_id        0.800449
start_lat             1.000000
start_lng             1.000000
end_lat               1.000000
end_lng               1.000000
member_casual         1.000000
time                  1.000000
l1_distance_m         1.000000
ride_value            1.000000
dtype: float64

Примерно у 20% поездок не указана начальная или конечная станция. Попытаемся восстановить их.

In [8]:
def print_round_percent(num):
    """Print a fraction as a percentage rounded to two decimal places.

    Args:
        num: Fraction to display (e.g. 0.3004 is printed as ``30.04%``).
    """
    percent = round(num * 100, 2)
    print(f"{percent}%")


# Rides where at least one of the two station names is missing, and their share of all rides.
empty_stations = recent_years[
    recent_years[["start_station_name", "end_station_name"]].isna().any(axis=1)
]
total_data_null = len(empty_stations) / len(recent_years)
print_round_percent(total_data_null)

30.04%


У 30.04% поездок так или иначе потеряна начальная и/или конечная станция.

## 2. Постараемся восполнить пробел

### Пустые начальные станции

In [9]:
# Distinct start coordinates among the rides with a missing station.
unique_start = empty_stations.loc[:, ["start_lat", "start_lng"]].drop_duplicates(ignore_index=True)
unique_start.head(n=5)

Unnamed: 0,start_lat,start_lng
0,41.86,-87.62
1,41.89,-87.67
2,41.91,-87.63
3,41.9,-87.65
4,41.93,-87.64


Проставим ближайшую станцию для поездок, которые начались или закончились в пределах 1 км от неё (2–3 минуты езды).

In [10]:
# Reference list of stations (file snapshot named 2025-04-18) used to look up the nearest station.
stations_in_service = pd.read_csv(
    RAW_DATA_FOLDER.joinpath("Divvy_Bicycle_Stations_-_In_Service_20250418.csv")
)
stations_in_service.head(n=5)

Unnamed: 0,ID,Station Name,Total Docks,Docks in Service,Status,Latitude,Longitude
0,a3aa5ed1-a135-11e9-9cda-0a87ae2ba916,Damen Ave & Charleston St,11,11,In Service,41.920082,-87.677855
1,a3b11480-a135-11e9-9cda-0a87ae2ba916,Laramie Ave & Kinzie St,11,11,In Service,41.887832,-87.755527
2,a3ae82e0-a135-11e9-9cda-0a87ae2ba916,Warren Park West,11,11,In Service,42.001785,-87.688829
3,1872721624592814362,Lockwood Ave & Wrightwood Ave,15,15,In Service,41.927766,-87.758544
4,a3b3686c-a135-11e9-9cda-0a87ae2ba916,California Ave & Cortez St,15,15,In Service,41.900363,-87.696704


In [11]:
# Keep only the identifier, name and coordinates of each station.
stations_in_service = stations_in_service.loc[:, ["ID", "Station Name", "Latitude", "Longitude"]]

In [12]:
def haversine_vec(lat1, lon1, lat2, lon2):
    """Vectorised great-circle distance between two sets of points (haversine formula).

    Args:
        lat1, lon1: Latitude/longitude of the first points, in decimal degrees.
        lat2, lon2: Latitude/longitude of the second points, in decimal degrees.
            Scalars, NumPy arrays or pandas Series are accepted as long as they
            broadcast/align against each other.

    Returns:
        Distance(s) in kilometres, computed on a sphere of radius 6372.8 km.
    """
    earth_radius_km = 6372.8
    # Half-angle differences are taken on the raw degree values before conversion.
    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    hav = np.sin(half_dlat) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlon) ** 2
    return earth_radius_km * (2 * np.arcsin(np.sqrt(hav)))

In [13]:
# Pair every distinct start coordinate with every station, then keep the closest station per coordinate.
dist = unique_start.merge(stations_in_service, how="cross")
dist = dist.assign(
    distance=haversine_vec(dist["start_lat"], dist["start_lng"], dist["Latitude"], dist["Longitude"])
)
dist = (
    dist.sort_values(by="distance")
    .drop_duplicates(subset=["start_lat", "start_lng"], keep="first")
    .reset_index()  # the cross-join row number is kept as an "index" column
)

In [14]:
# Preview the closest matches (smallest distances first).
dist.head(n=5)

Unnamed: 0,index,start_lat,start_lng,ID,Station Name,Latitude,Longitude,distance
0,629955,41.888716,-87.644448,a3a482fb-a135-11e9-9cda-0a87ae2ba916,Desplaines St & Kinzie St,41.888716,-87.644448,0.0
1,2665933,41.892278,-87.612043,a3a41526-a135-11e9-9cda-0a87ae2ba916,Streeter Dr & Grand Ave,41.892278,-87.612043,0.0
2,3473562,41.878125,-87.639968,a3a4d069-a135-11e9-9cda-0a87ae2ba916,Canal St & Jackson Blvd,41.878125,-87.639968,0.0
3,1928348,41.929465,-87.684158,a3a95669-a135-11e9-9cda-0a87ae2ba916,Logan Blvd & Elston Ave,41.929465,-87.684158,0.0
4,476741,41.94454,-87.654678,2064974399404579678,Clark St & Newport St,41.94454,-87.654678,0.0


In [15]:
# Report the share of start coordinates with no station within 1 km, then discard them.
print_round_percent(int((dist["distance"] > 1).sum()) / len(dist))
dist = dist.loc[dist["distance"] <= 1]

0.37%


In [16]:
# Attach the nearest in-range station to every ride by exact start coordinates,
# then use it only where the ride's own start station name/id is missing.
fixed_recent_years = recent_years.merge(dist, how="left", on=["start_lat", "start_lng"])
fixed_recent_years["start_station_name"] = fixed_recent_years["start_station_name"].fillna(
    fixed_recent_years["Station Name"]
)
fixed_recent_years["start_station_id"] = fixed_recent_years["start_station_id"].fillna(
    fixed_recent_years["ID"]
)

In [17]:
# List the columns after the merge: the helper columns coming from `dist` follow the original ride columns.
fixed_recent_years.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'time', 'l1_distance_m', 'ride_value', 'index', 'ID',
       'Station Name', 'Latitude', 'Longitude', 'distance'],
      dtype='object')

In [18]:
# Remove the helper columns added by the merge with `dist`.
# They are dropped by name rather than by position, so a change in the number or
# order of ride columns cannot silently remove the wrong ones.
fixed_recent_years = fixed_recent_years.drop(
    columns=["index", "ID", "Station Name", "Latitude", "Longitude", "distance"]
)
fixed_recent_years.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,time,l1_distance_m,ride_value
0,F29C09E056396FE8,classic_bike,2024-08-23 18:08:22.727,2024-08-23 18:22:11.977,Clark St & Newport St,632,Sheffield Ave & Kingsbury St,13154,41.94454,-87.654678,41.910522,-87.653106,casual,13.820833,3.905818,9.05
1,A821ED4668EBDDF8,electric_bike,2024-08-23 20:25:55.096,2024-08-23 20:49:37.894,Shedd Aquarium,15544,Wells St & Elm St,KA1504000135,41.867226,-87.615355,41.903222,-87.634324,member,23.7133,5.563083,0.0
2,933C90370EE52D8A,electric_bike,2024-08-19 09:23:34.754,2024-08-19 09:33:57.220,Wood St & Augusta Blvd,657,Orleans St & Merchandise Mart Plaza,TA1305000022,41.899202,-87.672182,41.888243,-87.63639,member,10.374433,4.173786,0.0
3,E36921831B0DCF31,electric_bike,2024-08-12 18:17:09.403,2024-08-12 18:27:32.246,Wood St & Augusta Blvd,657,Winchester Ave & Elston Ave,KA1504000140,41.899181,-87.6722,41.924091,-87.67646,member,10.380717,3.116891,0.0
4,ADC7A80D57B65702,classic_bike,2024-08-07 13:57:04.377,2024-08-07 14:14:14.215,Sheffield Ave & Wrightwood Ave,TA1309000023,Winchester Ave & Elston Ave,KA1504000140,41.928712,-87.653833,41.924091,-87.67646,member,17.163967,2.381591,0.0


### Пустые конечные станции

In [19]:
# Nearest-station lookup for the distinct end coordinates of rides with a missing station.
unique_end = empty_stations.loc[:, ["end_lat", "end_lng"]].drop_duplicates(ignore_index=True)
dist = unique_end.merge(stations_in_service, how="cross")
dist = dist.assign(
    distance=haversine_vec(dist["end_lat"], dist["end_lng"], dist["Latitude"], dist["Longitude"])
)
dist = (
    dist.sort_values(by="distance")
    .drop_duplicates(subset=["end_lat", "end_lng"], keep="first")
    .reset_index()  # the cross-join row number is kept as an "index" column
)
dist.head(n=5)

Unnamed: 0,index,end_lat,end_lng,ID,Station Name,Latitude,Longitude,distance
0,2232148,41.953107,-87.774733,1929119555906179916,Marmora Ave & Irving Park Rd,41.953107,-87.774733,0.0
1,1565026,41.93833,-87.76166,1936582264492460636,Long Ave & Belmont Ave,41.93833,-87.76166,0.0
2,1459854,41.679804,-87.620843,1448642175142467180,S Michigan Ave & E 118th St,41.679804,-87.620843,0.0
3,2051012,41.787539,-87.644874,a3b2d2b7-a135-11e9-9cda-0a87ae2ba916,Halsted St & 59th St,41.787539,-87.644874,0.0
4,114866,41.80934,-87.747831,1593766334653391756,Lavergne Ave & 46th St,41.80934,-87.747831,0.0


In [20]:
# Keep only end coordinates whose nearest station is at most 1 km away.
dist = dist.loc[dist["distance"].le(1)]

In [21]:
# Attach the nearest in-range station to every ride by exact end coordinates,
# then use it only where the ride's own end station name/id is missing.
fixed_recent_years = fixed_recent_years.merge(dist, how="left", on=["end_lat", "end_lng"])
fixed_recent_years["end_station_name"] = fixed_recent_years["end_station_name"].fillna(
    fixed_recent_years["Station Name"]
)
fixed_recent_years["end_station_id"] = fixed_recent_years["end_station_id"].fillna(
    fixed_recent_years["ID"]
)
# Remove the helper columns by name rather than by position, so a change in the
# number or order of ride columns cannot silently remove the wrong ones.
fixed_recent_years = fixed_recent_years.drop(
    columns=["index", "ID", "Station Name", "Latitude", "Longitude", "distance"]
)
fixed_recent_years.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,time,l1_distance_m,ride_value
0,F29C09E056396FE8,classic_bike,2024-08-23 18:08:22.727,2024-08-23 18:22:11.977,Clark St & Newport St,632,Sheffield Ave & Kingsbury St,13154,41.94454,-87.654678,41.910522,-87.653106,casual,13.820833,3.905818,9.05
1,A821ED4668EBDDF8,electric_bike,2024-08-23 20:25:55.096,2024-08-23 20:49:37.894,Shedd Aquarium,15544,Wells St & Elm St,KA1504000135,41.867226,-87.615355,41.903222,-87.634324,member,23.7133,5.563083,0.0
2,933C90370EE52D8A,electric_bike,2024-08-19 09:23:34.754,2024-08-19 09:33:57.220,Wood St & Augusta Blvd,657,Orleans St & Merchandise Mart Plaza,TA1305000022,41.899202,-87.672182,41.888243,-87.63639,member,10.374433,4.173786,0.0
3,E36921831B0DCF31,electric_bike,2024-08-12 18:17:09.403,2024-08-12 18:27:32.246,Wood St & Augusta Blvd,657,Winchester Ave & Elston Ave,KA1504000140,41.899181,-87.6722,41.924091,-87.67646,member,10.380717,3.116891,0.0
4,ADC7A80D57B65702,classic_bike,2024-08-07 13:57:04.377,2024-08-07 14:14:14.215,Sheffield Ave & Wrightwood Ave,TA1309000023,Winchester Ave & Elston Ave,KA1504000140,41.928712,-87.653833,41.924091,-87.67646,member,17.163967,2.381591,0.0


Удалим все остальные пустые значения

In [22]:
# Share of non-null values per column after the station fill (1.0 means fully populated).
null_data = fixed_recent_years.count().div(len(fixed_recent_years))
null_data

ride_id               1.000000
rideable_type         1.000000
started_at            1.000000
ended_at              1.000000
start_station_name    0.999607
start_station_id      0.999607
end_station_name      0.999453
end_station_id        0.999453
start_lat             1.000000
start_lng             1.000000
end_lat               1.000000
end_lng               1.000000
member_casual         1.000000
time                  1.000000
l1_distance_m         1.000000
ride_value            1.000000
dtype: float64

In [23]:
# Drop every ride that still has a missing value in any column.
fixed_recent_years = fixed_recent_years.dropna(axis=0, how="any")

## 3. Zipcode

In [24]:
# Nominatim client used for reverse geocoding.
# NOTE(review): this instance is overwritten later in this cell by a second Nominatim
# client with a different user_agent, so the "check_1" agent is not used by the code shown here.
geolocator = geopy.Nominatim(user_agent="check_1")


def get_zipcode(df, geolocator, lat_field, lon_field):
    """Reverse-geocode one record and return its postal code.

    Args:
        df: A single record (e.g. a DataFrame row passed by ``apply(axis=1)``)
            that can be indexed by ``lat_field`` and ``lon_field``.
        geolocator: Object exposing ``reverse((lat, lon))`` whose result has a
            ``raw`` mapping containing ``["address"]["postcode"]``.
        lat_field: Key of the latitude value in ``df``.
        lon_field: Key of the longitude value in ``df``.

    Returns:
        The postcode, or ``None`` when the lookup raises AttributeError,
        KeyError or ValueError (e.g. no result, or no postcode in the address).
        In that case the coordinates are printed.
    """
    try:
        coords = (df[lat_field], df[lon_field])
        address = geolocator.reverse(coords).raw["address"]
        zipcode = address["postcode"]
    except (AttributeError, KeyError, ValueError):
        # Log the coordinates that could not be resolved.
        print(df[lat_field], df[lon_field])
        zipcode = None
    return zipcode


# Replaces the client created at the top of this cell; user_agent is the identifier sent to the Nominatim service.
geolocator = geopy.Nominatim(user_agent="myusername")  # NOTE(review): placeholder value — presumably should be a real application identifier

In [26]:
# Number of distinct start station names in the rides vs. number of rows in the station list.
fixed_recent_years["start_station_name"].nunique(dropna=False), stations_in_service.shape[0]

(1669, 1041)

In [None]:
# Reverse-geocode each ride to a postal code.
# NOTE(review): the call previously passed an empty lat_field and no lon_field, which
# raises TypeError on the first row. The start coordinates are assumed here — confirm
# whether start or end coordinates are intended.
# NOTE(review): this issues one geocoder request per row; consider geocoding distinct
# coordinates only and caching the result before running it on the full frame.
fixed_recent_years["zipcode"] = fixed_recent_years.apply(
    get_zipcode,
    axis=1,
    geolocator=geolocator,
    lat_field="start_lat",
    lon_field="start_lng",
)