In [2]:
import math
from pathlib import Path

import pandas as pd

In [3]:
# Absolute path of the directory the kernel is currently running in.
MAIN_FOLDER = Path(".").resolve()
# All datasets live under <MAIN_FOLDER>/data.
DATA_FOLDER = MAIN_FOLDER.joinpath("data")
# Folder holding the concatenated trip parquet shards.
TRIPS_FOLDER = DATA_FOLDER.joinpath("Concat_trips_2013-2019")

In [8]:
import datetime


def calculate_ride_value(ride_row):
    """Price one trip with the tariff that was in force when the trip started.

    ``ride_row`` must support ``[]`` access for ``"starttime"``, ``"stoptime"``
    and ``"usertype"`` and provide ``.get`` (e.g. a pandas Series or a dict).
    The trip length is converted to minutes and forwarded, together with the
    user type, to the helper for the matching tariff period.

    Tariff periods are half-open: a trip starting exactly at a cut-off date is
    priced with the newer tariff. The two most recent tariffs also receive
    ``rideable_type``; when the row has no such key, ``"classic_bike"`` is used.

    Returns whatever the selected tariff helper returns.
    """
    trip_start = ride_row["starttime"]
    trip_end = ride_row["stoptime"]
    rider_kind = ride_row["usertype"]

    minutes = (trip_end - trip_start).total_seconds() / 60

    def began_before(year, month, day):
        # True when the trip started strictly before midnight of the given date.
        return trip_start < datetime.datetime(year, month, day)

    if began_before(2015, 8, 5):
        tariff = _calculate_pre_aug_2015
    elif began_before(2016, 1, 31):
        tariff = _calculate_aug_2015_to_jan_2016
    elif began_before(2017, 2, 8):
        tariff = _calculate_jan_2016_to_feb_2017
    elif began_before(2018, 2, 1):
        tariff = _calculate_feb_2017_to_feb_2018
    elif began_before(2020, 3, 24):
        tariff = _calculate_feb_2018_to_mar_2020
    elif began_before(2020, 6, 1):
        tariff = _calculate_mar_2020_to_jun_2020
    elif began_before(2020, 7, 27):
        tariff = _calculate_jun_2020_to_jul_2020
    elif began_before(2022, 1, 22):
        tariff = _calculate_jul_2020_to_jan_2022
    elif began_before(2022, 5, 10):
        tariff = _calculate_jan_2022_to_may_2022
    elif began_before(2023, 5, 10):
        tariff = _calculate_may_2022_to_may_2023
    else:
        # From 2023-05-10 on, the price also depends on the vehicle type.
        vehicle = ride_row.get("rideable_type", "classic_bike")
        if began_before(2024, 2, 5):
            return _calculate_may_2023_to_feb_2024(minutes, rider_kind, vehicle)
        return _calculate_after_feb_2024(minutes, rider_kind, vehicle)

    return tariff(minutes, rider_kind)


def _calculate_pre_aug_2015(duration_minutes, user_type):
    if user_type == "member":
        if duration_minutes <= 30:
            return 0.0
        if duration_minutes <= 60:
            return 1.50
        if duration_minutes <= 90:
            return 1.50 + 4.50
        additional_periods = int((duration_minutes - 90) / 30) + 1
        return 1.50 + 4.50 + (6.00 * additional_periods)

    if duration_minutes <= 30:
        return 0.0
    if duration_minutes <= 60:
        return 2.00
    if duration_minutes <= 90:
        return 2.00 + 6.00
    additional_periods = int((duration_minutes - 90) / 30) + 1
    return 2.00 + 6.00 + (8.00 * additional_periods)


def _calculate_aug_2015_to_jan_2016(duration_minutes, user_type):
    """Return the overtime fee for the tariff period after the pre-August-2015 one.

    Delegates to ``_calculate_pre_aug_2015`` because the per-ride fees did not
    change in this period.
    """
    # The 24-hour pass price became $9.95. Per-ride fees stayed the same.
    # The result must be returned; without `return` every ride in this period
    # was valued as None.
    return _calculate_pre_aug_2015(duration_minutes, user_type)


def _calculate_jan_2016_to_feb_2017(duration_minutes, user_type):
    """Return the overtime fee for this tariff period.

    Delegates to ``_calculate_aug_2015_to_jan_2016`` because the per-ride fees
    did not change in this period.
    """
    # The annual membership price was raised to $99. Per-ride fees stayed the same.
    # The result must be returned; without `return` every ride in this period
    # was valued as None.
    return _calculate_aug_2015_to_jan_2016(duration_minutes, user_type)


def _calculate_feb_2017_to_feb_2018(duration_minutes, user_type):
    # Были снижены цены для 60-90 минут. $4.5 -> $3.0 для годового и $6.0 -> $4.0 для пропуска
    if user_type == "member":
        if duration_minutes <= 30:
            return 0.0
        if duration_minutes <= 60:
            return 1.50
        if duration_minutes <= 90:
            return 1.50 + 3.00
        additional_periods = int((duration_minutes - 90) / 30) + 1
        return 1.50 + 3.00 + (6.00 * additional_periods)

    if duration_minutes <= 30:
        return 0.0
    if duration_minutes <= 60:
        return 2.00
    if duration_minutes <= 90:
        return 2.00 + 4.00
    additional_periods = int((duration_minutes - 90) / 30) + 1
    return 2.00 + 4.00 + (8.00 * additional_periods)


def _calculate_feb_2018_to_mar_2020(duration_minutes, user_type):
    if user_type == "member":
        if duration_minutes <= 45:
            return 0.0
        additional_periods = int((duration_minutes - 45) / 30) + 1
        return 3.00 * additional_periods

    if duration_minutes <= 180:
        return 15.0
    additional_periods = int((duration_minutes - 180) / 30) + 1
    return 15.0 + (3.00 * additional_periods)


def _calculate_mar_2020_to_jun_2020(duration_minutes, user_type):
    """Return the ride value for this tariff period.

    Delegates to ``_calculate_feb_2018_to_mar_2020`` because the fees that
    apply to the rides in the dataset did not change in this period.
    """
    # Prices were reduced for single rides, which are not tracked in the dataset.
    # The result must be returned; without `return` every ride in this period
    # was valued as None.
    return _calculate_feb_2018_to_mar_2020(duration_minutes, user_type)


def _calculate_jun_2020_to_jul_2020(duration_minutes, user_type):
    """Return the ride value for this tariff period.

    Delegates to ``_calculate_mar_2020_to_jun_2020``; the fees are assumed to
    be unchanged in this period (see the comment below).
    """
    # I am not sure prices went back to normal here, because the next evidence
    # of a price increase only appears in July, but let it be so.
    # The result must be returned; without `return` every ride in this period
    # was valued as None.
    return _calculate_mar_2020_to_jun_2020(duration_minutes, user_type)


def _calculate_jul_2020_to_jan_2022(duration_minutes, user_type):
    # Перешли на минутную модель по цене 0.15 долларов за минуту.
    if user_type == "member":
        if duration_minutes <= 45:
            return 0.0
        extra_minutes = duration_minutes - 45
        return 0.15 * extra_minutes

    if duration_minutes <= 180:
        return 0.0
    extra_minutes = duration_minutes - 30
    return 0.15 * extra_minutes


def _calculate_jan_2022_to_may_2022(duration_minutes, user_type):
    """Return the overtime fee for this tariff period.

    Delegates to ``_calculate_jul_2020_to_jan_2022`` because the per-ride
    rates did not change in this period.
    """
    # Plan prices went up, but the per-ride rates did not.
    # The result must be returned; without `return` every ride in this period
    # was valued as None.
    return _calculate_jul_2020_to_jan_2022(duration_minutes, user_type)


def _calculate_may_2022_to_may_2023(duration_minutes, user_type):
    # 0.16 долларов за минуту.
    if user_type == "member":
        if duration_minutes <= 45:
            return 0.0
        extra_minutes = duration_minutes - 45
        return 0.16 * extra_minutes

    if duration_minutes <= 180:
        return 0.0
    extra_minutes = duration_minutes - 30
    return 0.16 * extra_minutes


def _calculate_may_2023_to_feb_2024(duration_minutes, user_type, rideable_type):
    is_classic = rideable_type.lower() in ["classic_bike", "docker_bike"]
    is_electric = rideable_type.lower() == "electric_bike"
    is_scooter = rideable_type.lower() == "electric_scooter"

    if user_type == "member":
        if is_electric:
            return 0.17 * duration_minutes
        if is_scooter:
            return 0.27 * duration_minutes
        if duration_minutes <= 45:
            return 0.0
        extra_minutes = duration_minutes - 45
        return 0.17 * extra_minutes

    if is_classic:
        if duration_minutes <= 180:
            return 0.0
        extra_minutes = duration_minutes - 180
        return 0.17 * extra_minutes
    if is_scooter or is_scooter:
        return 0.42 * duration_minutes


def _calculate_after_feb_2024(duration_minutes, user_type, rideable_type):
    is_classic = rideable_type.lower() in ["classic_bike", "docker_bike"]
    is_electric = rideable_type.lower() == "electric_bike"
    is_scooter = rideable_type.lower() == "electric_scooter"

    if user_type == "member":
        if is_electric:
            return 0.18 * duration_minutes
        if is_scooter:
            return 0.29 * duration_minutes
        if duration_minutes <= 45:
            return 0.0
        extra_minutes = duration_minutes - 45
        return 0.18 * extra_minutes

    if is_classic:
        if duration_minutes <= 180:
            return 0.0
        extra_minutes = duration_minutes - 180
        return 0.18 * extra_minutes
    if is_scooter or is_scooter:
        return 0.44 * duration_minutes

In [16]:
# Input shards: trips-stations_part_1.parquet ... trips-stations_part_10.parquet.
parquet_files = [f"trips-stations_part_{part}.parquet" for part in range(1, 11)]

for file_name in parquet_files:
    df = pd.read_parquet(TRIPS_FOLDER.joinpath(file_name))

    # Parse both timestamp columns so their difference is a timedelta;
    # stoptime is parsed with format="mixed".
    df["starttime"] = pd.to_datetime(df["starttime"])
    df["stoptime"] = pd.to_datetime(df["stoptime"], format="mixed")

    # Row-wise pricing; only the three columns the pricing function reads are passed.
    df["ride_value"] = df[["starttime", "stoptime", "usertype"]].apply(
        calculate_ride_value,
        axis=1,
    )

    # The output prefix is spelled "ridd_value_processed_"; it is kept as-is
    # because other code may look the files up under that name.
    df.to_parquet(
        TRIPS_FOLDER.joinpath(f"ridd_value_processed_{file_name}"),
        index=False,
    )
    print(f"{file_name} was processed")

trips-stations_part_1.parquet was processed
trips-stations_part_2.parquet was processed
trips-stations_part_3.parquet was processed
trips-stations_part_4.parquet was processed
trips-stations_part_5.parquet was processed
trips-stations_part_6.parquet was processed
trips-stations_part_7.parquet was processed
trips-stations_part_8.parquet was processed
trips-stations_part_9.parquet was processed
trips-stations_part_10.parquet was processed
