In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the eight bus-group sheets of the workbook.
# Passing a list of sheet positions to `sheet_name` makes pandas return a dict
# keyed by those positions, so the workbook is opened and parsed once rather
# than once per sheet.
sheets = pd.read_excel("../data/rust_data_clean.xlsx", sheet_name=list(range(8)))

# Re-key the sheets as "bus_group1" ... "bus_group8" (1-based group names).
bus_groups = {f"bus_group{i+1}": sheets[i] for i in range(8)}

bus_groups["bus_group1"]

In [None]:
def create_period_column(df, year_col, month_col, new_col) -> None:
    """Combine a year column and a month column into one monthly Period column.

    Only rows where both the year and the month are present receive a value in
    the new column. The frame is modified in place: the new column is added
    and the two source columns are dropped.

    Args:
        df (DataFrame): Frame to modify in place.
        year_col (str): Name of the column holding the year.
        month_col (str): Name of the column holding the month.
        new_col (str): Name of the Period column to create.
    """
    has_both = df[year_col].notna() & df[month_col].notna()

    # Cast through int before str so float-typed values such as 1975.0
    # render as "1975" rather than "1975.0".
    year_text = df.loc[has_both, year_col].astype(int).astype(str)
    month_text = df.loc[has_both, month_col].astype(int).astype(str)

    timestamps = pd.to_datetime(year_text + "-" + month_text, format="%Y-%m")
    df.loc[has_both, new_col] = timestamps.dt.to_period("M")

    # The source columns are no longer needed once the Period column exists.
    df.drop(columns=[year_col, month_col], inplace=True)

# bus_groups_replaceinfoの作成

In [None]:
# The first ten rows of every sheet are per-bus header values (one column per
# bus). Transpose them so each bus becomes a row, tag the rows with their
# group number, and stack all groups with a single concat instead of growing
# a frame inside the loop.
replaceinfo_frames = []
for i in range(8):
    bus_replaceinfo = bus_groups[f"bus_group{i+1}"][0:10].T
    bus_replaceinfo["group"] = str(i + 1)
    replaceinfo_frames.append(bus_replaceinfo)

bus_groups_replaceinfo = (
    pd.concat(replaceinfo_frames, axis=0)
    # A value of 0 is treated as missing.
    .replace(0, np.nan)
    # Move the bus names out of the index into a regular column.
    .reset_index()
    .rename(
        columns={
            "index": "bus_name",
            0: "month_purchased",
            1: "year_purchased",
            2: "month_1st_rep",
            3: "year_1st_rep",
            4: "odometer_1st",
            5: "month_2and_rep",
            6: "year_2and_rep",
            7: "odometer_2and",
            8: "month_data_begins",
            9: "year_data_begins",
        }
    )
)

# Years are stored relative to 1900. NaN + 1900 is still NaN, so missing years
# need no special handling (an `x == np.nan` test is always False anyway,
# because NaN never compares equal to anything).
cols = ["year_purchased", "year_1st_rep", "year_2and_rep", "year_data_begins"]
bus_groups_replaceinfo[cols] = bus_groups_replaceinfo[cols] + 1900

# Collapse each (year, month) pair into a single monthly Period column.
period_specs = [
    ("year_purchased", "month_purchased", "purchase_date"),
    ("year_1st_rep", "month_1st_rep", "1st_rep_date"),
    ("year_2and_rep", "month_2and_rep", "2and_rep_date"),
    ("year_data_begins", "month_data_begins", "data_begins_date"),
]
for year_col, month_col, new_col in period_specs:
    create_period_column(bus_groups_replaceinfo, year_col, month_col, new_col)

bus_groups_replaceinfo

In [None]:
# Columns to export, listed in the order they should appear in the CSV.
features = [
    # identification
    "bus_name", "group",
    # purchase date
    "purchase_date",
    # "1st" date / odometer pair
    "1st_rep_date", "odometer_1st",
    # "2and" date / odometer pair
    "2and_rep_date", "odometer_2and",
    # start of the recorded data
    "data_begins_date",
]

# Select and reorder the columns, then write the table without the row index.
bus_groups_replaceinfo = bus_groups_replaceinfo.loc[:, features]
bus_groups_replaceinfo.to_csv("../data/bus_groups_replaceinfo.csv", index=False)

# bus_group_timeseriesの作成

In [None]:
# Preview: build the monthly time series for the first bus group only.
first_group = bus_groups["bus_group1"]

# Rows 8 and 9 of the first column hold the month and the 1900-relative year
# in which the data begins; everything from row 10 onward is the series itself.
start_month, start_year = first_group.iloc[8, 0], first_group.iloc[9, 0] + 1900
bus_group_timeseries = first_group.iloc[10:, :].reset_index(drop=True)

# One monthly period per remaining row, starting at the data-begins month.
n_periods = len(bus_group_timeseries)
dates = pd.date_range(
    start=f"{start_year}-{start_month}", periods=n_periods, freq="MS"
).to_period("M")

bus_group_timeseries.index = dates
bus_group_timeseries.index.name = "date"

bus_group_timeseries

In [None]:
# Build one monthly-indexed frame per bus group and join them side by side
# with a single concat. Collecting the frames first avoids re-copying the
# growing result on every iteration and avoids seeding the concat with an
# empty DataFrame whose index has neither the period dtype nor the "date" name.
timeseries_frames = []

for bus_group in bus_groups.values():
    # Rows 8 and 9 of the first column give the month and the 1900-relative
    # year in which the data begins.
    # NOTE(review): the first bus's start date is applied to every bus in the
    # group - confirm all buses in a group share the same start month.
    start_month = bus_group.iloc[8, 0]
    start_year = bus_group.iloc[9, 0] + 1900
    # Everything from row 10 onward is the monthly series itself.
    timeseries_data = bus_group.iloc[10:, :].reset_index(drop=True)

    dates = pd.date_range(
        start=f"{start_year}-{start_month:02d}-01",
        periods=len(timeseries_data),
        freq="MS",
    ).to_period("M")
    timeseries_data.index = dates
    timeseries_data.index.name = "date"

    timeseries_frames.append(timeseries_data)

# Align on the date index (union of all months); sort=True keeps the combined
# index in chronological order even though groups start in different months.
bus_groups_timeseries = pd.concat(timeseries_frames, axis=1, sort=True)

bus_groups_timeseries

In [None]:
# Persist the combined time series; the index is written as the first CSV column.
bus_groups_timeseries.to_csv("../data/bus_groups_timeseries.csv")