In [16]:
import numpy as np
import pandas as pd

In [17]:
# Load the first eight sheets of the workbook, one DataFrame per bus group.
# Passing a list to `sheet_name` lets pandas open the workbook once and return
# a dict keyed by sheet position, instead of re-reading the file for each sheet.
sheets_by_position = pd.read_excel(
    "../data/rust_data_clean.xlsx", sheet_name=list(range(8))
)

# Re-key the sheets as "bus_group1" ... "bus_group8" (1-based group numbers).
bus_groups = {
    f"bus_group{position + 1}": sheet
    for position, sheet in sheets_by_position.items()
}

bus_groups["bus_group1"]

Unnamed: 0,4403,4404,4405,4406,4407,4408,4409,4410,4411,4412,4413,4414,4415,4416,4417
0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
1,83,83,83,83,83,83,83,83,83,83,83,83,83,83,83
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
9,83,83,83,83,83,83,83,83,83,83,83,83,83,83,83


In [18]:
def create_period_column(df, year_col, month_col, new_col) -> None:
    """Build a monthly Period column from separate year and month columns.

    The frame is modified in place: ``new_col`` is filled for every row where
    both the year and the month are present, and the two source columns are
    removed afterwards. Rows missing either value get no period assigned.

    Args:
        df (DataFrame): Frame to modify in place.
        year_col (str): Name of the column holding the (four-digit) year.
        month_col (str): Name of the column holding the month number.
        new_col (str): Name of the Period column to create.
    """
    # Rows usable for conversion: neither the year nor the month is missing.
    has_both = ~(df[year_col].isna() | df[month_col].isna())

    # Go through int first so float-typed values such as 1983.0 render as "1983".
    year_text = df.loc[has_both, year_col].astype(int).astype(str)
    month_text = df.loc[has_both, month_col].astype(int).astype(str)

    timestamps = pd.to_datetime(year_text + "-" + month_text, format="%Y-%m")
    df.loc[has_both, new_col] = timestamps.dt.to_period("M")

    # The source columns are now redundant; remove them from the caller's frame.
    df.drop(columns=[year_col, month_col], inplace=True)

In [19]:
# Years in the sheet header are stored as offsets from 1900 (e.g. 83 -> 1983).
YEAR_BASE = 1900

# The first ten rows of each sheet are per-bus header values (see the rename
# mapping below). Transpose them so every bus becomes one row, tag the row with
# its group number, and concatenate all groups once at the end rather than
# growing a DataFrame inside the loop.
replaceinfo_parts = []
for group_no in range(1, 9):
    bus_replaceinfo = bus_groups[f"bus_group{group_no}"].iloc[:10].T
    bus_replaceinfo["group"] = str(group_no)
    replaceinfo_parts.append(bus_replaceinfo)

bus_groups_replaceinfo = (
    pd.concat(replaceinfo_parts, axis=0)
    # A value of 0 in the header rows is treated as "missing".
    .replace(0, np.nan)
    # The transposed index holds the bus identifiers; turn it into a column.
    .reset_index()
    .rename(
        columns={
            "index": "bus_name",
            0: "month_purchased",
            1: "year_purchased",
            2: "month_1st_rep",
            3: "year_1st_rep",
            4: "odometer_1st",
            5: "month_2and_rep",
            6: "year_2and_rep",
            7: "odometer_2and",
            8: "month_data_begins",
            9: "year_data_begins",
        }
    )
)

# Convert the stored year offsets to calendar years. NaN + YEAR_BASE stays NaN,
# so missing years need no special handling (note that `x == np.nan` is always
# False, so an equality check cannot be used to detect them anyway).
cols = ["year_purchased", "year_1st_rep", "year_2and_rep", "year_data_begins"]
bus_groups_replaceinfo[cols] = bus_groups_replaceinfo[cols] + YEAR_BASE

# Collapse each (year, month) pair into a single monthly Period column.
# create_period_column mutates the frame in place and drops the source columns.
for year_col, month_col, date_col in [
    ("year_purchased", "month_purchased", "purchase_date"),
    ("year_1st_rep", "month_1st_rep", "1st_rep_date"),
    ("year_2and_rep", "month_2and_rep", "2and_rep_date"),
    ("year_data_begins", "month_data_begins", "data_begins_date"),
]:
    create_period_column(bus_groups_replaceinfo, year_col, month_col, date_col)

bus_groups_replaceinfo

Unnamed: 0,bus_name,odometer_1st,odometer_2and,group,purchase_date,1st_rep_date,2and_rep_date,data_begins_date
0,4403,,,1,1983-05,NaT,NaT,1983-05
1,4404,,,1,1983-05,NaT,NaT,1983-05
2,4405,,,1,1983-05,NaT,NaT,1983-05
3,4406,,,1,1983-05,NaT,NaT,1983-05
4,4407,,,1,1983-05,NaT,NaT,1983-05
...,...,...,...,...,...,...,...,...
157,4252,190700.0,,8,1972-02,1976-05,NaT,1974-12
158,4253,151500.0,334400.0,8,1972-02,1977-01,1984-09,1974-12
159,4254,168400.0,,8,1972-02,1976-03,NaT,1974-12
160,4255,132000.0,,8,1972-02,1976-01,NaT,1974-12


In [20]:
# Columns to export, in the order they should appear in the CSV file.
features = [
    "bus_name",
    "group",
    "purchase_date",
    "1st_rep_date",
    "odometer_1st",
    "2and_rep_date",
    "odometer_2and",
    "data_begins_date",
]

# Keep only the exported columns (reordered), then write them out without the
# row index.
bus_groups_replaceinfo = bus_groups_replaceinfo.loc[:, features]
bus_groups_replaceinfo.to_csv(
    "../data/bus_groups_replaceinfo.csv",
    index=False,
)