## データ前処理

In [26]:
import pandas as pd

In [27]:
# Load the race table from a local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source.
race_data = pd.read_pickle("data/race_data.pkl")

### 複数の項目が1つにまとまった列を分割し、不要な部分を除去

In [28]:
# The combined "選手名府県/年齢/期別" column holds "/"-separated fields; expanding
# it yields separate integer-labelled columns (0, 1, 2, ...).
split_fields = race_data["選手名府県/年齢/期別"].str.split("/", expand=True)
race_data = pd.concat([race_data, split_fields], axis=1)

# Remove the combined source column together with split field 0; the
# remaining split fields stay in the table.
race_data = race_data.drop(columns=["選手名府県/年齢/期別", 0])

### ダミー変数化

In [29]:
# One-hot encode the categorical columns. "予想", "脚質" and "級班" keep one
# indicator column per category.
prediction, leg_type, group = (
    pd.get_dummies(race_data[name]) for name in ("予想", "脚質", "級班")
)
# "好気合" is encoded with drop_first=True, so its first category is omitted.
isFine = pd.get_dummies(race_data["好気合"], drop_first=True)

# Append the indicator columns, then remove the categorical source columns.
race_data = pd.concat(
    [race_data, prediction, isFine, leg_type, group],
    axis=1,
)
race_data = race_data.drop(columns=["予想", "好気合", "脚質", "級班"])
print(race_data.shape)

(57768, 30)


### 文字列データやエラー値を変換

In [30]:
# Replace the non-numeric marker values anywhere in the table with 9.
race_data = race_data.replace(["失", "落", "故", "欠"], 9)

# Keep at most the first four characters of each gear-ratio string.
race_data["ギヤ倍数"] = race_data["ギヤ倍数"].map(
    lambda text: text if len(text) <= 4 else text[:4]
)

# Strip the " （欠車）" annotation from split column 2 where it is present.
race_data[2] = race_data[2].map(
    lambda text: text.replace(" （欠車）", "") if "欠車" in text else text
)

# Binary target: 1 when the finishing position is "1", "2" or "3", else 0.
race_data["着順"] = race_data["着順"].map(
    lambda rank: int(rank in ["1", "2", "3"])
)

# Every remaining column must be convertible to a float at this point.
race_data = race_data.astype("float64")
print(race_data.shape)

(57768, 30)


### データを最大最小値で正規化

In [24]:
def minmax_norm(columns):
    """Min-max normalize the given columns of the global ``race_data``.

    Each listed column is replaced by ``(x - min) / (max - min)``, using
    that column's own minimum and maximum, so numeric values end up in the
    range [0, 1].

    Args:
        columns: List of column labels in ``race_data`` to normalize.

    Returns:
        None. ``race_data`` is modified in place.

    Note:
        A column whose values are all equal has ``max - min == 0`` and
        therefore becomes NaN.
    """
    # Nothing to do for an empty selection.
    if len(columns) == 0:
        return

    df = race_data[columns]
    col_min = df.min()
    # A single vectorized assignment handles all columns at once; looping
    # over the columns would only repeat this same whole-frame assignment.
    race_data[columns] = (df - col_min) / (df.max() - col_min)

In [32]:
# Columns to rescale with min-max normalization; 1 and 2 are the
# integer-labelled columns left over from splitting the combined column.
minmax_columns = [
    "総評",
    "ギヤ倍数",
    "競走得点",
    "1着",
    "2着",
    "3着",
    "着外",
    1,
    2,
]
minmax_norm(minmax_columns)

### 特徴量（説明変数）と教師ラベル（目的変数）に分割

In [33]:
# Target: the binary '着順' column.
race_y = race_data['着順']

# Features: everything except the target, selected in a fixed column order.
race_x = race_data.drop(columns=['着順'])
race_x = race_x[[
    '車番', '総評', '枠番', 'ギヤ倍数', '競走得点',
    '1着', '2着', '3着', '着外',
    1, 2,
    'nan', '×', '▲', '△', '○', '◎', '注', '★',
    '両', '追', '逃',
    'A1', 'A2', 'A3', 'L1', 'S1', 'S2', 'SS',
]]

### pickleファイルに保存

In [34]:
# Persist the feature table and the target series as separate pickle files.
race_x.to_pickle("data/race_x.pkl")
race_y.to_pickle("data/race_y.pkl")