## データ前処理

In [2]:
import pandas as pd

In [68]:
# Load the pickled race table from disk
race_data_path = "data/race_data.pkl"
race_data = pd.read_pickle(race_data_path)

### 複数の項目がまとまった列を分割し、不要な部分を除去

In [69]:
# The combined column holds "/"-separated parts; split it into separate columns
combined_column = "選手名府県/年齢/期別"
split_parts = race_data[combined_column].str.split("/", expand=True)

# Attach the parts, drop the combined column and part 0, and name parts 1 and 2
race_data = (
    pd.concat([race_data, split_parts], axis=1)
    .drop(combined_column, axis=1)
    .drop(0, axis=1)
    .rename(columns={1: "年齢", 2: "期別"})
)

### ダミー変数化

In [70]:
# Columns to turn into dummy variables, each mapped to its list of categories
dummy_targets = {
    "予想": ["nan", "×", "▲", "△", "○", "◎", "注"],
    "好気合": ["★"],
    "脚質": ["両", "追", "逃"],
    "級班": ["A1", "A2", "A3", "L1", "S1", "S2", "SS"],
}

# Cast each target column to a categorical with the categories declared above
for column, categories in dummy_targets.items():
    race_data[column] = pd.Categorical(race_data[column], categories=categories)

# The source columns are removed once their dummy frames have been attached
drop_targets = list(dummy_targets)

# The original frame comes first, followed by one dummy frame per target column
dummies = [
    race_data,
    *(pd.get_dummies(race_data[column]) for column in drop_targets),
]

# Join everything side by side and drop the now-encoded source columns
race_data = pd.concat(dummies, axis=1).drop(drop_targets, axis=1)

### 文字列データやエラー値を変換

In [71]:
def _trim_gear_ratio(text):
    """Return only the first four characters when the value is longer than four."""
    if len(text) > 4:
        return text[:4]
    return text


def _strip_absence_note(text):
    """Remove the absence marker from a value that contains it."""
    if "欠車" in text:
        return text.replace(" （欠車）", "")
    return text


def _in_top_three(rank):
    """Return 1 for a finish of "1", "2" or "3", and 0 for anything else."""
    if rank in ["1", "2", "3"]:
        return 1
    return 0


# Entries with no finishing position (falls etc.) are treated as 9th place
no_result_marks = ["失", "落", "故", "欠"]
race_data = race_data.replace(no_result_marks, 9)

# Fix gear-ratio values that are displayed incorrectly (longer than four characters)
race_data["ギヤ倍数"] = race_data["ギヤ倍数"].map(_trim_gear_ratio)

# Remove the absence marker text embedded in the 期別 column
race_data["期別"] = race_data["期別"].map(_strip_absence_note)

# Binarize the finishing position: 1 for a top-three finish, 0 otherwise
race_data["着順"] = race_data["着順"].map(_in_top_three)

# Convert every column to float64
race_data = race_data.astype("float64")

### データを最大最小値で正規化

In [72]:
def minmax_norm(columns):
    """Min-max scale the given columns of the global ``race_data`` in place.

    Each listed column is rescaled as ``(x - min) / (max - min)``, so its
    smallest value becomes 0 and its largest becomes 1. A column whose
    values are all equal has a zero range; it is mapped to 0 rather than
    being filled with NaN by a division by zero.

    Args:
        columns: Column labels of ``race_data`` to normalize.

    Returns:
        None. The module-level ``race_data`` frame is modified in place.
    """
    columns = list(columns)
    if not columns:
        # Nothing to scale; leave the frame untouched.
        return

    selected = race_data[columns]
    lowest = selected.min()
    value_range = selected.max() - lowest

    # A zero range would turn the whole column into NaN; dividing by 1
    # instead yields 0 for every row of a constant column.
    value_range = value_range.where(value_range != 0, 1)

    # One vectorized assignment covers all columns at once, so no
    # per-column loop is needed.
    race_data[columns] = (selected - lowest) / value_range

In [73]:
# Columns passed to minmax_norm for min-max scaling
minmax_columns = [
    "車番",
    "総評",
    "枠番",
    "ギヤ倍数",
    "競走得点",
    "1着",
    "2着",
    "3着",
    "着外",
    "年齢",
    "期別",
]
minmax_norm(minmax_columns)

In [74]:
# Show the preprocessed frame as the cell output for a visual check
race_data

Unnamed: 0,着順,車番,総評,枠番,ギヤ倍数,競走得点,1着,2着,3着,着外,...,両,追,逃,A1,A2,A3,L1,S1,S2,SS
2720210401010001,1.0,0.250,0.000000,0.4,0.974359,0.652750,0.169811,0.275862,0.230769,0.038462,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2720210401010001,1.0,0.125,0.333333,0.2,0.820513,0.606417,0.075472,0.172414,0.153846,0.179487,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2720210401010001,1.0,0.000,0.333333,0.0,0.974359,0.613833,0.301887,0.517241,0.269231,0.384615,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2720210401010001,0.0,0.500,0.333333,0.8,0.974359,0.587500,0.056604,0.206897,0.384615,0.371795,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2720210401010001,0.0,0.375,0.666667,0.6,0.794872,0.562083,0.018868,0.172414,0.384615,0.333333,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8320210728020012,1.0,0.625,0.666667,1.0,0.974359,0.824167,0.056604,0.413793,0.461538,0.461538,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8320210728020012,0.0,0.125,0.166667,0.2,0.974359,0.921833,0.264151,0.103448,0.192308,0.423077,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8320210728020012,0.0,0.375,0.500000,0.6,0.974359,0.849000,0.283019,0.344828,0.269231,0.500000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8320210728020012,0.0,0.250,0.500000,0.4,1.000000,0.834583,0.056604,0.275862,0.307692,0.423077,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### 学習データと教師データに分割

In [51]:
# Feature columns, in the order they are stored in race_x
feature_columns = [
    "車番", "総評", "枠番", "ギヤ倍数", "競走得点",
    "1着", "2着", "3着", "着外", "年齢", "期別",
    "nan", "×", "▲", "△", "○", "◎", "注",
    "★",
    "両", "追", "逃",
    "A1", "A2", "A3", "L1", "S1", "S2", "SS",
]

# Label: the binarized finishing position
race_y = race_data["着順"]

# Features: everything except the label, restricted to the columns listed above
race_x = race_data.drop("着順", axis=1).loc[:, feature_columns]

### pickleファイルに保存

In [34]:
# Write the feature frame and the label series to their pickle files
for frame, pickle_path in (
    (race_x, "data/race_x.pkl"),
    (race_y, "data/race_y.pkl"),
):
    frame.to_pickle(pickle_path)