# データの前処理を行うノート

In [22]:
import pandas as pd 

### データの読み込み

In [23]:
# Load the previously pickled line_data_7 table that the rest of this notebook
# transforms.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files produced by this project / a trusted source.
line_data_7 = pd.read_pickle('../data/line_data_7.pkl')

In [24]:
# Bare expression: the notebook renders the loaded table as this cell's output.
line_data_7

Unnamed: 0,no,leg,people,group,number,result
2020123045/3/3,1,追,1,1,0,2
2020123045/3/3,2,追,2,0,0,1
2020123045/3/3,3,追,2,2,1,5
2020123045/3/3,4,追,2,3,1,7
2020123045/3/3,5,両,2,2,0,3
...,...,...,...,...,...,...
2021010181/1/9,3,追,2,1,1,1
2021010181/1/9,4,逃,2,1,0,7
2021010181/1/9,5,逃,2,0,0,4
2021010181/1/9,6,逃,2,2,0,6


### 脚質のデータをダミー変数化する

In [25]:
# One-hot encode the categorical 'leg' column (one indicator column per
# distinct value).
# dtype is pinned explicitly: pandas >= 2.0 returns bool columns by default,
# whereas earlier versions returned uint8 0/1 values (as shown in the recorded
# output below). Pinning keeps the saved features numeric on every version.
legtype_data = pd.get_dummies(line_data_7['leg'], dtype='uint8')

In [26]:
# Replace the raw 'leg' column (and drop 'no') with the indicator columns
# held in legtype_data, appended side by side.
line_data_7 = pd.concat(
    [line_data_7.drop(columns=['leg', 'no']), legtype_data],
    axis=1,
)
# Render the updated table as the cell output.
line_data_7

Unnamed: 0,people,group,number,result,両,追,逃
2020123045/3/3,1,1,0,2,0,1,0
2020123045/3/3,2,0,0,1,0,1,0
2020123045/3/3,2,2,1,5,0,1,0
2020123045/3/3,2,3,1,7,0,1,0
2020123045/3/3,2,2,0,3,1,0,0
...,...,...,...,...,...,...,...
2021010181/1/9,2,1,1,1,0,1,0
2021010181/1/9,2,1,0,7,0,0,1
2021010181/1/9,2,0,0,4,0,0,1
2021010181/1/9,2,2,0,6,0,0,1


### ライン情報を min-max 正規化で 0〜1 の範囲に揃える

In [27]:
# Columns to rescale with min-max normalisation: (x - min) / (max - min).
columns = ['people', 'group', 'number']

for column in columns:
    values = line_data_7[column]
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Every value in the column is identical, so (max - min) is zero and
        # the division would fill the column with NaN. Map such a column to
        # 0.0 instead so the saved features stay finite.
        line_data_7[column] = 0.0
    else:
        line_data_7[column] = (values - lowest) / span

# Render the normalised table as the cell output.
line_data_7

Unnamed: 0,people,group,number,result,両,追,逃
2020123045/3/3,0.000000,0.166667,0.000000,2,0,1,0
2020123045/3/3,0.333333,0.000000,0.000000,1,0,1,0
2020123045/3/3,0.333333,0.333333,0.333333,5,0,1,0
2020123045/3/3,0.333333,0.500000,0.333333,7,0,1,0
2020123045/3/3,0.333333,0.333333,0.000000,3,1,0,0
...,...,...,...,...,...,...,...
2021010181/1/9,0.333333,0.166667,0.333333,1,0,1,0
2021010181/1/9,0.333333,0.166667,0.000000,7,0,0,1
2021010181/1/9,0.333333,0.000000,0.000000,4,0,0,1
2021010181/1/9,0.333333,0.333333,0.000000,6,0,0,1


### 着順のデータを 1〜7 から 0〜6 に変換する

In [28]:
# Subtract one from every finishing position so the labels start at 0.
# NOTE: re-running this cell shifts the labels again; run it once per load.
line_data_7['result'] = line_data_7['result'] - 1
# Render the table with the shifted labels as the cell output.
line_data_7

Unnamed: 0,people,group,number,result,両,追,逃
2020123045/3/3,0.000000,0.166667,0.000000,1,0,1,0
2020123045/3/3,0.333333,0.000000,0.000000,0,0,1,0
2020123045/3/3,0.333333,0.333333,0.333333,4,0,1,0
2020123045/3/3,0.333333,0.500000,0.333333,6,0,1,0
2020123045/3/3,0.333333,0.333333,0.000000,2,1,0,0
...,...,...,...,...,...,...,...
2021010181/1/9,0.333333,0.166667,0.333333,0,0,1,0
2021010181/1/9,0.333333,0.166667,0.000000,6,0,0,1
2021010181/1/9,0.333333,0.000000,0.000000,3,0,0,1
2021010181/1/9,0.333333,0.333333,0.000000,5,0,0,1


### 特徴量(入力データ)と教師データ(着順ラベル)に分割して保存する

In [29]:
# Target: the 'result' column on its own.
line_data_7y = line_data_7['result']
# Inputs: every column except 'result'.
line_data_7x = line_data_7.drop(columns='result')

In [33]:
# Persist the inputs and the target as two separate pickle files.
# NOTE(review): these paths have no '.pkl' suffix, unlike the file loaded at
# the top of the notebook; left unchanged because other code may read them.
line_data_7x.to_pickle('../data/line_data_7x')
line_data_7y.to_pickle('../data/line_data_7y')