# ライブラリの読み込み

In [19]:
#データ加工でよく使うやつ
import datetime  # 日付,時刻の処理
import yaml  # yamlファイル操作
import re  # 正規表現操作
import shutil  # ファイル操作
import glob  # ファイル操作
import os  # ファイル操作
import seaborn as sns  # グラフ描画
import pandas as pd  # データ加工入出力,DataFrame
import numpy as np  # 行列計算
import scipy as sp  # 科学技術計算
import matplotlib.pyplot as plt
%matplotlib inline

# Let pandas show up to 200 columns and 200 rows before it truncates
# the displayed output of a DataFrame/Series.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

#モデリング
from sklearn.model_selection import train_test_split # 訓練データとテストデータの分割を行うライブラリ
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc  # 評価指標
# KFold交差検証のライブラリ、交差検証のスコアを求めるライブラリ、および交差検証による予測に関するライブラリをimport
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

In [56]:
# Load the working DataFrame from a pickle file in the current directory.
# NOTE(review): unpickling can execute arbitrary code, so this file must
# come from a trusted source.
df = pd.read_pickle('./df_merge2.pkl')

In [57]:
# The target column '球種' is intentionally left as-is (its cast to object
# stays disabled); only 'lag_pitch' is converted to object dtype.
# NOTE(review): presumably so that it is handled as a categorical feature
# rather than a number - confirm.
df['lag_pitch'] = df['lag_pitch'].astype(object)

In [58]:
# Display the dtype of every column after the cast above.
df.dtypes

球種                   int64
試合内投球数               int64
日付                  object
イニング                 int64
表裏                  object
打席内投球数               int64
投手投球左右              object
投手役割                object
投手登板順                int64
投手試合内対戦打者数           int64
投手試合内投球数             int64
投手イニング内投球数           int64
打者打席左右              object
打者打順                 int64
打者守備位置              object
打者試合内打席数             int64
プレイ前走者状況            object
count               object
pitch_team_score     int64
month                int32
lag_pitch           object
age                  int64
dtype: object

# 分割データの作成
## カテゴリ変数はダミー化
## 目的変数は数値でよくて、分類のモデルを選べばOK

In [59]:
# One-hot encode the categorical columns (the first level of each is dropped
# via drop_first=True) and remove the raw date column '日付'.
# Original author's note: "when categories are present (conversion to int
# type is needed beforehand)" - NOTE(review): it is not visible here which
# columns that refers to; confirm.
df2 = pd.get_dummies(
    df,
    columns=[
        '表裏',
        '投手投球左右',
        '投手役割',
        'プレイ前走者状況',
        '打者打席左右',
        '打者守備位置',
        'count',
        'lag_pitch',
    ],
    drop_first=True,
).drop(columns='日付')
# Show the resulting dtypes to verify the encoding.
df2.dtypes

球種                  int64
試合内投球数              int64
イニング                int64
打席内投球数              int64
投手登板順               int64
投手試合内対戦打者数          int64
投手試合内投球数            int64
投手イニング内投球数          int64
打者打順                int64
打者試合内打席数            int64
pitch_team_score    int64
month               int32
age                 int64
表裏_裏                uint8
投手投球左右_左            uint8
投手役割_救援             uint8
プレイ前走者状況_12_        uint8
プレイ前走者状況_1_3        uint8
プレイ前走者状況_1__        uint8
プレイ前走者状況__23        uint8
プレイ前走者状況__2_        uint8
プレイ前走者状況___3        uint8
プレイ前走者状況____        uint8
打者打席左右_左            uint8
打者守備位置_三            uint8
打者守備位置_中            uint8
打者守備位置_二            uint8
打者守備位置_右            uint8
打者守備位置_左            uint8
打者守備位置_打            uint8
打者守備位置_投            uint8
打者守備位置_指            uint8
打者守備位置_捕            uint8
打者守備位置_無            uint8
打者守備位置_遊            uint8
count_0-0-1         uint8
count_0-0-2         uint8
count_0-1-0         uint8
count_0-1-1 

In [60]:
# Hold out 30% of the rows as a test set, stratified on the '球種' label so
# both splits keep the same class proportions.
# The stratify labels are read from df2 - the frame that is actually being
# split - so they cannot get out of sync with the rows if df and df2 ever
# diverge (e.g. after re-running cells out of order).
df_train, df_test = train_test_split(
    df2,
    test_size=0.3,
    random_state=88,  # fixed seed so the split is reproducible
    stratify=df2['球種'],
)

In [61]:
# Count missing values per column in the training split.
df_train.isnull().sum()

球種                  0
試合内投球数              0
イニング                0
打席内投球数              0
投手登板順               0
投手試合内対戦打者数          0
投手試合内投球数            0
投手イニング内投球数          0
打者打順                0
打者試合内打席数            0
pitch_team_score    0
month               0
age                 0
表裏_裏                0
投手投球左右_左            0
投手役割_救援             0
プレイ前走者状況_12_        0
プレイ前走者状況_1_3        0
プレイ前走者状況_1__        0
プレイ前走者状況__23        0
プレイ前走者状況__2_        0
プレイ前走者状況___3        0
プレイ前走者状況____        0
打者打席左右_左            0
打者守備位置_三            0
打者守備位置_中            0
打者守備位置_二            0
打者守備位置_右            0
打者守備位置_左            0
打者守備位置_打            0
打者守備位置_投            0
打者守備位置_指            0
打者守備位置_捕            0
打者守備位置_無            0
打者守備位置_遊            0
count_0-0-1         0
count_0-0-2         0
count_0-1-0         0
count_0-1-1         0
count_0-1-2         0
count_0-2-0         0
count_0-2-1         0
count_0-2-2         0
count_0-3-0         0
count_0-3-1         0
count_0-3-

In [62]:
# Number of rows in the training split.
len(df_train)

179981

In [63]:
# Number of rows in the test split.
len(df_test)

77136

In [64]:
# Separate the label column '球種' from the feature columns for each split.
# The targets are kept as single-column DataFrames (not Series).
df_train_x = df_train.drop(columns='球種')
df_train_y = df_train['球種'].to_frame()
df_test_x = df_test.drop(columns='球種')
df_test_y = df_test['球種'].to_frame()

# Modeling
## randomforest

In [65]:
from sklearn.ensemble import RandomForestClassifier as RFC 
from sklearn.model_selection import train_test_split, GridSearchCV

In [66]:
# Random forest classifier; every hyper-parameter not listed here is left at
# the scikit-learn default.
clf = RFC(
    verbose=True,     # print progress while fitting/predicting (optional)
    n_jobs=-1,        # train in parallel; -1 uses all available CPU cores
    random_state=88,  # seed for the random number generator
)
# df_train_y is a single-column DataFrame. Flatten it to a 1-D array so that
# scikit-learn does not raise a DataConversionWarning about a column-vector y.
clf.fit(df_train_x, df_train_y.values.ravel())

  after removing the cwd from sys.path.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   33.5s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=88,
                       verbose=True, warm_start=False)

In [71]:
# Predicted class probabilities for the training rows (one column per class,
# ordered as in clf.classes_).
# NOTE(review): these are in-sample predictions on the data the forest was
# fitted on, so they are expected to look more confident than the test ones.
predict_train_label = clf.predict_proba(df_train_x)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    5.7s finished


In [91]:
# Wrap the probability matrix in a DataFrame. predict_proba orders its columns
# like clf.classes_, so use those labels as the column names instead of
# positional 0..k-1 integers (which are only correct when the class labels
# happen to be exactly 0..k-1).
predict_train_label = pd.DataFrame(predict_train_label, columns=clf.classes_)
predict_train_label

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.16,0.03,0.73,0.02,0.04,0.00,0.00,0.02
1,0.82,0.03,0.10,0.01,0.01,0.02,0.00,0.01
2,0.77,0.01,0.20,0.00,0.01,0.01,0.00,0.00
3,0.86,0.03,0.04,0.00,0.03,0.01,0.01,0.02
4,0.76,0.02,0.13,0.00,0.04,0.03,0.00,0.02
...,...,...,...,...,...,...,...,...
179976,0.21,0.03,0.01,0.00,0.74,0.01,0.00,0.00
179977,0.86,0.01,0.07,0.01,0.02,0.01,0.00,0.02
179978,0.09,0.06,0.02,0.79,0.02,0.00,0.01,0.01
179979,0.79,0.01,0.09,0.03,0.00,0.05,0.02,0.01


In [92]:
# Display the true training labels with a fresh 0..n-1 index.
# This used to display `a`, which is only assigned in a later cell, so a
# top-to-bottom run of the notebook failed with a NameError here.
df_train_y.reset_index(drop=True)

Unnamed: 0,球種
0,2
1,0
2,0
3,0
4,0
...,...
179976,4
179977,0
179978,3
179979,0


In [93]:
# Put the true label next to the predicted probabilities for the training set.
# df_train_y still carries the row labels of the original frame, whereas the
# probability frame is indexed 0..n-1, so the label index is reset first to
# make the column-wise concat line up row by row.
a = df_train_y.reset_index(drop=True)
predict_train_label2 = pd.concat([a, predict_train_label], axis='columns')

In [94]:
# Display the combined true-label / predicted-probability table (training set).
predict_train_label2

Unnamed: 0,球種,0,1,2,3,4,5,6,7
0,2,0.16,0.03,0.73,0.02,0.04,0.00,0.00,0.02
1,0,0.82,0.03,0.10,0.01,0.01,0.02,0.00,0.01
2,0,0.77,0.01,0.20,0.00,0.01,0.01,0.00,0.00
3,0,0.86,0.03,0.04,0.00,0.03,0.01,0.01,0.02
4,0,0.76,0.02,0.13,0.00,0.04,0.03,0.00,0.02
...,...,...,...,...,...,...,...,...,...
179976,4,0.21,0.03,0.01,0.00,0.74,0.01,0.00,0.00
179977,0,0.86,0.01,0.07,0.01,0.02,0.01,0.00,0.02
179978,3,0.09,0.06,0.02,0.79,0.02,0.00,0.01,0.01
179979,0,0.79,0.01,0.09,0.03,0.00,0.05,0.02,0.01


In [96]:
# Write the training labels and probabilities to CSV in the working directory
# (the index is written as well, since to_csv defaults to index=True).
predict_train_label2.to_csv("predict_train_label.csv")

In [97]:
# Predicted class probabilities for the held-out test rows (one column per
# class, ordered as in clf.classes_).
predict_test_label = clf.predict_proba(df_test_x)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    7.0s finished


In [98]:
# Wrap the probability matrix in a DataFrame. predict_proba orders its columns
# like clf.classes_, so use those labels as the column names instead of
# positional 0..k-1 integers (which are only correct when the class labels
# happen to be exactly 0..k-1).
predict_test_label = pd.DataFrame(predict_test_label, columns=clf.classes_)
predict_test_label

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.50,0.10,0.26,0.03,0.02,0.06,0.00,0.03
1,0.26,0.16,0.16,0.21,0.06,0.06,0.03,0.06
2,0.34,0.02,0.38,0.07,0.19,0.00,0.00,0.00
3,0.21,0.07,0.46,0.08,0.10,0.04,0.01,0.03
4,0.61,0.00,0.20,0.08,0.06,0.00,0.00,0.05
...,...,...,...,...,...,...,...,...
77131,0.81,0.02,0.11,0.03,0.00,0.00,0.00,0.03
77132,0.42,0.04,0.26,0.08,0.05,0.05,0.03,0.07
77133,0.50,0.00,0.26,0.03,0.17,0.02,0.00,0.02
77134,0.36,0.19,0.11,0.06,0.02,0.05,0.03,0.18


In [100]:
# Put the true label next to the predicted probabilities for the test set.
# df_test_y still carries the row labels of the original frame, whereas the
# probability frame is indexed 0..n-1, so the label index is reset first to
# make the column-wise concat line up row by row.
a = df_test_y.reset_index(drop=True)
predict_test_label2 = pd.concat([a, predict_test_label], axis='columns')

In [101]:
# Display the combined true-label / predicted-probability table (test set).
predict_test_label2

Unnamed: 0,球種,0,1,2,3,4,5,6,7
0,2,0.50,0.10,0.26,0.03,0.02,0.06,0.00,0.03
1,6,0.26,0.16,0.16,0.21,0.06,0.06,0.03,0.06
2,0,0.34,0.02,0.38,0.07,0.19,0.00,0.00,0.00
3,3,0.21,0.07,0.46,0.08,0.10,0.04,0.01,0.03
4,2,0.61,0.00,0.20,0.08,0.06,0.00,0.00,0.05
...,...,...,...,...,...,...,...,...,...
77131,0,0.81,0.02,0.11,0.03,0.00,0.00,0.00,0.03
77132,7,0.42,0.04,0.26,0.08,0.05,0.05,0.03,0.07
77133,0,0.50,0.00,0.26,0.03,0.17,0.02,0.00,0.02
77134,4,0.36,0.19,0.11,0.06,0.02,0.05,0.03,0.18


In [96]:
# Write the test-set labels and predicted probabilities to their own CSV.
# This cell was a verbatim copy of the training export, so it rewrote
# predict_train_label.csv and the test predictions were never saved.
predict_test_label2.to_csv("predict_test_label.csv")