In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the feature-engineered dataset (path is relative to the notebook's
# working directory) and preview the first three rows.
df = pd.read_csv("../data/df_featured.csv")
df.head(n=3)

Unnamed: 0,Year,Date,Average_Temperature (℃),Total_Precipitation (mm),Average_wind_speed(m/s),Is_Holiday,Match_Number,Rain_Zero_Flag,Occupancy,Weekday_Friday,...,Opponent_オリックス,Opponent_ヤクルト,Opponent_ロッテ,Opponent_中日,Opponent_巨人,Opponent_広島,Opponent_日本ハム,Opponent_楽天,Opponent_西武,Opponent_阪神
0,2015,2015-03-27,12.2,0.0,1.7,0,1,1,0.997797,1,...,0,0,1,0,0,0,0,0,0,0
1,2015,2015-03-28,16.1,0.0,2.2,0,2,1,0.969211,0,...,0,0,1,0,0,0,0,0,0,0
2,2015,2015-03-29,16.1,0.0,2.5,0,3,1,0.987897,0,...,0,0,1,0,0,0,0,0,0,0


In [3]:
# Hold out the 2024 rows as the test set; every other year is training data.
train_df = df.loc[df['Year'].ne(2024)]
test_df = df.loc[df['Year'].eq(2024)]

# Predictor columns, grouped by kind. The order of this list fixes the column
# order of X_train / X_test.
# NOTE(review): df also has a 'Weekday_Friday' column that is not listed here —
# presumably it is left out as the baseline weekday category; confirm.
X_columns = [
    # weather
    'Average_Temperature (℃)',
    'Total_Precipitation (mm)',
    'Average_wind_speed(m/s)',
    # day-of-week indicator columns
    'Weekday_Monday',
    'Weekday_Saturday',
    'Weekday_Sunday',
    'Weekday_Thursday',
    'Weekday_Tuesday',
    'Weekday_Wednesday',
    # opponent indicator columns
    'Opponent_DeNA',
    'Opponent_オリックス',
    'Opponent_ヤクルト',
    'Opponent_ロッテ',
    'Opponent_中日',
    'Opponent_巨人',
    'Opponent_広島',
    'Opponent_日本ハム',
    'Opponent_楽天',
    'Opponent_西武',
    'Opponent_阪神',
    # calendar / schedule
    'Is_Holiday',
    'Match_Number',
    'Rain_Zero_Flag',
]

# Features and target ('Occupancy') for each split.
X_train, y_train = train_df[X_columns], train_df['Occupancy']
X_test, y_test = test_df[X_columns], test_df['Occupancy']
X_train

Unnamed: 0,Average_Temperature (℃),Total_Precipitation (mm),Average_wind_speed(m/s),Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday,Opponent_DeNA,...,Opponent_中日,Opponent_巨人,Opponent_広島,Opponent_日本ハム,Opponent_楽天,Opponent_西武,Opponent_阪神,Is_Holiday,Match_Number,Rain_Zero_Flag
0,12.2,0.0,1.7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,16.1,0.0,2.2,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,1
2,16.1,0.0,2.5,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,3,1
3,17.1,0.0,2.8,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,4,1
4,16.1,3.5,3.2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,25.7,0.0,2.5,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,60,1
389,24.7,1.0,2.5,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,61,0
390,23.8,0.0,3.3,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,62,1
391,23.0,0.0,3.3,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,63,1


In [4]:
# Build and train the decision-tree regressor. Only random_state is set; every
# other hyperparameter is left at scikit-learn's default.
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict on the held-out test set and clamp the predictions into [0, 1]
# (presumably because Occupancy is a ratio — confirm).
y_pred = model.predict(X_test).clip(0, 1)

In [5]:
# Evaluation on the held-out test set.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so the old
# call raises TypeError on current releases. sqrt(MSE) works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R²: {r2}")
# NOTE(review): a negative R² means the predictions fit y_test worse than a
# constant prediction of y_test's mean would; treat the model as a weak
# baseline if that is what this cell prints.

# Feature importances of the fitted tree, paired with the training column
# names (same order as the columns the model was fitted on).
feature_importances = model.feature_importances_
feature_names = X_train.columns

# Collect them in a DataFrame for display.
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

# Sort by importance, most important first, and print.
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df)

RMSE: 0.08564398410542144
R²: -5.605423416792095
                     Feature    Importance
21              Match_Number  3.233432e-01
0    Average_Temperature (℃)  1.641141e-01
2    Average_wind_speed(m/s)  1.083362e-01
4           Weekday_Saturday  8.338472e-02
5             Weekday_Sunday  7.587152e-02
20                Is_Holiday  5.275164e-02
18               Opponent_西武  4.667079e-02
1   Total_Precipitation (mm)  2.897325e-02
10            Opponent_オリックス  1.613229e-02
11             Opponent_ヤクルト  1.447750e-02
8          Weekday_Wednesday  1.366066e-02
17               Opponent_楽天  1.280201e-02
12              Opponent_ロッテ  1.175630e-02
22            Rain_Zero_Flag  1.136818e-02
6           Weekday_Thursday  8.295106e-03
7            Weekday_Tuesday  8.123053e-03
16             Opponent_日本ハム  7.843047e-03
14               Opponent_巨人  7.748513e-03
9              Opponent_DeNA  1.690811e-03
3             Weekday_Monday  1.591942e-03
19               Opponent_阪神  1.046387e-03
13   



In [6]:
# Make sure the output directories exist first; without this, the writes below
# fail when the results folders have not been created yet (e.g. a fresh checkout).
os.makedirs('../results/models', exist_ok=True)
os.makedirs('../results/predictions', exist_ok=True)

# Persist the fitted model.
joblib.dump(model, '../results/models/decision_tree_model.pkl')

# Collect actual vs. predicted values for the test set in one DataFrame.
results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred
})

# Save the predictions as CSV. The index is not written, so rows line up with
# the X_test file below by position only.
results_df.to_csv('../results/predictions/decision_tree_predictions.csv', index=False)

# Save the test-set features as well.
X_test.to_csv('../results/predictions/decision_tree_X_test.csv', index=False)

# Show the first few rows (optional).
print(results_df.head())

       Actual  Predicted
393  0.976309   1.000000
394  0.966494   0.888946
395  0.970779   0.960985
396  1.000000   0.784450
397  0.966917   0.910069
