In [21]:
import pandas as pd
df = pd.read_csv("USD_JPY.csv")

# 特徴量の作成
import talib as ta
import numpy as np
import plotly

# 全ての計算で終値を使う
close = np.array(df["終値"])

# 特徴量を入れるための空のdataframeを作成
df_feature = pd.DataFrame(index=range(len(df)),
                          columns=["SMA5/current", "SMA20/current", "RSI", "MACD", "BBANDS+2σ", "BBANDS-2σ"])

# 以下、talibを用いてテクニカル指標（今回の学習で用いる特徴量）を算出しdf_feature入れる

# 単純移動平均は、単純移動平均値とその日の終値の比を特徴量として用いる
df_feature["SMA5/current"] = ta.SMA(close, timeperiod=5) / close
df_feature["SMA20/current"] = ta.SMA(close, timeperiod=20) / close

# RSI
df_feature["RSI"] = ta.RSI(close, timeperiod=14)

# MACD
df_feature["MACD"], _, _ = ta.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)

# ボリンジャーバンド
upper, middle, lower = ta.BBANDS(close, timeperiod=20, nbdevup=3, nbdevdn=3)
df_feature["BBANDS+2σ"] = upper / close
df_feature["BBANDS-2σ"] = lower / close

df["前日比_float"] = df["前日比%"].apply(lambda x: float(x.replace("%", "")))


# 前日比%の分類の仕方。できるだけ各クラスのサンプルが等しいようにわける
def classify_1(x):
    if x <= -0.2:
        return 0
    elif -0.2 < x < 0.2:
        return 1
    elif 0.2 <= x:
        return 2


def classify_2(x):
    if x <= -0.40:
        return 1
    elif -0.4 < x <= -0.10:
        return 2
    elif -0.10 < x <= 0.10:
        return 3
    elif 0.10 < x <= 0.4:
        return 4
    elif 0.4 < x:
        return 5


def classify_3(x):
    if x <= 0:
        return 0
    else:
        return 1


df["前日比_classified"] = df["前日比_float"].apply(lambda x: classify_1(x))

df_y = df["前日比_classified"].shift()

df_xy = pd.concat([df_feature, df_y], axis=1)
df_xy = df_xy.dropna(how="any")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import optuna

X_train, X_test, Y_train, Y_test = train_test_split(df_xy[["SMA5/current", "SMA20/current","RSI","MACD","BBANDS+2σ","BBANDS-2σ"]],df_xy["前日比_classified"], train_size=0.8)

def objective(trial):
    min_samples_split = trial.suggest_int("min_samples_split", 2,16)
    max_leaf_nodes = int(trial.suggest_discrete_uniform("max_leaf_nodes", 4,64,4))
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    n_estimators = int(trial.suggest_discrete_uniform("n_estimators", 10,310,30))
    max_depth = trial.suggest_int("max_depth", 3,10)
    clf = RandomForestClassifier(random_state=1, n_estimators = n_estimators, max_leaf_nodes = max_leaf_nodes, max_depth=max_depth, max_features=None,criterion=criterion,min_samples_split=min_samples_split)
    clf.fit(X_train, Y_train)
    return 1 - accuracy_score(Y_test, clf.predict(X_test))

study = optuna.create_study()
study.optimize(objective, n_trials=100)

optuna.visualization.plot_contour(study,params=["n_estimators", "max_depth","max_leaf_nodes"])


[I 2020-07-30 11:37:21,190] Trial 0 finished with value: 0.3817258883248731 and parameters: {'min_samples_split': 4, 'max_leaf_nodes': 8.0, 'criterion': 'entropy', 'n_estimators': 130.0, 'max_depth': 3}. Best is trial 0 with value: 0.3817258883248731.
[I 2020-07-30 11:37:21,369] Trial 1 finished with value: 0.3817258883248731 and parameters: {'min_samples_split': 3, 'max_leaf_nodes': 48.0, 'criterion': 'entropy', 'n_estimators': 10.0, 'max_depth': 5}. Best is trial 0 with value: 0.3817258883248731.
[I 2020-07-30 11:37:23,773] Trial 2 finished with value: 0.3725888324873097 and parameters: {'min_samples_split': 4, 'max_leaf_nodes': 32.0, 'criterion': 'gini', 'n_estimators': 190.0, 'max_depth': 8}. Best is trial 2 with value: 0.3725888324873097.
[I 2020-07-30 11:37:23,884] Trial 3 finished with value: 0.3888324873096447 and parameters: {'min_samples_split': 5, 'max_leaf_nodes': 52.0, 'criterion': 'entropy', 'n_estimators': 10.0, 'max_depth': 3}. Best is trial 2 with value: 0.372588832487

[I 2020-07-30 11:39:34,793] Trial 32 finished with value: 0.36852791878172586 and parameters: {'min_samples_split': 13, 'max_leaf_nodes': 32.0, 'criterion': 'entropy', 'n_estimators': 250.0, 'max_depth': 9}. Best is trial 17 with value: 0.36751269035532996.
[I 2020-07-30 11:39:40,307] Trial 33 finished with value: 0.36751269035532996 and parameters: {'min_samples_split': 12, 'max_leaf_nodes': 32.0, 'criterion': 'entropy', 'n_estimators': 250.0, 'max_depth': 10}. Best is trial 17 with value: 0.36751269035532996.
[I 2020-07-30 11:39:46,611] Trial 34 finished with value: 0.36751269035532996 and parameters: {'min_samples_split': 10, 'max_leaf_nodes': 32.0, 'criterion': 'entropy', 'n_estimators': 190.0, 'max_depth': 10}. Best is trial 17 with value: 0.36751269035532996.
[I 2020-07-30 11:39:48,184] Trial 35 finished with value: 0.3786802030456853 and parameters: {'min_samples_split': 10, 'max_leaf_nodes': 44.0, 'criterion': 'entropy', 'n_estimators': 160.0, 'max_depth': 3}. Best is trial 17 

[I 2020-07-30 11:41:32,230] Trial 63 finished with value: 0.37157360406091366 and parameters: {'min_samples_split': 16, 'max_leaf_nodes': 36.0, 'criterion': 'entropy', 'n_estimators': 280.0, 'max_depth': 9}. Best is trial 17 with value: 0.36751269035532996.
[I 2020-07-30 11:41:36,512] Trial 64 finished with value: 0.37055837563451777 and parameters: {'min_samples_split': 12, 'max_leaf_nodes': 28.0, 'criterion': 'entropy', 'n_estimators': 250.0, 'max_depth': 7}. Best is trial 17 with value: 0.36751269035532996.
[I 2020-07-30 11:41:40,446] Trial 65 finished with value: 0.36852791878172586 and parameters: {'min_samples_split': 13, 'max_leaf_nodes': 32.0, 'criterion': 'entropy', 'n_estimators': 220.0, 'max_depth': 10}. Best is trial 17 with value: 0.36751269035532996.
[I 2020-07-30 11:41:48,434] Trial 66 finished with value: 0.36852791878172586 and parameters: {'min_samples_split': 14, 'max_leaf_nodes': 40.0, 'criterion': 'entropy', 'n_estimators': 310.0, 'max_depth': 8}. Best is trial 17 

[I 2020-07-30 11:44:00,979] Trial 94 finished with value: 0.36649746192893407 and parameters: {'min_samples_split': 6, 'max_leaf_nodes': 40.0, 'criterion': 'entropy', 'n_estimators': 310.0, 'max_depth': 7}. Best is trial 79 with value: 0.36649746192893407.
[I 2020-07-30 11:44:06,236] Trial 95 finished with value: 0.37055837563451777 and parameters: {'min_samples_split': 6, 'max_leaf_nodes': 48.0, 'criterion': 'entropy', 'n_estimators': 310.0, 'max_depth': 7}. Best is trial 79 with value: 0.36649746192893407.
[I 2020-07-30 11:44:11,833] Trial 96 finished with value: 0.36954314720812187 and parameters: {'min_samples_split': 4, 'max_leaf_nodes': 44.0, 'criterion': 'entropy', 'n_estimators': 310.0, 'max_depth': 7}. Best is trial 79 with value: 0.36649746192893407.
[I 2020-07-30 11:44:13,178] Trial 97 finished with value: 0.3776649746192894 and parameters: {'min_samples_split': 6, 'max_leaf_nodes': 40.0, 'criterion': 'entropy', 'n_estimators': 70.0, 'max_depth': 7}. Best is trial 79 with va

In [22]:
optuna.visualization.plot_contour(study,params=["n_estimators", "max_depth","min_samples_split"])

In [23]:
print(1-study.best_value)
print(study.best_params)

0.6335025380710659
{'min_samples_split': 8, 'max_leaf_nodes': 40.0, 'criterion': 'entropy', 'n_estimators': 310.0, 'max_depth': 7}


In [24]:
1-0.6182741116751269

0.3817258883248731

In [20]:
optuna.visualization.plot_contour(study,params=["n_estimators", "criterion","min_samples_split"])