In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error

In [5]:
# 特徴量データをロード
df = pd.read_csv('bybit_ethusdt_20230101-20230330_features.csv', index_col=0)

In [6]:

# ターゲット変数の作成
price_change = (df['Close'].shift(-1) / df['Close'] - 1) * 100
conditions = [
    (price_change >= 0.1),
    (price_change <= -0.1)
]
choices = ['up', 'down']
df['target'] = np.select(conditions, choices, default='neutral')

df['target'], _ = pd.factorize(df['target'])

# 不要な列を削除
df = df.drop(['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume'], axis=1).dropna()


In [7]:
# データセットをトレーニングセットとテストセットに分割
X = df.drop('target', axis=1)
y = df['target']

# データセットの長さ
n = len(df)

# トレーニングセットとテストセットの分割点
split_point = int(n * 0.8)

# トレーニングデータとテストデータに分割
X_train = X[:split_point]
X_test = X[split_point:]
y_train = y[:split_point]
y_test = y[split_point:]


In [22]:
# LightGBMモデルのトレーニング
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# LightGBMの回帰モデルを初期化
model = lgb.LGBMClassifier()

# チューニングするハイパーパラメータを設定
param_grid = {
    'num_leaves': [31, 50, 100,200],
    'learning_rate': [0.001, 0.01, 0.1]
}

# 時系列データに適したクロスバリデーションを設定
tscv = TimeSeriesSplit(n_splits=10)

# GridSearchCVを初期化（時系列分割を使用）
grid = GridSearchCV(model, param_grid, cv=tscv, scoring='neg_log_loss')

# チューニングを実行
grid.fit(X_train, y_train)

# 最適なパラメータを表示
print("Best parameters found by grid search are:", grid.best_params_)


Best parameters found by grid search are: {'learning_rate': 0.01, 'num_leaves': 31}


In [20]:
# 最適なパラメータでモデルを再トレーニング
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)

# テストデータで評価
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.8227853013076715


In [21]:
print("Classification report:")
print(classification_report(y_test, y_pred))

Classification report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     21204
           1       0.00      0.00      0.00      2245
           2       0.00      0.00      0.00      2322

    accuracy                           0.82     25771
   macro avg       0.27      0.33      0.30     25771
weighted avg       0.68      0.82      0.74     25771



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
np.bincount(y_pred)

array([25771])

In [5]:
# LightGBMモデルのトレーニング
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=1000, early_stopping_rounds=100)

# 予測と評価
y_pred = model.predict(X_test)
y_pred_class = [np.argmax(pred) for pred in y_pred]
class_map = {'up': 0, 'neutral': 1, 'down': 2}
y_test_mapped = y_test.map(class_map).values





You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5610
[LightGBM] [Info] Number of data points in the train set: 103082, number of used features: 22
[LightGBM] [Info] Start training from score -0.108597
[LightGBM] [Info] Start training from score -2.997111
[LightGBM] [Info] Start training from score -2.937893
[1]	valid_0's multi_logloss: 0.598115
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_logloss: 0.585092
[3]	valid_0's multi_logloss: 0.574754
[4]	valid_0's multi_logloss: 0.566571
[5]	valid_0's multi_logloss: 0.559772
[6]	valid_0's multi_logloss: 0.554081
[7]	valid_0's multi_logloss: 0.549353
[8]	valid_0's multi_logloss: 0.545714
[9]	valid_0's multi_logloss: 0.542207
[10]	valid_0's multi_logloss: 0.539142
[11]	valid_0's multi_logloss: 0.536721
[12]	valid_0's multi_logloss: 0.534368
[13]	valid_0's multi_logloss: 0.532379
[14]	valid_0's multi_logloss: 0.530635
[15]	valid_0's multi_logloss: 0.529062
[16]	valid_0's

In [6]:
accuracy = accuracy_score(y_test, y_pred_class)
print(f"Accuracy: {accuracy}")

print("Classification report:")
print(classification_report(y_test, y_pred_class))

Accuracy: 0.8243374335493384
Classification report:
              precision    recall  f1-score   support

           0       0.83      1.00      0.90     21204
           1       0.51      0.02      0.04      2245
           2       0.40      0.02      0.04      2322

    accuracy                           0.82     25771
   macro avg       0.58      0.35      0.33     25771
weighted avg       0.76      0.82      0.75     25771



In [13]:
import pickle
filename = 'ethusdt_lightgbm_model.pkl'
pickle.dump(model,open(filename,'wb'))