## Подготовка

In [1]:
import warnings

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('datasets/APLE_PERIOD_D1.csv', index_col=[0], parse_dates=[0])

In [4]:
data = data.iloc[::-1]
data

Unnamed: 0_level_0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A30,A31,A32,A33,A34,A35,A36,A37,A38,Close
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-11-30,0.035,-0.015,0.0875,0.0075,-0.0175,-0.0375,0.0875,0.0075,1.000000e-02,7.105427e-15,...,-0.06,0.000050,-0.029931,0.000000,0.000000,0.03,0.03,3.552714e-15,-0.037431,1
2017-12-01,-0.015,-0.045,0.0075,0.0225,-0.0375,-0.1125,0.0075,0.0225,7.105427e-15,-6.000000e-02,...,-0.15,-0.029931,0.000069,0.000000,0.000000,0.00,0.00,1.500000e-01,-0.022431,0
2017-12-04,-0.045,0.060,0.0225,0.1500,-0.1125,-0.0300,0.0225,0.1500,-6.000000e-02,7.500000e-02,...,-0.03,0.000069,-0.029912,0.000000,0.000000,0.03,0.03,3.750000e-02,-0.179912,1
2017-12-05,0.060,-0.060,0.1500,0.0300,-0.0300,-0.1500,0.1500,0.0300,7.500000e-02,-7.500000e-02,...,-0.09,-0.029912,0.000088,0.000000,0.000000,0.00,0.00,1.875000e-01,-0.029912,0
2017-12-06,-0.060,0.045,0.0300,0.1125,-0.1500,-0.0225,0.0300,0.1125,-7.500000e-02,4.500000e-02,...,0.00,0.000088,0.000088,0.000000,0.000000,0.00,0.00,2.250000e-02,-0.112412,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-15,0.075,0.165,0.1875,0.4125,-0.0375,-0.0825,0.3250,0.2375,1.350000e-01,1.100000e-01,...,-0.24,0.026316,0.056391,0.094411,0.014286,-0.09,0.32,6.928571e-02,-0.356109,0
2024-03-18,0.165,0.070,0.4125,0.1750,-0.0825,-0.0350,0.2375,0.1500,1.100000e-01,6.500000e-02,...,-0.12,0.056391,-0.033477,0.014286,-0.085777,0.02,0.31,-5.327694e-02,-0.208477,1
2024-03-19,0.070,-0.095,0.1750,0.0475,-0.0350,-0.2375,0.1500,0.0725,6.500000e-02,-5.000000e-02,...,-0.31,-0.033477,-0.183252,-0.085777,-0.145802,0.27,0.40,-2.080201e-02,-0.230752,1
2024-03-20,-0.095,0.010,0.0475,0.0250,-0.2375,-0.0050,0.0725,0.0100,-5.000000e-02,-1.050000e-01,...,-0.35,-0.183252,0.056823,-0.145802,-0.135833,0.09,0.17,1.266667e-01,0.031823,1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1586 entries, 2017-11-30 to 2024-03-21
Data columns (total 39 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      1586 non-null   float64
 1   A2      1586 non-null   float64
 2   A3      1586 non-null   float64
 3   A4      1586 non-null   float64
 4   A5      1586 non-null   float64
 5   A6      1586 non-null   float64
 6   A7      1586 non-null   float64
 7   A8      1586 non-null   float64
 8   A9      1586 non-null   float64
 9   A10     1586 non-null   float64
 10  A11     1586 non-null   float64
 11  A12     1586 non-null   float64
 12  A13     1586 non-null   float64
 13  A14     1586 non-null   float64
 14  A15     1586 non-null   float64
 15  A16     1586 non-null   float64
 16  A17     1586 non-null   float64
 17  A18     1586 non-null   float64
 18  A19     1586 non-null   float64
 19  A20     1586 non-null   float64
 20  A21     1586 non-null   float64
 21  A22     1586 non-nu

## Обучение

In [6]:
X = data.drop('Close', axis=1)
y = data[['Close']]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, random_state=42)

In [8]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
pca = PCA(n_components=7)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [10]:
pca.explained_variance_ratio_

array([0.57315351, 0.09923983, 0.08368575, 0.07269912, 0.03499105,
       0.02666934, 0.02303646])

In [11]:
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
print("Суммарная доля объяснённой дисперсии:", cumulative_variance[-1])

Суммарная доля объяснённой дисперсии: 0.9134750655407526


## Модель LightGBM

In [12]:
train_set = lgb.Dataset(X_train, label=y_train)
test_set = lgb.Dataset(X_test, label=y_test, reference=train_set)
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1
}
early_stopping_cb = lgb.early_stopping(stopping_rounds=50)
model = lgb.train(
    params=params,
    train_set=train_set,
    num_boost_round=500,
    valid_sets=[train_set, test_set],
    valid_names=['train', 'valid'],
    callbacks=[early_stopping_cb, lgb.log_evaluation(10)]
)
y_pred = model.predict(X_test)
y_pred_binary = [1 if prob > 0.5 else 0 for prob in y_pred]

Training until validation scores don't improve for 50 rounds
[10]	train's binary_logloss: 0.562629	valid's binary_logloss: 0.632678
[20]	train's binary_logloss: 0.483412	valid's binary_logloss: 0.617783
[30]	train's binary_logloss: 0.425084	valid's binary_logloss: 0.618664
[40]	train's binary_logloss: 0.373981	valid's binary_logloss: 0.621302
[50]	train's binary_logloss: 0.329234	valid's binary_logloss: 0.627306
[60]	train's binary_logloss: 0.291211	valid's binary_logloss: 0.638099
[70]	train's binary_logloss: 0.258406	valid's binary_logloss: 0.647431
Early stopping, best iteration is:
[27]	train's binary_logloss: 0.441735	valid's binary_logloss: 0.617009


In [13]:
print(classification_report(y_test, y_pred_binary))

              precision    recall  f1-score   support

           0       0.71      0.70      0.71       200
           1       0.70      0.71      0.70       197

    accuracy                           0.71       397
   macro avg       0.71      0.71      0.71       397
weighted avg       0.71      0.71      0.71       397



#### Подбор параметров

In [14]:
def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'precision',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'random_state': 42
    }
    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return precision_score(y_test, y_pred, average='weighted')

In [15]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

[I 2024-12-23 14:56:02,078] A new study created in memory with name: no-name-e43eee65-dc25-4f23-89ab-a72e53250797
[I 2024-12-23 14:56:02,399] Trial 0 finished with value: 0.6706870130988518 and parameters: {'num_leaves': 30, 'learning_rate': 0.0782632058053588, 'max_depth': 11, 'n_estimators': 121}. Best is trial 0 with value: 0.6706870130988518.
[I 2024-12-23 14:56:02,513] Trial 1 finished with value: 0.6694770430043038 and parameters: {'num_leaves': 48, 'learning_rate': 0.06164839578405947, 'max_depth': 3, 'n_estimators': 286}. Best is trial 0 with value: 0.6706870130988518.
[I 2024-12-23 14:56:02,807] Trial 2 finished with value: 0.6816381273318337 and parameters: {'num_leaves': 114, 'learning_rate': 0.023816422854107126, 'max_depth': 3, 'n_estimators': 819}. Best is trial 2 with value: 0.6816381273318337.
[I 2024-12-23 14:56:02,985] Trial 3 finished with value: 0.6718356906386604 and parameters: {'num_leaves': 26, 'learning_rate': 0.09072018036317622, 'max_depth': 8, 'n_estimators'

In [16]:
best_params = study.best_params
print(best_params)

{'num_leaves': 69, 'learning_rate': 0.015406500026775783, 'max_depth': 3, 'n_estimators': 309}


In [17]:
best_params.update({
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
})

#### Обучение с лучшими параметрами

In [18]:
model = lgb.train(
    params=best_params,
    train_set=train_set,
    num_boost_round=500,
    valid_sets=[train_set, test_set],
    valid_names=['train', 'valid'],
    callbacks=[early_stopping_cb, lgb.log_evaluation(10)]
)
y_pred = model.predict(X_test)
y_pred_binary = [1 if prob > 0.5 else 0 for prob in y_pred]

Training until validation scores don't improve for 50 rounds
[10]	train's binary_logloss: 0.665651	valid's binary_logloss: 0.672279
[20]	train's binary_logloss: 0.643749	valid's binary_logloss: 0.655448
[30]	train's binary_logloss: 0.626177	valid's binary_logloss: 0.64288
[40]	train's binary_logloss: 0.611732	valid's binary_logloss: 0.633038
[50]	train's binary_logloss: 0.599573	valid's binary_logloss: 0.625042
[60]	train's binary_logloss: 0.589195	valid's binary_logloss: 0.620249
[70]	train's binary_logloss: 0.579786	valid's binary_logloss: 0.615827
[80]	train's binary_logloss: 0.57167	valid's binary_logloss: 0.612303
[90]	train's binary_logloss: 0.564415	valid's binary_logloss: 0.609803
[100]	train's binary_logloss: 0.557533	valid's binary_logloss: 0.60767
[110]	train's binary_logloss: 0.551526	valid's binary_logloss: 0.605486
[120]	train's binary_logloss: 0.545745	valid's binary_logloss: 0.603493
[130]	train's binary_logloss: 0.539733	valid's binary_logloss: 0.600767
[140]	train's b

In [19]:
print(classification_report(y_test, y_pred_binary))

              precision    recall  f1-score   support

           0       0.73      0.69      0.71       200
           1       0.70      0.74      0.72       197

    accuracy                           0.72       397
   macro avg       0.72      0.72      0.72       397
weighted avg       0.72      0.72      0.72       397

