In [1]:
import pandas as pd
from indicators import RSI, extract_bb
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
import warnings

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("gzpn_data.csv")
df = df.dropna().sample(frac=1).reset_index(drop=True)
df = df.drop(columns=["<TICKER>", "<PER>", "<DATE>", "<TIME>"])
df.columns = ["open", "high", "low", "close", "volume"]
df

Unnamed: 0,open,high,low,close,volume
0,163.91,163.94,163.75,163.77,101460
1,163.40,163.44,163.37,163.37,33520
2,166.17,166.18,166.10,166.11,7130
3,166.58,166.61,166.48,166.56,40590
4,163.45,163.52,163.36,163.47,23970
...,...,...,...,...,...
2128,168.46,168.49,168.18,168.21,164860
2129,166.20,166.30,166.18,166.26,46210
2130,163.77,163.83,163.70,163.81,38600
2131,163.44,163.44,163.42,163.42,4250


In [12]:
n_steps = 11

prices = df["close"]

rsi_values = RSI(prices=prices, n_steps=n_steps)
bb_values = extract_bb(prices=prices, n_steps=n_steps)

assert len(rsi_values) == len(
    bb_values
), f"Indicators length don't coincide: {len(rsi_values)} and {len(bb_values)}"

In [13]:
def prepare_target(df, steps_obs: int = 3):
    targets = []
    for i in range(0, len(df) - steps_obs):
        current_price = df["close"].iloc[i]
        max_price = df["high"].iloc[i + 1 : i + 1 + steps_obs].max()
        targets.append(max_price > current_price)
    targets += [0] * steps_obs
    return np.array(targets, dtype=np.int32)


steps_obs = 3

targets = prepare_target(df=df, steps_obs=steps_obs)

In [14]:
all_data = (
    pd.DataFrame(
        data=np.array([rsi_values, bb_values, targets]).T,
        columns=["rsi", "bb", "target"],
    )
    .dropna()
    .reset_index(drop=True)
    .astype(np.float64)
)
all_data["target"] = all_data["target"].astype(np.int32)
all_data

Unnamed: 0,rsi,bb,target
0,50.576520,-0.805737,1
1,50.472689,0.818016,1
2,49.867092,0.812760,1
3,49.814126,-1.061893,1
4,50.981432,-0.772314,1
...,...,...,...
2117,50.729517,1.510872,0
2118,49.344978,0.315676,1
2119,45.841785,-1.058604,0
2120,41.767491,-1.103584,0


In [15]:
all_data.describe()

Unnamed: 0,rsi,bb,target
count,2122.0,2122.0,2122.0
mean,50.026553,-0.000525,0.757776
std,7.02935,0.95596,0.42853
min,15.256798,-2.591089,0.0
25%,45.248995,-0.830166,1.0
50%,50.074572,-0.183795,1.0
75%,54.632901,0.806517,1.0
max,83.72093,2.53217,1.0


# Обработка фичей

In [16]:
scaler = MinMaxScaler()
scaler.fit(all_data.iloc[:, :-1])
all_data.iloc[:, :-1] = scaler.transform(all_data.iloc[:, :-1])
all_data

Unnamed: 0,rsi,bb,target
0,0.515887,0.348480,1
1,0.514370,0.665417,1
2,0.505524,0.664391,1
3,0.504751,0.298481,1
4,0.521801,0.355003,1
...,...,...,...
2117,0.518121,0.800655,0
2118,0.497898,0.567366,1
2119,0.446730,0.299123,0
2120,0.387220,0.290343,0


In [17]:
X = all_data.iloc[:, :-1]
y = all_data.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Обучение модели

## Logistic Regression

In [19]:
model = LogisticRegression()
results = cross_validate(model, X, y, cv=5, scoring="balanced_accuracy")
results["test_score"].mean()

0.687730446975759

In [20]:
tree_params = {
    "criterion": ["gini", "log_loss", "entropy"],
    "max_depth": [15, 20, 25],
    "min_samples_split": [2, 3, 4],
    "min_samples_leaf": [2, 3, 4],
}

## Decision Tree

In [21]:
grs = GridSearchCV(
    DecisionTreeClassifier(random_state=0),
    cv=5,
    param_grid=tree_params,
    n_jobs=-1,
    scoring="balanced_accuracy",
)
grs.fit(X, y)

In [22]:
grs.best_score_

0.6953409701086479

In [23]:
grs.best_params_

{'criterion': 'log_loss',
 'max_depth': 15,
 'min_samples_leaf': 2,
 'min_samples_split': 2}

## XGboost

In [24]:
boost_params = {
    "loss": ["log_loss", "exponential"],
    "learning_rate": [0.1, 0.2],
    "n_estimators": [70, 80, 90],
    "max_depth": [2, 3],
    "min_samples_leaf": [3, 4, 5],
    "min_samples_split": [2, 3],
}

In [26]:
grs = GridSearchCV(
    GradientBoostingClassifier(random_state=0),
    cv=5,
    param_grid=boost_params,
    n_jobs=-1,
    scoring="balanced_accuracy",
)
grs.fit(X, y)
print(grs.best_score_)
print(grs.best_params_)

0.7182958042556702
{'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 2, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 80}
