In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

# Скачивание и анализ данных

In [4]:
df = pd.read_csv("./gzpn_data.csv")
df = df.dropna().sample(frac=1).reset_index(drop=True)
df = df.drop(columns=["<TICKER>", "<PER>", "<DATE>", "<TIME>"])
df.columns = ["open", "high", "low", "close", "volume"]
df

Unnamed: 0,open,high,low,close,volume
0,163.67,163.70,163.63,163.68,40400
1,166.06,166.24,166.06,166.22,42100
2,163.25,163.25,163.15,163.18,426830
3,165.03,165.03,164.85,164.92,117240
4,163.61,163.63,163.54,163.62,12730
...,...,...,...,...,...
2128,163.75,163.87,163.75,163.85,14780
2129,163.10,163.37,163.09,163.37,132170
2130,163.58,163.60,163.44,163.51,60230
2131,167.37,167.40,167.20,167.20,134170


In [8]:
def prepare_target(df, steps_obs: int = 3):
    targets = []
    for i in range(0, len(df) - steps_obs):
        current_price = df["close"].iloc[i]
        max_price = df["high"].iloc[i + 1 : i + 1 + steps_obs].max()
        targets.append(max_price > current_price)
    targets += [0] * steps_obs
    return np.array(targets, dtype=np.int32)


steps_obs = 3

df["target"] = prepare_target(df=df, steps_obs=steps_obs)
df

Unnamed: 0,open,high,low,close,volume,target
0,163.67,163.70,163.63,163.68,40400,1
1,166.06,166.24,166.06,166.22,42100,0
2,163.25,163.25,163.15,163.18,426830,1
3,165.03,165.03,164.85,164.92,117240,1
4,163.61,163.63,163.54,163.62,12730,1
...,...,...,...,...,...,...
2128,163.75,163.87,163.75,163.85,14780,1
2129,163.10,163.37,163.09,163.37,132170,1
2130,163.58,163.60,163.44,163.51,60230,0
2131,167.37,167.40,167.20,167.20,134170,0


In [9]:
df.describe()

Unnamed: 0,open,high,low,close,volume,target
count,2133.0,2133.0,2133.0,2133.0,2133.0,2133.0
mean,164.921181,164.981688,164.849742,164.921547,114987.0,0.764651
std,1.572417,1.587228,1.56198,1.572919,190835.8,0.424316
min,162.3,162.37,162.2,162.29,400.0,0.0
25%,163.5,163.55,163.45,163.5,25770.0,1.0
50%,164.53,164.69,164.25,164.5,56040.0,1.0
75%,166.3,166.39,166.23,166.3,123850.0,1.0
max,168.92,168.96,168.5,168.92,2695200.0,1.0


In [4]:
num_cols = [
    "Age",
    "Fare",
]
cat_cols = ["Embarked", "Pclass", "sibsp", "Parch", "Sex"]

df_num = df[num_cols]
df_cat = df[cat_cols]

# Обработка численных колонок

In [5]:
scaler = MinMaxScaler()
scaler.fit(df_num)
df_num.iloc[:, :] = scaler.transform(df_num)
df_num

Unnamed: 0,Age,Fare
0,0.248403,0.007832
1,0.486409,0.015469
2,0.348616,0.101497
3,0.210823,0.111257
4,0.273456,0.014110
...,...,...
1302,0.599148,0.077294
1303,0.298509,0.031425
1304,0.348616,0.101497
1305,0.348616,0.025374


# Обработка категориальных колонок

In [6]:
enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(df_cat)
df_cat = pd.DataFrame(
    data=enc.transform(df_cat).toarray(), columns=enc.get_feature_names_out(cat_cols)
).astype(int)
df_cat = df_cat.astype("int64")
df_cat

Unnamed: 0,Embarked_0.0,Embarked_1.0,Embarked_2.0,Pclass_1,Pclass_2,Pclass_3,sibsp_0,sibsp_1,sibsp_2,sibsp_3,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Parch_9,Sex_0,Sex_1
0,1,0,0,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,1,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,0,0,1,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0,0,1,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,1,0,0,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1302,1,0,0,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
1303,0,0,1,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1304,0,0,1,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1305,0,0,1,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0


# Собираем данные вместе

In [7]:
df_final = pd.concat([df_num, df_cat, df.iloc[:, -1]], axis=1)

X = df_final.iloc[:, :-1]
y = df_final.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Строим модель

In [8]:
model = LogisticRegression()
results = cross_validate(model, X, y, cv=5, scoring="accuracy")
results["test_score"].mean()

0.7804158989207686

In [9]:
tree_params = {
    "criterion": ["gini", "log_loss", "entropy"],
    "max_depth": [15, 20, 25],
    "min_samples_split": [2, 3, 4],
    "min_samples_leaf": [2, 3, 4],
}

In [72]:
grs = GridSearchCV(
    DecisionTreeClassifier(random_state=0),
    cv=5,
    param_grid=tree_params,
    n_jobs=-1,
    scoring="accuracy",
)
grs.fit(X, y)

In [73]:
grs.best_params_

{'criterion': 'gini',
 'max_depth': 20,
 'min_samples_leaf': 3,
 'min_samples_split': 2}

In [74]:
grs.best_score_

0.7589745839548419

In [14]:
boost_params = {
    "loss": ["log_loss", "exponential"],
    "learning_rate": [0.1, 0.2],
    "n_estimators": [70, 80, 90],
    "max_depth": [2, 3],
    "min_samples_leaf": [3, 4, 5],
    "min_samples_split": [2, 3],
}

In [15]:
grs = GridSearchCV(
    GradientBoostingClassifier(random_state=0),
    cv=5,
    param_grid=boost_params,
    n_jobs=-1,
    scoring="accuracy",
)
grs.fit(X, y)

In [17]:
grs.best_params_

{'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 2,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 80}

In [18]:
grs.best_score_

0.787294902167237

In [21]:
grs.cv_results_["mean_test_score"]

array([0.78347226, 0.78117341, 0.78270013, 0.78347226, 0.78117341,
       0.78270013, 0.78500483, 0.7872949 , 0.78577111, 0.78500483,
       0.7872949 , 0.78577111, 0.78271475, 0.78577696, 0.78347226,
       0.78271475, 0.78577696, 0.78347226, 0.77735076, 0.77428271,
       0.76892457, 0.77735076, 0.77428271, 0.76892457, 0.7804159 ,
       0.77199555, 0.76893627, 0.7804159 , 0.77199555, 0.76893627,
       0.77046299, 0.76970255, 0.76816999, 0.77046299, 0.76970255,
       0.76816999, 0.78653447, 0.78423854, 0.7781229 , 0.78653447,
       0.78423854, 0.7781229 , 0.78653154, 0.77887748, 0.77887748,
       0.78653154, 0.77887748, 0.77887748, 0.78654032, 0.78193677,
       0.78041005, 0.78654032, 0.78193677, 0.78041005, 0.77429148,
       0.77276476, 0.77582405, 0.77429148, 0.77276476, 0.77582405,
       0.77811412, 0.77505484, 0.77888918, 0.77811412, 0.77505484,
       0.77888918, 0.7735252 , 0.77275599, 0.77275599, 0.7735252 ,
       0.77275599, 0.77275599, 0.7720131 , 0.7697084 , 0.76970

In [None]:
# enc = OneHotEncoder(handle_unknown='ignore')
# enc.fit(df_cat)
# df_cat = pd.DataFrame( data = enc.transform(df_cat).toarray(), columns=enc.get_feature_names(cat_cols)).astype(int)
# df_cat = df_cat.astype('int64')