In [1]:
import yfinance as yf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import datetime
import pandas_ta as ta
from scipy.signal import stft
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None, 'display.max_columns', None)

In [2]:

def fill_missing_dates_daily_2(df, date_col='date'):
    """
    !!! Заменяет значения 'IntradayStd' и 'Volume' нулями в праздники и выходные дни. !!!
    Заполняет пропущенные даты в DataFrame (включая выходные и праздники),
    копируя значения (ffill) с предыдущей даты.
    Удаляет дублирующиеся даты, если они есть, оставляя последнюю запись.
    """
    # Переводим столбец дат в datetime
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    # На всякий случай сортируем
    df.sort_values(by=date_col, inplace=True)

    # Удаляем дубликаты по дате, оставляем "последнюю" запись
    df.drop_duplicates(subset=[date_col], keep='last', inplace=True)

    # Ставим дату в индекс
    df.set_index(date_col, inplace=True)

    # Убеждаемся, что индекс уникален
    if not df.index.is_unique:
        raise ValueError("Индекс дат по-прежнему не уникален. Проверь данные.")
    
    df_1 = df.drop(['DayAvgPrice'], axis=1)
    df_2 = df.drop(['IntradayStd', 'Volume'], axis=1)
    df_3 = df.drop(['IntradayStd', 'Volume'], axis=1)

    df_1 = df_1.asfreq(freq ='D',  fill_value = 0.0)
    df_2 = df_2.asfreq(freq ='D',  method='ffill')
    df_3 = df_3.asfreq(freq ='D',  method='bfill')
    
    df_2['IntradayStd'] = df_1['IntradayStd'].copy()
    df_2['Volume'] = df_1['Volume'].copy()
    #df_2['DayAvgPrice_bfill'] = df_3['DayAvgPrice'].copy()

    # Приводим к дневной частоте и заполняем пропуски методом ffill
    #df_daily = df.asfreq('D', method='ffill')

    # Сбрасываем индекс, если нужно вернуть столбец с датами
    df_2.reset_index(inplace=True)
    df_2.rename(columns={'index': date_col}, inplace=True)

    return df_2

In [3]:
def day_of_week_to_imag_2(df):
    """
    day_of_week : pd.Series - день недели в целых числах начиная с пн = О
    period: float — длина одного цикла (например, 7, если недельный цикл)
    """
    #df = df.replace({0: 1.5, 1: 2.2, 2: 2.8, 3: 3., 4: 3.5, 5: 0.5, 6: 0.2})
    #df = df.replace({0: 0.7, 1: 0.8, 2: 1.0, 3: 1.1, 4: 0.9, 5: 0.2, 6: 0.1})
    df = df.replace({0: 3., 1: 4., 2: 4.5, 3: 5., 4: 7., 5: 0.5, 6: 0.1})
    #df = df.replace({0: 10., 1: 14., 2: 17., 3: 25., 4: 27., 5: 0.5, 6: 0.1})

    return df

In [4]:
def spiral_time_indices(date, day_of_week):
    """
    dates : np.array или pd.Series — упорядоченные временные метки (в днях или долях суток)
    period: float — длина одного цикла (например, 7, если недельный цикл)
    """
    # Пусть у нас dates — это массив float (количество дней с какого-то нуля)
    # Если это pd.Datetime, нужно привести к числу дней:
    #t = (date - date[0]).dt.days.astype(float)
    # t = date - date[0]  # если dates уже float
    # t = date
    #real_part = t.astype(float) / 100.

    #real_part = (date.index + 100.) / 100.
    imag_part = day_of_week_to_imag_2(day_of_week)
    #imag_part = day_of_week_to_imag_2(day_of_week) + np.around(np.log(real_part), 3)
    #imag_part = day_of_week_to_imag_2(day_of_week) + real_part / 10.
    
    real_part = date.apply(lambda x: datetime.datetime.timestamp(x) / 86400 / 20000.) # unix days
    #real_part = real_part - real_part[0] + 1.
    #real_part = (date.index + 1.) / 100.
    #real_part = np.log(date.index + 1.)
    #imag_part = day_of_week_to_imag_2(day_of_week)
    #imag_part = 2 * np.pi * day_of_week / 7.
    #imag_part = np.sin(2 * np.pi * real_part/ 7.)
    # real_part = (date.index + 100.) / 100.
    # imag_part = 2 * np.pi * real_part  / 7.
    #real_part = 1.
    #imag_part = 0.
    return real_part, imag_part

In [5]:
# window_cov_eigenvalues(ts, 20) вернёт массив формы (M, n\_features), 
# где M = {len(ts) - window_size + 1}.
# Cov(3f) - ковариационная матрица для 3-х фич, даёт 3 eigenvalues на окно 

def window_cov_eigenvalues(ts, window_size=21):
    """
    Пример: берем окно из time series (одна или несколько фич),
    строим 'ковариацию' признаков за это окно,
    считаем собственные числа (lambda).
    """
    # Будем возвращать список списков (по окну) из eigenvalues
    all_eigs = []

    for start in range(len(ts) - window_size + 1):
        end = start + window_size
        window_data = ts[start:end]  # (window_size,) если ts - 1D

        # Превратим это в 2D вид (если много признаков, shape=(window_size, n_features))
        # Здесь предположим, что ts уже (N, n_features)
        # Но если ts=(N,), сделаем "fake" 2D
        if len(window_data.shape) == 1:
            window_data = window_data.reshape(-1, 1)  # (window_size, 1)

        # Ковариация: shape=(n_features, n_features)
        # rowvar=False => столбцы - это фичи
        cov_mat = np.cov(window_data, rowvar=False)

        # Собственные числа симметричной матрицы => real-valued
        eigvals = np.linalg.eigvalsh(cov_mat)

        all_eigs.append(eigvals)
    
    return np.array(all_eigs)

In [6]:
def create_lambdas_dataset(eigs, horizon=1):
    """
    eigs: (M, d) - M таймшагов, d - кол-во собственных чисел
    return X, y
    X[t] = eigs[t]
    y[t] = eigs[t + horizon]
    """
    X, Y = [], []
    #for i in range(len(eigs) - horizon):
    for i in range(len(eigs)):  # сейчас делаем только Х
        X.append(eigs[i])
        #Y.append(eigs[i + horizon])
    return np.array(X)#, np.array(Y)

In [7]:
# TOEPLITZ
# 1.1. Интерпретация
# 	•	\mathbf{T}[i, i] = r(0) — это просто средняя энергия сигнала на окне (среднее x_n^2).
# 	•	\mathbf{T}[i, j] при i \neq j показывает автокорреляцию на лаг |i-j|.
# 	•	Матрица симметрична, так как |i-j| = |j-i|.
# T(1f) - Toeplitz - матрица для одной фичи, даёт = window_size eigenvalues на окно window_size.

def autocorr_centered(x):
    x = x - np.mean(x)
    c = np.correlate(x, x, mode='full')
    # c[len(x)-1] -- это r(0)
    # Возвращаем часть от lag=0..W-1
    return c[len(x)-1:len(x)-1 + len(x)]

def build_toeplitz_autocorr_centered(window_data):
    """
    Создаёт Toeplitz-матрицу автокорреляции для одномерного окна window_data.
    window_data: np.array формы (W,).
    return: матрица T формы (W, W).
    """
    window_data = window_data - window_data.mean()
    W = len(window_data)
    r = autocorr_centered(window_data)  # shape (W, ), r[0] = r(0), r[1] = r(1), etc.
    # Нормируем?
    r /= (W - np.arange(W))  # иногда нужна такая нормировка

    T = np.zeros((W, W))
    for i in range(W):
        for j in range(W):
            lag = abs(i - j)
            T[i, j] = r[lag]
    return T

In [8]:
# Допустим, у нас есть весь ряд DayAvgPrice длины N. Мы хотим сделать окна длины W и 
# на каждом окне построить Toeplitz-матрицу, затем вычислить её собственные числа.

def toeplitz_eig_dayavg(price_series, window_size=21):
    """
    price_series: np.array формы (N, ), одномерный ряд (DayAvgPrice).
    window_size: длина окна
    return: массив eigenvalues, shape (M, W), где M ~ (N - window_size + 1).
    """
    N = len(price_series)
    all_eigvals = []

    for start in range(N - window_size + 1):
        end = start + window_size
        window_data = price_series[start:end].to_numpy()
        T = build_toeplitz_autocorr_centered(window_data)
        # Собственные числа симметричной (автокорреляционной) матрицы
        eigvals = np.linalg.eigvalsh(T)
        all_eigvals.append(eigvals)

    return np.array(all_eigvals)  # shape (M, window_size)

In [9]:
def lambdas_C_T(df, window_size):

    # вычисляет собственные значения типа С и Т на окне window_size
    # возвращает датафрейм с добавленными столбцами с.з. и без NaN

    #eigs_data = window_cov_eigenvalues(df[['DayAvgPrice', 'IntradayStd', 'Volume']], window_size)
    eigs_data = window_cov_eigenvalues(df[['DayAvgPrice', 'IntradayStd', 'close']], window_size)
    lambdas_C_arr = create_lambdas_dataset(eigs_data, horizon=1)
    lambdas_T_arr = toeplitz_eig_dayavg(df['DayAvgPrice'], window_size)
    df = df.loc[window_size - 1:].reset_index(drop=True)
    
    # Присоединяем eigenvalues
    df = pd.concat([df, pd.DataFrame(lambdas_C_arr, columns=["lambda_C3", "lambda_C2", "lambda_C1"])], axis=1)
    #df_lambda = pd.concat([df_lambda, pd.DataFrame(lambdas_T_arr[:, -3:], columns=["lambda_T3", "lambda_T2", "lambda_T1"])], axis=1)
    df = pd.concat([df, pd.DataFrame(lambdas_T_arr[:, -5:], columns=["lambda_T5", "lambda_T4", "lambda_T3", "lambda_T2", "lambda_T1"])], axis=1)
    
    # Нормировка столбцов lambda_T (как вариант)
    df['lambda_T_sum'] = df['lambda_T1'] + df['lambda_T2'] + df['lambda_T3']
    df['lambda_T1'] = df['lambda_T1'] / df['lambda_T_sum']
    df['lambda_T2'] = df['lambda_T2'] / df['lambda_T_sum']
    df['lambda_T3'] = df['lambda_T3'] / df['lambda_T_sum']
    df = df.drop(['lambda_T_sum', "lambda_T5", "lambda_T4"], axis=1)

    # Нормировка столбцов lambda_C (как вариант)
    df['lambda_C_sum'] = df['lambda_C1'] + df['lambda_C2'] + df['lambda_C3']
    df['lambda_C1'] = df['lambda_C1'] / df['lambda_C_sum']
    df['lambda_C2'] = df['lambda_C2'] / df['lambda_C_sum']
    df['lambda_C3'] = df['lambda_C3'] / df['lambda_C_sum']
    df = df.drop(['lambda_C_sum'], axis=1)



    df = df.dropna()
    df = df.reset_index(drop=True)

    return df
    

In [10]:
# =========================
# Target smoothing
# =========================
from typing import Iterator, Tuple, Optional, List

def smooth_target_ema(
    y: pd.Series, span_fast: int = 5, span_slow: Optional[int] = None
) -> pd.Series:
    """
    EMA-сглаживание таргета.
    - Если указан только span_fast: обычная EMA.
    - Если указан span_slow: двойная EMA (EMA(EMA(y))) — сильнее подавляет шум.
    """
    y1 = y.ewm(span=span_fast, adjust=False).mean()
    if span_slow is None:
        return y1
    y2 = y1.ewm(span=span_slow, adjust=False).mean()
    return y2

In [11]:
# =========================
# Parkinson vola
# =========================
def add_parkinson_features(
    df: pd.DataFrame,
    high_col: str = "high",
    low_col: str  = "low",
    window: int = 20,
    make_derivatives: bool = True,
    winsor_q: float = 0.995,   # срез экстремумов (опционально)
) -> pd.DataFrame:
    df = df.copy()

    # базовая проверка столбцов
    if high_col not in df.columns or low_col not in df.columns:
        raise ValueError(f"Columns `{high_col}` and `{low_col}` must be in df")

    # безопасное отношение high/low
    ratio = (df[high_col] / df[low_col]).replace([np.inf, -np.inf], np.nan)

    # классическая паркинсоновская дисперсия и волатильность
    parkinson_var = (np.log(ratio) ** 2).rolling(window, min_periods=window).mean()
    parkinson_vol = np.sqrt(parkinson_var / (4 * np.log(2)))

    # winsorize для редких всплесков (по желанию)
    # if winsor_q:
    #     q = parkinson_vol.quantile(winsor_q)
    #     parkinson_vol = parkinson_vol.clip(upper=q)

    df["parkinson_vol"] = parkinson_vol.round(6)

    if make_derivatives:
        # сглаживание и динамика (без заглядывания вперёд)
        df["parkinson_vol_ma5"]   = df["parkinson_vol"].rolling(5,  min_periods=5).mean().round(6)
        df["parkinson_vol_ma20"]  = df["parkinson_vol"].rolling(20, min_periods=20).mean().round(6)
        df["parkinson_vol_diff1"] = df["parkinson_vol"].diff(1).round(6)
        df["parkinson_vol_lag1"]  = df["parkinson_vol"].shift(1).round(6)

    # финальные NaN на ранних барах — оставляем; их отфильтрует твой датасет окон
    return df

In [12]:
# Wavelet/STFT энергия (устойчивые спектральные фичи)(эскиз на STFT; для продакшена лучше кэшировать)

def stft_energy_tail(x: np.ndarray, w: int = 64) -> list:
    """
    Возвращает 3 числа — энергию сигнала в низких, средних и высоких частотах
    за последние w наблюдений x.
    """
    if len(x) < w or np.isnan(x).any():
        return [np.nan, np.nan, np.nan]
    
    # Берём последний кусок длиной w
    x_tail = x[-w:]
    f, t, Z = stft(x_tail, nperseg=w//2)
    S = np.abs(Z) ** 2  # спектральная плотность мощности

    # усредняем по времени
    S_mean = np.mean(S, axis=1)

    # делим на 3 диапазона частот
    bands = np.array_split(S_mean, 3)
    energies = [np.mean(b) for b in bands]
    return energies

In [13]:
# bild_advanced_features

# ========= helpers =========
def _safe_div(a, b):
    return a / b.replace(0, np.nan)

def _rolling_cov(x, y, w):
    return (x.rolling(w).mean()*y.rolling(w).mean() - (x*y).rolling(w).mean()) * (-1)  # не используем, см. ниже

# ========= базовые конструкции =========
def realized_vol(ret, w):
    return ret.rolling(w, min_periods=w).std(ddof=1)

def bbands(close, w=20, k=2.0):
    ma = close.rolling(w, min_periods=w).mean()
    sd = close.rolling(w, min_periods=w).std(ddof=1)
    bb_up = ma + k*sd
    bb_mid = ma
    bb_low = ma - k*sd
    # безразмерные и устойчивые к масштабу:
    pct_b = (close - bb_low) / (bb_up - bb_low)
    bandwidth = _safe_div(bb_up - bb_low, bb_mid.abs())
    return bb_low, bb_mid, bb_up, pct_b, bandwidth

def atr(df, n=14):
    h, l, c = df['high'], df['low'], df['close']
    prev_c = c.shift(1)
    tr = pd.concat([
        (h - l),
        (h - prev_c).abs(),
        (l - prev_c).abs()
    ], axis=1).max(axis=1)
    return tr.rolling(n, min_periods=n).mean()

def adx(df, n=14):
    h, l, c = df['high'], df['low'], df['close']
    up_move = h.diff()
    down_move = -l.diff()
    plus_dm = np.where((up_move > down_move) & (up_move > 0), up_move, 0.0)
    minus_dm = np.where((down_move > up_move) & (down_move > 0), down_move, 0.0)
    tr = atr(df, n)*n  # вернуть к сумме TR
    plus_di = 100 * pd.Series(plus_dm, index=h.index).rolling(n, min_periods=n).sum() / tr
    minus_di= 100 * pd.Series(minus_dm, index=h.index).rolling(n, min_periods=n).sum() / tr
    dx = 100 * (plus_di - minus_di).abs() / (plus_di + minus_di).replace(0, np.nan)
    adx = dx.rolling(n, min_periods=n).mean()
    return adx, plus_di, minus_di

def choppiness(df, n=14):
    _atr = atr(df, n)
    hi = df['high'].rolling(n, min_periods=n).max()
    lo = df['low'].rolling(n, min_periods=n).min()
    denom = (hi - lo).replace(0, np.nan)
    return 100 * np.log(_atr.rolling(n, min_periods=n).sum() / denom) / np.log(n)

def garman_klass_sigma(df, w=20):
    u = np.log(df['high']/df['open'])
    d = np.log(df['low']/df['open'])
    c = np.log(df['close']/df['open'])
    var = (0.5*(u - d)**2 - (2*np.log(2)-1)*c**2).rolling(w, min_periods=w).mean()
    return np.sqrt(var.clip(lower=0))

def rogers_satchell_sigma(df, w=20):
    u = np.log(df['high']/df['close'].shift(1))
    d = np.log(df['low']/df['close'].shift(1))
    c = np.log(df['close']/df['open'])
    var = (u*(u-c) + d*(d-c)).rolling(w, min_periods=w).mean()
    return np.sqrt(var.clip(lower=0))

def yang_zhang_sigma(df, w=20):
    oc = np.log(df['open']/df['close'].shift(1))
    co = np.log(df['close']/df['open'])
    k = 0.34/(1.34 + (w+1)/(w-1))
    oc2 = oc.rolling(w, min_periods=w).var(ddof=1)
    co2 = co.rolling(w, min_periods=w).var(ddof=1)
    rs  = (np.log(df['high']/df['close']) * np.log(df['high']/df['open'])
          + np.log(df['low']/df['close'])  * np.log(df['low']/df['open'])).rolling(w, min_periods=w).mean()
    var = oc2 + k*co2 + (1-k)*rs
    return np.sqrt(var.clip(lower=0))

def amihud_illiquidity(ret, volume, w=20):
    illiq = _safe_div(ret.abs(), volume.replace(0, np.nan))
    return illiq.rolling(w, min_periods=w).mean()

def roll_measure(close, w=20):
    dp = close.diff()
    cov = (dp * dp.shift(1)).rolling(w, min_periods=w).mean()
    sigma_spread = -2 * cov
    sigma_spread = sigma_spread.mask(sigma_spread < 0)  # по определению
    return sigma_spread

def vol_of_vol(vol_series, w=20):
    return vol_series.rolling(w, min_periods=w).std(ddof=1)

def kalman_slope(y, q=1e-5, r=1e-2):
    """
    Простая 1D модель: уровень+наклон. Возвращает оценку slope.
    q,r — дисперсии процесса/измерения.
    """
    n = len(y)
    if n == 0: return pd.Series(dtype=float)
    # состояние [level, slope]
    x = np.array([y.fillna(method='ffill').iloc[0], 0.0], dtype=float)
    P = np.eye(2)
    F = np.array([[1.0, 1.0],
                  [0.0, 1.0]])
    Q = q * np.array([[0.25, 0.5],
                      [0.5,  1.0]])
    H = np.array([[1.0, 0.0]])
    R = np.array([[r]])
    slopes = []
    for z in y.fillna(method='ffill').values:
        # predict
        x = F @ x
        P = F @ P @ F.T + Q
        # update
        yk = z - (H @ x)
        S = H @ P @ H.T + R
        K = (P @ H.T) @ np.linalg.inv(S)
        x = x + (K @ yk).ravel()
        P = (np.eye(2) - K @ H) @ P
        slopes.append(x[1])
    return pd.Series(slopes, index=y.index)

# ========= главный конструктор фич =========
def build_advanced_features(df: pd.DataFrame,
                            price_col: str = 'close',
                            w_fast: int = 14,
                            w_slow: int = 20) -> pd.DataFrame:
    """
    Ожидает df с колонками: open, high, low, close, volume (индекс — datetime).
    Возвращает df с новыми фичами (без нормировки).
    """
    df = df.copy()
    for col in ['open','high','low','close','Volume']:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")
    df = df.sort_index()

    # 0) базовые ретёрны
    df['ret'] = np.log(df['close']/df['close'].shift(1))
    df['ret_overnight'] = np.log(df['open']/df['close'].shift(1))
    df['ret_intraday']  = np.log(df['close']/df['open'])

    # 1) волатильности (улучшенные)
    df['gk_sigma']  = garman_klass_sigma(df, w=w_slow)
    df['rs_sigma']  = rogers_satchell_sigma(df, w=w_slow)
    df['yz_sigma']  = yang_zhang_sigma(df, w=w_slow)

    # 2) режим/тренд
    df['adx'], df['di_plus'], df['di_minus'] = adx(df, n=w_fast)
    df['chop'] = choppiness(df, n=w_fast)
    df['kalm_slope'] = kalman_slope(df[price_col])

    # 3) объёмы/ликвидность
    # df['amihud'] = amihud_illiquidity(df['ret'], df['Volume'], w=w_slow)
    # df['roll_sigma_spread'] = roll_measure(df['close'], w=w_slow)

    # 4) вола-волы и связки
    df['rv20'] = realized_vol(df['ret'], w=w_slow)
    df['vol_of_vol'] = vol_of_vol(df['rv20'], w=w_fast)

    # 5) Bollinger в безразмерном виде
    bb_low, bb_mid, bb_up, pct_b, bw = bbands(df['close'], w=w_slow, k=2.0)
    df['bb_pct_b'] = pct_b
    df['bb_bandwidth'] = bw

    # 6) взаимодействие цены и объёма
    dlog_vol = np.log1p(df['Volume']).diff()
    df['corr_ret_dlogvol'] = df['ret'].rolling(w_slow, min_periods=w_slow).corr(dlog_vol)

    # аккуратный старт: удалим строки до прогрева окон (потом они удаляются)
    # warmup = max(w_fast, w_slow) + 1
    # return df.iloc[warmup:].copy()
    return df.copy()

### Дополнение архивного файла свежими данными.

In [319]:
# Set variables
ticker = "BAYN.DE"
start_date = "2020-09-04"
end_date = "2020-09-27"

In [None]:
# Download new stock data
data = yf.download(ticker, start=start_date, end=end_date, interval='1h', auto_adjust=True)
data.columns = [f'{Price}' for Price, Ticker in data.columns]
data['date'] = data.index.date
data = data.reset_index().drop(columns=['Datetime'])
data.columns = data.columns.str.lower()
data

In [None]:
# load old data from data_archiv
data_old = pd.read_csv('../data_archiv/MBG/MBG_to12_06_2025.csv')
data_old.tail(10)

In [None]:
# load old data from data_archiv
data_new = pd.read_csv('../data_archiv/MBG/MB_13_06_2025to12_08_2025.csv')
data_new.head(10)

In [None]:
# Объединяем с новыми данными
data_fresh = pd.concat([data_old, data_new], ignore_index=True)
data_fresh.tail(20)

In [59]:
# Save
data_fresh.to_csv('../data_archiv/MBG/MBG_fresh.csv', index=False)

##### Загружаем свежие данные с Yahoo Finance

In [38]:
# Set variables
ticker = "DTG.DE"
start_date = "2025-11-12"
end_date = "2025-11-13"

In [39]:
# Download new stock data
data = yf.download(ticker, start=start_date, end=end_date, interval='1h', auto_adjust=True)
data.columns = [f'{Price}' for Price, Ticker in data.columns]
data['date'] = data.index.date
data = data.reset_index().drop(columns=['Datetime'])
data.columns = data.columns.str.lower()
data

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,close,high,low,open,volume,date
0,36.240002,36.279999,35.779999,36.240002,0,2025-11-12
1,36.369999,36.389999,36.130001,36.240002,83456,2025-11-12
2,36.34,36.459999,36.23,36.360001,55272,2025-11-12
3,36.369999,36.389999,36.23,36.349998,25211,2025-11-12
4,36.529999,36.66,36.349998,36.349998,77557,2025-11-12
5,36.470001,36.59,36.439999,36.52,55861,2025-11-12
6,36.34,36.459999,36.150002,36.43,66320,2025-11-12
7,36.09,36.380001,35.970001,36.349998,58665,2025-11-12
8,36.029999,36.139999,36.029999,36.099998,37288,2025-11-12


In [40]:
# load old data from data_archiv
data_old = pd.read_csv('../data_archiv/DTG/DTG_fresh.csv')
data_old.tail(10)

Unnamed: 0,close,high,low,open,volume,date
8680,35.48,35.59,35.48,35.560001,68349,2025-11-10
8681,35.75,35.880001,35.529999,35.560001,0,2025-11-11
8682,36.130001,36.279999,35.669998,35.740002,132268,2025-11-11
8683,35.950001,36.169998,35.900002,36.139999,84459,2025-11-11
8684,35.860001,35.990002,35.830002,35.93,37753,2025-11-11
8685,35.939999,35.939999,35.82,35.84,40239,2025-11-11
8686,35.93,36.0,35.790001,35.939999,36142,2025-11-11
8687,35.779999,36.0,35.669998,35.939999,70686,2025-11-11
8688,35.860001,35.990002,35.720001,35.779999,67239,2025-11-11
8689,35.98,35.990002,35.830002,35.849998,27649,2025-11-11


In [41]:
# Объединяем с новыми данными
data_fresh = pd.concat([data_old, data], ignore_index=True)
data_fresh.tail(20)

Unnamed: 0,close,high,low,open,volume,date
8679,35.560001,35.700001,35.5,35.639999,98267,2025-11-10
8680,35.48,35.59,35.48,35.560001,68349,2025-11-10
8681,35.75,35.880001,35.529999,35.560001,0,2025-11-11
8682,36.130001,36.279999,35.669998,35.740002,132268,2025-11-11
8683,35.950001,36.169998,35.900002,36.139999,84459,2025-11-11
8684,35.860001,35.990002,35.830002,35.93,37753,2025-11-11
8685,35.939999,35.939999,35.82,35.84,40239,2025-11-11
8686,35.93,36.0,35.790001,35.939999,36142,2025-11-11
8687,35.779999,36.0,35.669998,35.939999,70686,2025-11-11
8688,35.860001,35.990002,35.720001,35.779999,67239,2025-11-11


In [42]:
# Save
data_fresh.to_csv('../data_archiv/DTG/DTG_fresh.csv', index=False)

#### START DATA PREP

In [14]:
data_mb = pd.read_csv('../data_archiv/DTG/DTG_fresh.csv')

In [15]:
# df.columns = ['datetime', 'open', 'high', 'low', 'close', 'volume']
# И datetime — это pandas.Timestamp с часовыми барами
df = data_mb.copy()
df['datetime'] = pd.to_datetime(df['date'])
df = df.set_index('datetime')

# === Ресемплинг на дневную частоту ===
df_daily = pd.DataFrame()

df_daily['open']  = df['open'].resample('1D').first()
df_daily['high']  = df['high'].resample('1D').max()
df_daily['low']   = df['low'].resample('1D').min()
df_daily['close'] = df['close'].resample('1D').last()
df_daily['Volume']= df['volume'].resample('1D').sum()

df_daily['DayAvgPrice']= df[['open', 'high', 'low', 'close']].resample('1D').mean().mean(axis=1)

df_daily['IntradayStd'] = df.resample('1D').apply(
    lambda x: np.std(x[['open','high','low','close']].values.flatten(), ddof=1)
).to_frame(name='IntradayStd')

# === Убираем пустые дни (например, выходные) ===
df_daily = df_daily.dropna(subset=['open', 'high', 'low', 'close'])

# === Сбрасываем индекс, если нужно для модели ===
df_daily = df_daily.reset_index().rename(columns={'datetime': 'date'})

df_daily.tail()

Unnamed: 0,date,open,high,low,close,Volume,DayAvgPrice,IntradayStd
964,2025-11-06,35.400002,35.490002,34.73,34.799999,536722,35.163611,0.181504
965,2025-11-07,35.57,36.040001,33.459999,34.610001,1068926,34.328333,0.542284
966,2025-11-10,35.169998,36.130001,34.98,35.48,827914,35.753889,0.282107
967,2025-11-11,35.560001,36.279999,35.529999,35.98,496435,35.886111,0.159271
968,2025-11-12,36.240002,36.66,35.779999,36.029999,459630,36.299444,0.184235


In [None]:
# df_daily = fill_missing_dates_daily_2(df_daily, date_col='date') # 'DayAvgPrice' и 'IntradayStd'=0 в неторговые дни
# df_daily.tail(10)

In [None]:
# !!! Только в случае попадания выходных или праздников в конец датасета дополняем датасет вручную !!!

# last_index = df_daily.index.max()
# df_daily.loc[last_index + 1] = [df_daily['date'][last_index] + pd.DateOffset(days=1), df_daily['open'][last_index], df_daily['high'][last_index], df_daily['low'][last_index], df_daily['close'][last_index], df_daily['DayAvgPrice'][last_index], 0., 0,]
# df_daily.loc[last_index + 2] = [df_daily['date'][last_index] + pd.DateOffset(days=2), df_daily['open'][last_index], df_daily['high'][last_index], df_daily['low'][last_index], df_daily['close'][last_index], df_daily['DayAvgPrice'][last_index], 0., 0,]
# df_daily.tail()

### Генерация признаков

In [16]:
# Размер окна для поиска паттернов
window_size = 100

##### Старые признаки

In [17]:
# Стандартные
df_daily['day_of_week'] = df_daily['date'].apply(lambda x: x.day_of_week)
df_daily['day_of_year'] = df_daily['date'].apply(lambda x: x.day_of_year)
df_daily['Log_Profit'] = np.log(df_daily['DayAvgPrice'].shift(1) / df_daily['DayAvgPrice'])
df_daily['DayAvgPrice_diff'] = df_daily['DayAvgPrice'] - df_daily['DayAvgPrice'].shift(1)
df_daily['DayAvgPrice_2diff'] = df_daily['DayAvgPrice_diff'] - df_daily['DayAvgPrice_diff'].shift(1)

In [18]:
# Полиномиальные
df_daily['POLY_1'] = np.sqrt(df_daily['DayAvgPrice']) + df_daily['IntradayStd'] * 100.
df_daily['POLY_2'] = df_daily['DayAvgPrice'] + (df_daily['IntradayStd'] * 10.) ** 2
df_daily['POLY_3'] = df_daily['DayAvgPrice'] ** 2 / 1000. + df_daily['Volume'] / 100000.

In [19]:
# === Комплексные
df_daily['real_time'], df_daily['imag_time'] = spiral_time_indices(df_daily['date'], df_daily['day_of_week'])

# === Комплексные циклы ===
t = np.arange(len(df_daily))
df_daily['c_week_real'] = np.cos(2 * np.pi * t / 7)
df_daily['c_week_imag'] = np.sin(2 * np.pi * t / 7)

df_daily['c_month_real'] = np.cos(2 * np.pi * t / 30.44)
df_daily['c_month_imag'] = np.sin(2 * np.pi * t / 30.44)

df_daily['c_quarter_real'] = np.cos(2 * np.pi * t / 91.31)
df_daily['c_quarter_imag'] = np.sin(2 * np.pi * t / 91.31)

df_daily['c_year_real'] = np.cos(2 * np.pi * t / 365.25)
df_daily['c_year_imag'] = np.sin(2 * np.pi * t / 365.25)

##### Новые признаки

In [20]:
# добавляем фичи на волатильность Паркинсона
df_daily = add_parkinson_features(df_daily, high_col='high', low_col='low', window=20)

In [21]:
# Сглаживание DayAvgPrice: rolling (5, 10, 20) и EMA
df_daily['DayAvgPrice_roll5']  = df_daily['DayAvgPrice'].rolling(5,  min_periods=5).mean()
df_daily['DayAvgPrice_roll10'] = df_daily['DayAvgPrice'].rolling(10, min_periods=10).mean()
df_daily['DayAvgPrice_roll20'] = df_daily['DayAvgPrice'].rolling(20, min_periods=20).mean()

df_daily['DayAvgPrice_ema5']   = df_daily['DayAvgPrice'].ewm(span=5,  adjust=False, min_periods=5).mean()
df_daily['DayAvgPrice_ema20']  = df_daily['DayAvgPrice'].ewm(span=20, adjust=False, min_periods=20).mean()

In [22]:
# Сглаживание IntradayStd: rolling (5, 10, 20) и EMA
df_daily['IntradayStd_roll5']  = df_daily['IntradayStd'].rolling(5,  min_periods=5).mean()
df_daily['IntradayStd_roll10'] = df_daily['IntradayStd'].rolling(10, min_periods=10).mean()
df_daily['IntradayStd_roll20'] = df_daily['IntradayStd'].rolling(20, min_periods=20).mean()

df_daily['IntradayStd_ema5']   = df_daily['IntradayStd'].ewm(span=5,  adjust=False, min_periods=5).mean()
df_daily['IntradayStd_ema20']  = df_daily['IntradayStd'].ewm(span=20, adjust=False, min_periods=20).mean()

In [23]:
# Новые признаки

w = window_size

# Лаги/доходности
df_daily['log_ret_1'] = np.log(df_daily['DayAvgPrice']).diff()
for k in [1,2,3,4,5,6,7,10]:
    df_daily[f'DAP_{k}'] = df_daily['DayAvgPrice'].shift(k)
    df_daily[f'log_ret_{k}']  = df_daily['log_ret_1'].rolling(k).sum()  # k-step cum return

# Оконные агрегаты
df_daily['mean_w'] = df_daily['DayAvgPrice'].rolling(w, min_periods=w).mean()
df_daily['std_w']  = df_daily['DayAvgPrice'].rolling(w, min_periods=w).std()
df_daily['z_w']    = (df_daily['DayAvgPrice'] - df_daily['mean_w']) / (df_daily['std_w'] + 1e-9)
df_daily['q10_w']  = df_daily['DayAvgPrice'].rolling(w, min_periods=w).quantile(0.10)
df_daily['q90_w']  = df_daily['DayAvgPrice'].rolling(w, min_periods=w).quantile(0.90)

# Наклон тренда (скользящая линрегрессия через cov/var)
def rolling_slope(s, w):
    x = np.arange(len(s))
    mx = pd.Series(x).rolling(w, min_periods=w).mean().values
    my = s.rolling(w, min_periods=w).mean().values
    cov = (pd.Series(x)*s).rolling(w, min_periods=w).mean().values - mx*my
    var = pd.Series(x).rolling(w, min_periods=w).var().values + 1e-9
    return cov/var
df_daily['slope_w'] = rolling_slope(df_daily['DayAvgPrice'], w)

# Волатильность
# df_daily['parkinson_vol'] = (np.log(df_daily['high']/df_daily['low'])**2).rolling(w, min_periods=w).mean()
df_daily['vol_w'] = df_daily['log_ret_1'].rolling(w, min_periods=w).std()

# Объём
df_daily['vol_z'] = (df_daily['Volume'] - df_daily['Volume'].rolling(w).mean()) / (df_daily['Volume'].rolling(w).std() + 1e-9)

# Календарь (синусы – без утечек)
d = pd.to_datetime(df_daily['date'])
df_daily['dow_sin'] = np.sin(2*np.pi*d.dt.dayofweek/7)
df_daily['dow_cos'] = np.cos(2*np.pi*d.dt.dayofweek/7)
df_daily['moy_sin'] = np.sin(2*np.pi*(d.dt.month-1)/12)
df_daily['moy_cos'] = np.cos(2*np.pi*(d.dt.month-1)/12)

In [24]:
# Техиндикаторы с pandas_ta:
df_daily['rsi14'] = ta.rsi(df_daily['DayAvgPrice'], length=14)
macd = ta.macd(df_daily['DayAvgPrice'], fast=12, slow=26, signal=9)
df_daily[['macd','macd_signal','macd_hist']] = macd[['MACD_12_26_9','MACDs_12_26_9','MACDh_12_26_9']]
bb = ta.bbands(df_daily['DayAvgPrice'], length=20)
df_daily[['bb_low','bb_mid','bb_up']] = bb[['BBL_20_2.0','BBM_20_2.0','BBU_20_2.0']]
df_daily['atr14'] = ta.atr(df_daily['high'], df_daily['low'], df_daily['DayAvgPrice'], length=14)

In [25]:
# Hilbert-фаза/амплитуда (сигнал в комплексной форме)
from scipy.signal import hilbert
analytic = pd.Series(df_daily['DayAvgPrice']).rolling(w, min_periods=w).apply(
    lambda x: np.nan if x.isna().any() else np.angle(hilbert(x.values))[-1], raw=False
)
df_daily['phi_hilbert'] = analytic  # фаза конца окна
# при желании — дельта фазы:
df_daily['dphi_hilbert'] = df_daily['phi_hilbert'].diff()

In [26]:
# Wavelet/STFT энергия (устойчивые спектральные фичи)(эскиз на STFT; для продакшена лучше кэшировать)

prices = df_daily['DayAvgPrice'].values
window_stft = 128  # длина окна для rolling STFT (≈ полгода дневных данных, например)

energy_low = []
energy_mid = []
energy_high = []

for i in range(len(prices)):
    x = prices[:i+1]  # берём префикс до текущего момента
    e = stft_energy_tail(x, w=window_stft)
    energy_low.append(e[0])
    energy_mid.append(e[1])
    energy_high.append(e[2])

df_daily['stft_energy_low']  = energy_low
df_daily['stft_energy_mid']  = energy_mid
df_daily['stft_energy_high'] = energy_high

# Вариант для коротких df (len(df) >= 128)
df_daily[['stft_energy_low', 'stft_energy_mid', 'stft_energy_high']] = (
    df_daily[['stft_energy_low', 'stft_energy_mid', 'stft_energy_high']].fillna(0)
)

# !!!Вариант для длинных df (len(df) >> 128) - лучше сделать dropna после создания всех фич!!!
#df_daily = df_daily.dropna(subset=['stft_energy_low', 'stft_energy_mid', 'stft_energy_high'])

In [27]:
len(df_daily)

969

In [28]:
# advanced_features

# df — ваш OHLCV с datetime индексом (часовой/дневной — не важно)
df_daily = build_advanced_features(df_daily, price_col='DayAvgPrice', w_fast=14, w_slow=20)

# Дальше: добавляйте нужные колонки в real_cols и нормируйте по train-окну
new_cols = [
    'gk_sigma','rs_sigma','yz_sigma','adx','chop','kalm_slope',
    'rv20','vol_of_vol','bb_pct_b','bb_bandwidth',
    'ret_overnight','ret_intraday','corr_ret_dlogvol'
]

In [29]:
# Спектральные признаки(собственные значения TOEPLITZ и CORR матриц) + на весь df.dropna()!!!
df_daily = lambdas_C_T(df_daily, window_size)

In [191]:
# df_daily.isna().sum()

##### Создание целевой переменной Target

In [30]:
# Создаём Target, пустые последние значения заменяются предыдущим непустым ffill
time_shift = 1 # другие сдвиги реализуем в рабочем блоке
df_daily['Target'] = df_daily['DayAvgPrice'].shift( - time_shift).fillna(method='ffill')   # fffill
#df_daily['DayAvgPrice'] = df_daily['DayAvgPrice_bfill']                                   # b?fill
#df_daily['Target'] = df_daily['DayAvgPrice_bfill'].shift( - time_shift).fillna(0)
#df_daily = df_daily.drop(['DayAvgPrice_bfill'], axis=1)                                   # ?bfill
#df_daily[['date', 'DayAvgPrice', 'Target', 'DayAvgPrice_bfill']].tail(10)
df_daily[['date', 'DayAvgPrice', 'Target']].tail(10)

Unnamed: 0,date,DayAvgPrice,Target
859,2025-10-30,34.865,34.751945
860,2025-10-31,34.751945,34.606944
861,2025-11-03,34.606944,33.886111
862,2025-11-04,33.886111,34.891389
863,2025-11-05,34.891389,35.163611
864,2025-11-06,35.163611,34.328333
865,2025-11-07,34.328333,35.753889
866,2025-11-10,35.753889,35.886111
867,2025-11-11,35.886111,36.299444
868,2025-11-12,36.299444,36.299444


In [31]:
# Сглаженный таргет (не меняем исходный Target, а создаём Target_smooth)
ema_fast: int = 5
ema_slow: Optional[int] = None
df_daily["Target_smooth"] = smooth_target_ema(df_daily['Target'], span_fast=ema_fast, span_slow=ema_slow)

In [37]:
len(list(df_daily.columns))

107

In [33]:
df_daily.describe()

Unnamed: 0,open,high,low,close,Volume,DayAvgPrice,IntradayStd,day_of_week,day_of_year,Log_Profit,DayAvgPrice_diff,DayAvgPrice_2diff,POLY_1,POLY_2,POLY_3,real_time,imag_time,c_week_real,c_week_imag,c_month_real,c_month_imag,c_quarter_real,c_quarter_imag,c_year_real,c_year_imag,parkinson_vol,parkinson_vol_ma5,parkinson_vol_ma20,parkinson_vol_diff1,parkinson_vol_lag1,DayAvgPrice_roll5,DayAvgPrice_roll10,DayAvgPrice_roll20,DayAvgPrice_ema5,DayAvgPrice_ema20,IntradayStd_roll5,IntradayStd_roll10,IntradayStd_roll20,IntradayStd_ema5,IntradayStd_ema20,log_ret_1,DAP_1,DAP_2,log_ret_2,DAP_3,log_ret_3,DAP_4,log_ret_4,DAP_5,log_ret_5,DAP_6,log_ret_6,DAP_7,log_ret_7,DAP_10,log_ret_10,mean_w,std_w,z_w,q10_w,q90_w,slope_w,vol_w,vol_z,dow_sin,dow_cos,moy_sin,moy_cos,rsi14,macd,macd_signal,macd_hist,bb_low,bb_mid,bb_up,atr14,phi_hilbert,dphi_hilbert,stft_energy_low,stft_energy_mid,stft_energy_high,ret,ret_overnight,ret_intraday,gk_sigma,rs_sigma,yz_sigma,adx,di_plus,di_minus,chop,kalm_slope,rv20,vol_of_vol,bb_pct_b,bb_bandwidth,corr_ret_dlogvol,lambda_C3,lambda_C2,lambda_C1,lambda_T3,lambda_T2,lambda_T1,Target,Target_smooth
count,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0,869.0
mean,34.156525,34.567186,33.747612,34.164287,1612965.0,34.164593,0.197921,2.009206,189.853855,-0.000343,0.01077,0.000448,25.619772,39.50304,17.324513,0.989113,4.707135,-0.000256,0.001121896,-0.010648,-0.002928,-0.01933,0.027244,-0.120747,0.02938,0.01582,0.015817,0.01584,2e-06,0.015818,34.144187,34.123857,34.090618,34.145412,34.087012,0.197813,0.197614,0.197652,0.197815,0.197705,0.000343,34.153823,34.1435,0.000674,34.133321,0.001001,34.125701,0.001248,34.117061,0.001524,34.10891,0.001785,34.102956,0.001976,34.085476,0.00252,33.594667,2.451729,0.200246,30.394665,36.801089,0.014022,0.017834,-0.049037,0.354277,-0.085464,-0.07305,-0.07423135,51.538552,0.058647,0.061784,-0.003137,31.9524,34.090618,36.228836,0.932778,0.036397,1.2e-05,96.501256,0.247587,0.121323,0.000325,9.1e-05,0.000234,0.01584,0.021994,0.019396,35.640585,24.563394,24.073846,47.341355,0.00694,0.018083,0.002115,0.502609,0.130283,-0.070926,0.001148,0.003909,0.994943,0.115145,0.366748,0.518106,34.174822,34.152678
std,5.260152,5.314538,5.190008,5.260955,1133325.0,5.260624,0.119282,1.410104,101.22855,0.017715,0.623181,0.846859,12.045536,11.154363,11.303343,0.018,1.323405,0.707147,0.7078798,0.706927,0.708014,0.707459,0.70678,0.711653,0.692272,0.004896,0.004845,0.004599,0.000714,0.004896,5.239871,5.217039,5.156346,5.22304,5.074889,0.078953,0.070006,0.063247,0.078109,0.062518,0.017715,5.265826,5.271216,0.026035,5.276641,0.031833,5.281138,0.036601,5.285579,0.04087,5.289843,0.044737,5.293004,0.048147,5.299235,0.059003,4.750612,1.11485,1.306478,3.961644,5.840345,0.062431,0.003915,1.012882,0.514688,0.77671,0.703114,0.7042183,14.26523,0.787406,0.742317,0.23445,4.886781,5.156346,5.636351,0.292734,0.192668,0.018011,31.213995,0.081625,0.03994,0.019234,0.012493,0.016554,0.004866,0.009616,0.00783,18.265277,11.247807,11.734754,12.285013,0.175044,0.006468,0.002232,0.335761,0.059345,0.271791,0.001045,0.002779,0.003743,0.060533,0.058695,0.063454,5.256115,5.222116
min,22.95,23.335,22.475,22.7,121268.0,22.799722,0.047015,0.0,2.0,-0.146723,-2.993889,-6.231945,10.229646,25.417374,2.916306,0.958196,3.0,-0.900969,-0.9749279,-0.999991,-0.999998,-0.999997,-0.999997,-0.999998,-0.999999,0.008612,0.00921,0.009645,-0.005184,0.008612,23.355028,23.904708,24.389833,23.456731,24.478884,0.068789,0.085861,0.104822,0.073593,0.098615,-0.086465,22.799722,22.799722,-0.109183,22.799722,-0.123245,22.799722,-0.140571,22.799722,-0.139858,22.799722,-0.153691,22.799722,-0.158116,22.799722,-0.179765,25.675078,0.828346,-2.83633,23.641069,27.655042,-0.130842,0.011098,-1.444818,-0.433884,-0.900969,-1.0,-1.0,20.95221,-2.13622,-1.821542,-0.64026,22.695489,24.389833,25.777201,0.501537,-0.504529,-0.085452,0.0,0.0,0.0,-0.089562,-0.152844,-0.08796,0.009058,0.010089,0.009788,8.030653,1.187951,0.479616,-8.766098,-0.537462,0.007339,0.000125,-0.266926,0.036085,-0.700877,0.000119,0.000324,0.980299,0.015043,0.20239,0.336118,22.799722,23.456731
25%,30.325,30.665,29.985,30.34,919539.0,30.301389,0.122788,1.0,104.0,-0.010213,-0.282222,-0.430278,17.891249,32.728952,10.294955,0.973596,4.0,-0.900969,-0.7818315,-0.71653,-0.709293,-0.731892,-0.676115,-0.813042,-0.641689,0.012009,0.012052,0.012278,-0.000193,0.012009,30.3015,30.30625,30.233486,30.333331,30.219346,0.139299,0.143013,0.140982,0.139877,0.142645,-0.008574,30.290417,30.262778,-0.01258,30.256667,-0.015509,30.245833,-0.020685,30.217778,-0.022782,30.202917,-0.025817,30.158889,-0.03027,30.135694,-0.038651,30.279042,1.784332,-0.885019,28.71475,31.390069,-0.024121,0.01569,-0.728127,0.0,-0.900969,-0.866025,-0.8660254,40.568788,-0.440916,-0.441577,-0.14939,28.836946,30.233486,31.680954,0.690526,-0.076736,-0.010259,78.37005,0.18958,0.093091,-0.009121,-0.003492,-0.009372,0.012223,0.015704,0.014234,20.81836,16.30231,14.923076,40.483475,-0.110593,0.013634,0.000931,0.212674,0.088494,-0.253132,0.000431,0.001988,0.993558,0.071105,0.327237,0.471487,30.303472,30.353493
50%,33.82,34.07,33.459999,33.849998,1294837.0,33.782222,0.169946,2.0,199.0,-0.000941,0.035556,0.029166,22.718347,36.842073,14.094264,0.989048,4.5,-0.222521,-1.371357e-14,-0.022703,-0.004128,-0.028381,0.052273,-0.262452,0.047289,0.014825,0.014812,0.014755,1e-06,0.014825,33.684056,33.647667,33.49525,33.637456,33.405991,0.185983,0.19264,0.193741,0.186818,0.196234,0.000941,33.738056,33.73,0.001215,33.698889,0.000654,33.675833,0.000938,33.6575,0.00073,33.655,0.001217,33.6375,0.000341,33.594444,-0.000602,32.552583,2.277495,0.346182,29.761028,34.698306,0.011423,0.01751,-0.370334,0.433884,-0.222521,0.0,-1.83697e-16,50.491289,0.023845,0.052397,-0.014813,31.694704,33.49525,35.372454,0.915528,0.039498,0.000688,88.845335,0.251321,0.123741,0.00032,0.000856,0.000832,0.014822,0.020722,0.017965,33.247309,23.557161,24.686405,47.487416,-0.005016,0.016501,0.001503,0.507741,0.121351,-0.082468,0.000912,0.003281,0.995579,0.109317,0.381839,0.514718,33.793333,33.664624
75%,37.849998,38.369999,37.4,37.860001,1986772.0,37.930278,0.231679,3.0,275.0,0.008574,0.331944,0.423056,29.059656,43.15078,20.811698,1.004798,5.0,0.62349,0.7818315,0.696076,0.706377,0.685064,0.730366,0.591779,0.712786,0.018323,0.018279,0.018559,0.000205,0.018323,37.8895,37.980167,37.922056,37.945632,37.955848,0.232055,0.237085,0.23136,0.232344,0.228708,0.010213,37.930278,37.930278,0.016292,37.930278,0.019147,37.930278,0.022614,37.930278,0.027444,37.930278,0.031257,37.930278,0.035353,37.930278,0.043108,38.481568,2.889764,1.175411,33.780389,41.837222,0.054325,0.020624,0.26642,0.781831,0.62349,0.5,0.5,61.24592,0.560666,0.536496,0.162363,35.885186,37.922056,40.311201,1.06965,0.167476,0.010264,127.269114,0.310926,0.152773,0.010324,0.004653,0.00939,0.018597,0.025309,0.022364,44.911587,30.782656,32.983508,56.125915,0.118454,0.021288,0.002311,0.793136,0.155333,0.131131,0.001478,0.005212,0.997603,0.15763,0.40963,0.574523,37.930278,37.945632
max,47.41,47.64,46.87,47.64,7742448.0,47.315,0.971442,4.0,365.0,0.086465,5.962778,5.611945,103.330032,132.634413,79.331036,1.020198,7.0,1.0,0.9749279,1.0,0.999998,0.999988,1.0,0.999991,0.999995,0.034251,0.033232,0.032113,0.006287,0.034251,46.958333,46.875639,46.480736,46.815664,45.520425,0.617681,0.553631,0.43086,0.623092,0.459443,0.146723,47.315,47.315,0.156065,47.315,0.158446,47.315,0.170473,47.315,0.178261,47.315,0.180421,47.315,0.19308,47.315,0.208193,41.145458,6.061485,4.338092,37.458222,46.620444,0.185976,0.029348,5.575053,0.974928,1.0,1.0,1.0,93.924552,3.033822,2.802738,0.787132,45.410689,46.480736,50.282102,2.106821,0.489061,0.0998,139.584155,0.42385,0.204655,0.166402,0.109666,0.107444,0.035076,0.065589,0.054778,91.02493,71.447824,56.726908,85.211578,0.62861,0.039381,0.015854,1.404195,0.397732,0.488045,0.005543,0.014391,0.999556,0.32854,0.478968,0.644102,47.315,46.815664


In [34]:
df_daily.head()

Unnamed: 0,date,open,high,low,close,Volume,DayAvgPrice,IntradayStd,day_of_week,day_of_year,Log_Profit,DayAvgPrice_diff,DayAvgPrice_2diff,POLY_1,POLY_2,POLY_3,real_time,imag_time,c_week_real,c_week_imag,c_month_real,c_month_imag,c_quarter_real,c_quarter_imag,c_year_real,c_year_imag,parkinson_vol,parkinson_vol_ma5,parkinson_vol_ma20,parkinson_vol_diff1,parkinson_vol_lag1,DayAvgPrice_roll5,DayAvgPrice_roll10,DayAvgPrice_roll20,DayAvgPrice_ema5,DayAvgPrice_ema20,IntradayStd_roll5,IntradayStd_roll10,IntradayStd_roll20,IntradayStd_ema5,IntradayStd_ema20,log_ret_1,DAP_1,DAP_2,log_ret_2,DAP_3,log_ret_3,DAP_4,log_ret_4,DAP_5,log_ret_5,DAP_6,log_ret_6,DAP_7,log_ret_7,DAP_10,log_ret_10,mean_w,std_w,z_w,q10_w,q90_w,slope_w,vol_w,vol_z,dow_sin,dow_cos,moy_sin,moy_cos,rsi14,macd,macd_signal,macd_hist,bb_low,bb_mid,bb_up,atr14,phi_hilbert,dphi_hilbert,stft_energy_low,stft_energy_mid,stft_energy_high,ret,ret_overnight,ret_intraday,gk_sigma,rs_sigma,yz_sigma,adx,di_plus,di_minus,chop,kalm_slope,rv20,vol_of_vol,bb_pct_b,bb_bandwidth,corr_ret_dlogvol,lambda_C3,lambda_C2,lambda_C1,lambda_T3,lambda_T2,lambda_T1,Target,Target_smooth
0,2022-06-21,27.31,27.88,27.030001,27.18,931103,27.409861,0.246471,1,172,-0.017286,0.469722,0.445278,29.882515,33.484643,10.06233,0.958196,4.0,-0.222521,0.974928,-0.21907,0.975709,0.826479,0.562968,-0.14889,0.988854,0.0156,0.015481,0.017555,0.000307,0.015293,27.176111,27.950361,28.614931,27.338104,28.020139,0.199733,0.200091,0.180634,0.205849,0.194253,0.017286,26.940139,26.915694,0.018193,26.908194,0.018472,27.706667,-0.01077,27.655139,-0.008909,27.808333,-0.014433,28.712639,-0.046435,29.586667,-0.076421,27.271078,2.989436,0.046424,23.701931,31.797597,-0.018603,0.027784,-0.713442,0.781831,0.62349,0.5,-0.866025,44.845841,-0.092564,0.27017,-0.362734,26.450865,28.614931,30.778996,0.883096,-0.187106,0.008084,0.0,0.0,0.0,0.000736,0.005508,-0.004772,0.01648,0.021895,0.018909,29.90562,13.951396,35.103514,40.003375,-0.176382,0.016681,0.002036,0.197697,0.158431,-0.434182,0.00059,0.00232,0.99709,0.144378,0.378687,0.476936,26.235417,26.235417
1,2022-06-22,26.77,26.78,26.025,26.14,2534998,26.235417,0.163473,2,173,0.043793,-1.174445,-1.644167,21.469346,28.907757,26.038277,0.958246,4.5,-0.900969,0.433884,-0.414391,0.910099,0.785815,0.618462,-0.165878,0.986146,0.015875,0.015556,0.01731,0.000275,0.0156,26.881861,27.586,28.538458,26.970542,27.850165,0.20726,0.20161,0.180641,0.191724,0.191321,-0.043793,27.409861,26.940139,-0.026507,26.915694,-0.025599,26.908194,-0.025321,27.706667,-0.054563,27.655139,-0.052701,27.808333,-0.058225,29.879028,-0.130047,27.217166,2.958547,-0.331835,23.701931,31.797597,-0.016599,0.028105,0.360883,0.974928,-0.222521,0.5,-0.866025,36.548407,-0.216133,0.172909,-0.389042,26.161973,28.538458,30.914943,0.918937,-0.210845,-0.023739,0.0,0.0,0.0,-0.039015,-0.0152,-0.023815,0.016513,0.022514,0.019142,31.297289,9.73067,42.61513,35.825056,-0.204039,0.018554,0.002017,0.03657,0.175501,-0.507981,0.000605,0.002343,0.997052,0.153363,0.375136,0.471501,25.593472,26.021435
2,2022-06-23,26.0,26.07,25.205,25.21,2669225,25.593472,0.241397,3,174,0.024773,-0.641944,0.5325,29.198717,31.420732,27.347276,0.958296,5.0,-0.900969,-0.433884,-0.592119,0.805851,0.741431,0.671029,-0.182817,0.983147,0.015962,0.015605,0.017066,8.7e-05,0.015875,26.618917,27.188556,28.383542,26.511518,27.635242,0.19563,0.204492,0.18284,0.208281,0.196091,-0.024773,26.235417,27.409861,-0.068566,26.940139,-0.05128,26.915694,-0.050372,26.908194,-0.050094,27.706667,-0.079336,27.655139,-0.077474,29.567917,-0.144353,27.144106,2.90656,-0.533494,23.701931,31.745333,-0.014145,0.027884,0.442074,0.433884,-0.900969,0.5,-0.866025,32.958832,-0.361693,0.065989,-0.427681,25.685103,28.383542,31.08198,0.9269,-0.229675,-0.01883,0.0,0.0,0.0,-0.036226,-0.00537,-0.030856,0.016278,0.022152,0.018885,33.016922,9.519763,48.533799,28.849285,-0.237763,0.018925,0.001968,-0.032751,0.203989,-0.529925,0.000619,0.002463,0.996918,0.164679,0.371982,0.463339,25.335556,25.792809
3,2022-06-24,25.425,25.765,24.975,25.73,2229549,25.335556,0.229558,4,175,0.010129,-0.257917,0.384028,27.989232,30.605238,22.93738,0.958346,7.0,-0.222521,-0.974928,-0.744708,0.66739,0.693538,0.72042,-0.199702,0.979857,0.016295,0.015805,0.01686,0.000333,0.015962,26.302889,26.850847,28.202972,26.119531,27.416224,0.205281,0.20533,0.187339,0.215374,0.199278,-0.010129,25.593472,26.235417,-0.034902,27.409861,-0.078694,26.940139,-0.061409,26.915694,-0.060501,26.908194,-0.060222,27.706667,-0.089464,28.712639,-0.125129,27.072899,2.861987,-0.607041,23.701931,31.723083,-0.012021,0.027873,0.140669,-0.433884,-0.900969,0.5,-0.866025,31.615328,-0.492188,-0.045646,-0.446541,25.212035,28.202972,31.19391,0.917121,-0.230156,-0.000481,0.0,0.0,0.0,0.020417,0.008492,0.011925,0.01672,0.022389,0.019422,36.379304,8.12974,50.042136,27.815735,-0.264555,0.019286,0.001926,0.111742,0.21872,-0.527309,0.00063,0.002592,0.996778,0.174294,0.371159,0.454547,26.350139,25.978585
4,2022-06-27,26.040001,26.86,25.965,26.09,1031036,26.350139,0.23753,0,178,-0.039265,1.014583,1.2725,28.886272,31.992205,11.00469,0.958496,3.0,0.62349,-0.781831,-0.865681,0.500596,0.642363,0.766401,-0.216527,0.976277,0.016697,0.016086,0.016691,0.000402,0.016295,26.184889,26.705028,28.044514,26.1964,27.314692,0.223686,0.209366,0.189928,0.222759,0.202921,0.039265,25.335556,25.593472,0.029136,26.235417,0.004363,27.409861,-0.039429,26.940139,-0.022144,26.915694,-0.021236,26.908194,-0.020957,27.808333,-0.053862,27.01843,2.822732,-0.236753,23.701931,31.697125,-0.009612,0.028122,-0.665284,0.0,1.0,0.5,-0.866025,41.685497,-0.507883,-0.138094,-0.369789,25.013784,28.044514,31.075244,0.960501,-0.230141,1.5e-05,0.0,0.0,0.0,0.013894,0.011976,0.001918,0.01727,0.023459,0.020135,39.371643,16.579478,43.903432,28.172745,-0.249719,0.019619,0.00188,0.203567,0.225705,-0.551621,0.000644,0.002681,0.996675,0.177886,0.371627,0.450488,26.295417,26.084196


##### Записываем сформированный датасет

In [170]:
# df_daily['Target'].iloc[-1] = 0

In [35]:
# Добавляем количество дней между наблюдениями
df_daily['days_since_prev'] = df_daily['date'].diff().dt.days.fillna(1)

In [36]:
# Записываем сформированный датасет
df_daily.to_csv('../data_archiv/DTG/DTG_new_fea_to_12_11_2025_1d_w100_noweekend.csv', index=False)

In [95]:
# Проверка на утечку данных в будущее. 
# Сравниваются датафреймы, созданные в разные даты.
df_1 = df_daily.copy()

In [96]:
df_2 = pd.read_csv('../data_archiv/DTG/DTG_new_fea_to_29_10_2025_1d_w92.csv')

In [97]:
cols = [c for c in df_1.columns if c not in ('date',)]
df_1 = df_1[:len(df_2)]

In [98]:
for col in cols:
    diff = round((df_1[col] - df_2[col]).abs().max(), 6)
    print("max abs diff on tail:", col, diff)

max abs diff on tail: open 0.0
max abs diff on tail: high 0.0
max abs diff on tail: low 0.0
max abs diff on tail: close 0.0
max abs diff on tail: DayAvgPrice 0.0
max abs diff on tail: IntradayStd 0.0
max abs diff on tail: Volume 0.0
max abs diff on tail: day_of_week 0
max abs diff on tail: day_of_year 0
max abs diff on tail: Log_Profit 0.0
max abs diff on tail: DayAvgPrice_diff 0.0
max abs diff on tail: DayAvgPrice_2diff 0.0
max abs diff on tail: POLY_1 0.0
max abs diff on tail: POLY_2 0.0
max abs diff on tail: POLY_3 0.0
max abs diff on tail: real_time 0.0
max abs diff on tail: imag_time 0.0
max abs diff on tail: c_week_real 0.0
max abs diff on tail: c_week_imag 0.0
max abs diff on tail: c_month_real 0.0
max abs diff on tail: c_month_imag 0.0
max abs diff on tail: c_quarter_real 0.0
max abs diff on tail: c_quarter_imag 0.0
max abs diff on tail: c_year_real 0.0
max abs diff on tail: c_year_imag 0.0
max abs diff on tail: parkinson_vol 0.0
max abs diff on tail: parkinson_vol_ma5 0.0
max 