In [1]:
import yfinance as yf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import datetime
import pandas_ta as ta
from scipy.signal import stft
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None, 'display.max_columns', None)

In [2]:

def fill_missing_dates_daily_2(df, date_col='date'):
    """
    !!! Заменяет значения 'IntradayStd' и 'Volume' нулями в праздники и выходные дни. !!!
    Заполняет пропущенные даты в DataFrame (включая выходные и праздники),
    копируя значения (ffill) с предыдущей даты.
    Удаляет дублирующиеся даты, если они есть, оставляя последнюю запись.
    """
    # Переводим столбец дат в datetime
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    # На всякий случай сортируем
    df.sort_values(by=date_col, inplace=True)

    # Удаляем дубликаты по дате, оставляем "последнюю" запись
    df.drop_duplicates(subset=[date_col], keep='last', inplace=True)

    # Ставим дату в индекс
    df.set_index(date_col, inplace=True)

    # Убеждаемся, что индекс уникален
    if not df.index.is_unique:
        raise ValueError("Индекс дат по-прежнему не уникален. Проверь данные.")
    
    df_1 = df.drop(['DayAvgPrice'], axis=1)
    df_2 = df.drop(['IntradayStd', 'Volume'], axis=1)
    df_3 = df.drop(['IntradayStd', 'Volume'], axis=1)

    df_1 = df_1.asfreq(freq ='D',  fill_value = 0.0)
    df_2 = df_2.asfreq(freq ='D',  method='ffill')
    df_3 = df_3.asfreq(freq ='D',  method='bfill')
    
    df_2['IntradayStd'] = df_1['IntradayStd'].copy()
    df_2['Volume'] = df_1['Volume'].copy()
    #df_2['DayAvgPrice_bfill'] = df_3['DayAvgPrice'].copy()

    # Приводим к дневной частоте и заполняем пропуски методом ffill
    #df_daily = df.asfreq('D', method='ffill')

    # Сбрасываем индекс, если нужно вернуть столбец с датами
    df_2.reset_index(inplace=True)
    df_2.rename(columns={'index': date_col}, inplace=True)

    return df_2

In [3]:
def day_of_week_to_imag_2(df):
    """
    day_of_week : pd.Series - день недели в целых числах начиная с пн = О
    period: float — длина одного цикла (например, 7, если недельный цикл)
    """
    #df = df.replace({0: 1.5, 1: 2.2, 2: 2.8, 3: 3., 4: 3.5, 5: 0.5, 6: 0.2})
    #df = df.replace({0: 0.7, 1: 0.8, 2: 1.0, 3: 1.1, 4: 0.9, 5: 0.2, 6: 0.1})
    df = df.replace({0: 3., 1: 4., 2: 4.5, 3: 5., 4: 7., 5: 0.5, 6: 0.1})
    #df = df.replace({0: 10., 1: 14., 2: 17., 3: 25., 4: 27., 5: 0.5, 6: 0.1})

    return df

In [4]:
def spiral_time_indices(date, day_of_week):
    """
    dates : np.array или pd.Series — упорядоченные временные метки (в днях или долях суток)
    period: float — длина одного цикла (например, 7, если недельный цикл)
    """
    # Пусть у нас dates — это массив float (количество дней с какого-то нуля)
    # Если это pd.Datetime, нужно привести к числу дней:
    #t = (date - date[0]).dt.days.astype(float)
    # t = date - date[0]  # если dates уже float
    # t = date
    #real_part = t.astype(float) / 100.

    #real_part = (date.index + 100.) / 100.
    imag_part = day_of_week_to_imag_2(day_of_week)
    #imag_part = day_of_week_to_imag_2(day_of_week) + np.around(np.log(real_part), 3)
    #imag_part = day_of_week_to_imag_2(day_of_week) + real_part / 10.
    
    real_part = date.apply(lambda x: datetime.datetime.timestamp(x) / 86400 / 20000.) # unix days
    #real_part = real_part - real_part[0] + 1.
    #real_part = (date.index + 1.) / 100.
    #real_part = np.log(date.index + 1.)
    #imag_part = day_of_week_to_imag_2(day_of_week)
    #imag_part = 2 * np.pi * day_of_week / 7.
    #imag_part = np.sin(2 * np.pi * real_part/ 7.)
    # real_part = (date.index + 100.) / 100.
    # imag_part = 2 * np.pi * real_part  / 7.
    #real_part = 1.
    #imag_part = 0.
    return real_part, imag_part

In [5]:
# window_cov_eigenvalues(ts, 20) вернёт массив формы (M, n\_features), 
# где M = {len(ts) - window_size + 1}.
# Cov(3f) - ковариационная матрица для 3-х фич, даёт 3 eigenvalues на окно 

def window_cov_eigenvalues(ts, window_size=21):
    """
    Пример: берем окно из time series (одна или несколько фич),
    строим 'ковариацию' признаков за это окно,
    считаем собственные числа (lambda).
    """
    # Будем возвращать список списков (по окну) из eigenvalues
    all_eigs = []

    for start in range(len(ts) - window_size + 1):
        end = start + window_size
        window_data = ts[start:end]  # (window_size,) если ts - 1D

        # Превратим это в 2D вид (если много признаков, shape=(window_size, n_features))
        # Здесь предположим, что ts уже (N, n_features)
        # Но если ts=(N,), сделаем "fake" 2D
        if len(window_data.shape) == 1:
            window_data = window_data.reshape(-1, 1)  # (window_size, 1)

        # Ковариация: shape=(n_features, n_features)
        # rowvar=False => столбцы - это фичи
        cov_mat = np.cov(window_data, rowvar=False)

        # Собственные числа симметричной матрицы => real-valued
        eigvals = np.linalg.eigvalsh(cov_mat)

        all_eigs.append(eigvals)
    
    return np.array(all_eigs)

In [6]:
def create_lambdas_dataset(eigs, horizon=1):
    """
    eigs: (M, d) - M таймшагов, d - кол-во собственных чисел
    return X, y
    X[t] = eigs[t]
    y[t] = eigs[t + horizon]
    """
    X, Y = [], []
    #for i in range(len(eigs) - horizon):
    for i in range(len(eigs)):  # сейчас делаем только Х
        X.append(eigs[i])
        #Y.append(eigs[i + horizon])
    return np.array(X)#, np.array(Y)

In [7]:
# TOEPLITZ
# 1.1. Интерпретация
# 	•	\mathbf{T}[i, i] = r(0) — это просто средняя энергия сигнала на окне (среднее x_n^2).
# 	•	\mathbf{T}[i, j] при i \neq j показывает автокорреляцию на лаг |i-j|.
# 	•	Матрица симметрична, так как |i-j| = |j-i|.
# T(1f) - Toeplitz - матрица для одной фичи, даёт = window_size eigenvalues на окно window_size.

def autocorr_centered(x):
    x = x - np.mean(x)
    c = np.correlate(x, x, mode='full')
    # c[len(x)-1] -- это r(0)
    # Возвращаем часть от lag=0..W-1
    return c[len(x)-1:len(x)-1 + len(x)]

def build_toeplitz_autocorr_centered(window_data):
    """
    Создаёт Toeplitz-матрицу автокорреляции для одномерного окна window_data.
    window_data: np.array формы (W,).
    return: матрица T формы (W, W).
    """
    window_data = window_data - window_data.mean()
    W = len(window_data)
    r = autocorr_centered(window_data)  # shape (W, ), r[0] = r(0), r[1] = r(1), etc.
    # Нормируем?
    r /= (W - np.arange(W))  # иногда нужна такая нормировка

    T = np.zeros((W, W))
    for i in range(W):
        for j in range(W):
            lag = abs(i - j)
            T[i, j] = r[lag]
    return T

In [8]:
# Допустим, у нас есть весь ряд DayAvgPrice длины N. Мы хотим сделать окна длины W и 
# на каждом окне построить Toeplitz-матрицу, затем вычислить её собственные числа.

def toeplitz_eig_dayavg(price_series, window_size=21):
    """
    price_series: np.array формы (N, ), одномерный ряд (DayAvgPrice).
    window_size: длина окна
    return: массив eigenvalues, shape (M, W), где M ~ (N - window_size + 1).
    """
    N = len(price_series)
    all_eigvals = []

    for start in range(N - window_size + 1):
        end = start + window_size
        window_data = price_series[start:end].to_numpy()
        T = build_toeplitz_autocorr_centered(window_data)
        # Собственные числа симметричной (автокорреляционной) матрицы
        eigvals = np.linalg.eigvalsh(T)
        all_eigvals.append(eigvals)

    return np.array(all_eigvals)  # shape (M, window_size)

In [9]:
def lambdas_C_T(df, window_size):

    # вычисляет собственные значения типа С и Т на окне window_size
    # возвращает датафрейм с добавленными столбцами с.з. и без NaN

    #eigs_data = window_cov_eigenvalues(df[['DayAvgPrice', 'IntradayStd', 'Volume']], window_size)
    eigs_data = window_cov_eigenvalues(df[['DayAvgPrice', 'IntradayStd', 'close']], window_size)
    lambdas_C_arr = create_lambdas_dataset(eigs_data, horizon=1)
    lambdas_T_arr = toeplitz_eig_dayavg(df['DayAvgPrice'], window_size)
    df = df.loc[window_size - 1:].reset_index(drop=True)
    
    # Присоединяем eigenvalues
    df = pd.concat([df, pd.DataFrame(lambdas_C_arr, columns=["lambda_C3", "lambda_C2", "lambda_C1"])], axis=1)
    #df_lambda = pd.concat([df_lambda, pd.DataFrame(lambdas_T_arr[:, -3:], columns=["lambda_T3", "lambda_T2", "lambda_T1"])], axis=1)
    df = pd.concat([df, pd.DataFrame(lambdas_T_arr[:, -5:], columns=["lambda_T5", "lambda_T4", "lambda_T3", "lambda_T2", "lambda_T1"])], axis=1)
    
    # Нормировка столбцов lambda_T (как вариант)
    df['lambda_T_sum'] = df['lambda_T1'] + df['lambda_T2'] + df['lambda_T3']
    df['lambda_T1'] = df['lambda_T1'] / df['lambda_T_sum']
    df['lambda_T2'] = df['lambda_T2'] / df['lambda_T_sum']
    df['lambda_T3'] = df['lambda_T3'] / df['lambda_T_sum']
    df = df.drop(['lambda_T_sum', "lambda_T5", "lambda_T4"], axis=1)

    # Нормировка столбцов lambda_C (как вариант)
    df['lambda_C_sum'] = df['lambda_C1'] + df['lambda_C2'] + df['lambda_C3']
    df['lambda_C1'] = df['lambda_C1'] / df['lambda_C_sum']
    df['lambda_C2'] = df['lambda_C2'] / df['lambda_C_sum']
    df['lambda_C3'] = df['lambda_C3'] / df['lambda_C_sum']
    df = df.drop(['lambda_C_sum'], axis=1)



    df = df.dropna()
    df = df.reset_index(drop=True)

    return df
    

In [10]:
# =========================
# Target smoothing
# =========================
from typing import Iterator, Tuple, Optional, List

def smooth_target_ema(
    y: pd.Series, span_fast: int = 5, span_slow: Optional[int] = None
) -> pd.Series:
    """
    EMA-сглаживание таргета.
    - Если указан только span_fast: обычная EMA.
    - Если указан span_slow: двойная EMA (EMA(EMA(y))) — сильнее подавляет шум.
    """
    y1 = y.ewm(span=span_fast, adjust=False).mean()
    if span_slow is None:
        return y1
    y2 = y1.ewm(span=span_slow, adjust=False).mean()
    return y2

In [11]:
# =========================
# Parkinson vola
# =========================
def add_parkinson_features(
    df: pd.DataFrame,
    high_col: str = "high",
    low_col: str  = "low",
    window: int = 20,
    make_derivatives: bool = True,
    winsor_q: float = 0.995,   # срез экстремумов (опционально)
) -> pd.DataFrame:
    df = df.copy()

    # базовая проверка столбцов
    if high_col not in df.columns or low_col not in df.columns:
        raise ValueError(f"Columns `{high_col}` and `{low_col}` must be in df")

    # безопасное отношение high/low
    ratio = (df[high_col] / df[low_col]).replace([np.inf, -np.inf], np.nan)

    # классическая паркинсоновская дисперсия и волатильность
    parkinson_var = (np.log(ratio) ** 2).rolling(window, min_periods=window).mean()
    parkinson_vol = np.sqrt(parkinson_var / (4 * np.log(2)))

    # winsorize для редких всплесков (по желанию)
    # if winsor_q:
    #     q = parkinson_vol.quantile(winsor_q)
    #     parkinson_vol = parkinson_vol.clip(upper=q)

    df["parkinson_vol"] = parkinson_vol.round(6)

    if make_derivatives:
        # сглаживание и динамика (без заглядывания вперёд)
        df["parkinson_vol_ma5"]   = df["parkinson_vol"].rolling(5,  min_periods=5).mean().round(6)
        df["parkinson_vol_ma20"]  = df["parkinson_vol"].rolling(20, min_periods=20).mean().round(6)
        df["parkinson_vol_diff1"] = df["parkinson_vol"].diff(1).round(6)
        df["parkinson_vol_lag1"]  = df["parkinson_vol"].shift(1).round(6)

    # финальные NaN на ранних барах — оставляем; их отфильтрует твой датасет окон
    return df

In [12]:
# Wavelet/STFT энергия (устойчивые спектральные фичи)(эскиз на STFT; для продакшена лучше кэшировать)

def stft_energy_tail(x: np.ndarray, w: int = 64) -> list:
    """
    Возвращает 3 числа — энергию сигнала в низких, средних и высоких частотах
    за последние w наблюдений x.
    """
    if len(x) < w or np.isnan(x).any():
        return [np.nan, np.nan, np.nan]
    
    # Берём последний кусок длиной w
    x_tail = x[-w:]
    f, t, Z = stft(x_tail, nperseg=w//2)
    S = np.abs(Z) ** 2  # спектральная плотность мощности

    # усредняем по времени
    S_mean = np.mean(S, axis=1)

    # делим на 3 диапазона частот
    bands = np.array_split(S_mean, 3)
    energies = [np.mean(b) for b in bands]
    return energies

In [13]:
# bild_advanced_features

# ========= helpers =========
def _safe_div(a, b):
    return a / b.replace(0, np.nan)

def _rolling_cov(x, y, w):
    return (x.rolling(w).mean()*y.rolling(w).mean() - (x*y).rolling(w).mean()) * (-1)  # не используем, см. ниже

# ========= базовые конструкции =========
def realized_vol(ret, w):
    return ret.rolling(w, min_periods=w).std(ddof=1)

def bbands(close, w=20, k=2.0):
    ma = close.rolling(w, min_periods=w).mean()
    sd = close.rolling(w, min_periods=w).std(ddof=1)
    bb_up = ma + k*sd
    bb_mid = ma
    bb_low = ma - k*sd
    # безразмерные и устойчивые к масштабу:
    pct_b = (close - bb_low) / (bb_up - bb_low)
    bandwidth = _safe_div(bb_up - bb_low, bb_mid.abs())
    return bb_low, bb_mid, bb_up, pct_b, bandwidth

def atr(df, n=14):
    h, l, c = df['high'], df['low'], df['close']
    prev_c = c.shift(1)
    tr = pd.concat([
        (h - l),
        (h - prev_c).abs(),
        (l - prev_c).abs()
    ], axis=1).max(axis=1)
    return tr.rolling(n, min_periods=n).mean()

def adx(df, n=14):
    h, l, c = df['high'], df['low'], df['close']
    up_move = h.diff()
    down_move = -l.diff()
    plus_dm = np.where((up_move > down_move) & (up_move > 0), up_move, 0.0)
    minus_dm = np.where((down_move > up_move) & (down_move > 0), down_move, 0.0)
    tr = atr(df, n)*n  # вернуть к сумме TR
    plus_di = 100 * pd.Series(plus_dm, index=h.index).rolling(n, min_periods=n).sum() / tr
    minus_di= 100 * pd.Series(minus_dm, index=h.index).rolling(n, min_periods=n).sum() / tr
    dx = 100 * (plus_di - minus_di).abs() / (plus_di + minus_di).replace(0, np.nan)
    adx = dx.rolling(n, min_periods=n).mean()
    return adx, plus_di, minus_di

def choppiness(df, n=14):
    _atr = atr(df, n)
    hi = df['high'].rolling(n, min_periods=n).max()
    lo = df['low'].rolling(n, min_periods=n).min()
    denom = (hi - lo).replace(0, np.nan)
    return 100 * np.log(_atr.rolling(n, min_periods=n).sum() / denom) / np.log(n)

def garman_klass_sigma(df, w=20):
    u = np.log(df['high']/df['open'])
    d = np.log(df['low']/df['open'])
    c = np.log(df['close']/df['open'])
    var = (0.5*(u - d)**2 - (2*np.log(2)-1)*c**2).rolling(w, min_periods=w).mean()
    return np.sqrt(var.clip(lower=0))

def rogers_satchell_sigma(df, w=20):
    u = np.log(df['high']/df['close'].shift(1))
    d = np.log(df['low']/df['close'].shift(1))
    c = np.log(df['close']/df['open'])
    var = (u*(u-c) + d*(d-c)).rolling(w, min_periods=w).mean()
    return np.sqrt(var.clip(lower=0))

def yang_zhang_sigma(df, w=20):
    oc = np.log(df['open']/df['close'].shift(1))
    co = np.log(df['close']/df['open'])
    k = 0.34/(1.34 + (w+1)/(w-1))
    oc2 = oc.rolling(w, min_periods=w).var(ddof=1)
    co2 = co.rolling(w, min_periods=w).var(ddof=1)
    rs  = (np.log(df['high']/df['close']) * np.log(df['high']/df['open'])
          + np.log(df['low']/df['close'])  * np.log(df['low']/df['open'])).rolling(w, min_periods=w).mean()
    var = oc2 + k*co2 + (1-k)*rs
    return np.sqrt(var.clip(lower=0))

def amihud_illiquidity(ret, volume, w=20):
    illiq = _safe_div(ret.abs(), volume.replace(0, np.nan))
    return illiq.rolling(w, min_periods=w).mean()

def roll_measure(close, w=20):
    dp = close.diff()
    cov = (dp * dp.shift(1)).rolling(w, min_periods=w).mean()
    sigma_spread = -2 * cov
    sigma_spread = sigma_spread.mask(sigma_spread < 0)  # по определению
    return sigma_spread

def vol_of_vol(vol_series, w=20):
    return vol_series.rolling(w, min_periods=w).std(ddof=1)

def kalman_slope(y, q=1e-5, r=1e-2):
    """
    Простая 1D модель: уровень+наклон. Возвращает оценку slope.
    q,r — дисперсии процесса/измерения.
    """
    n = len(y)
    if n == 0: return pd.Series(dtype=float)
    # состояние [level, slope]
    x = np.array([y.fillna(method='ffill').iloc[0], 0.0], dtype=float)
    P = np.eye(2)
    F = np.array([[1.0, 1.0],
                  [0.0, 1.0]])
    Q = q * np.array([[0.25, 0.5],
                      [0.5,  1.0]])
    H = np.array([[1.0, 0.0]])
    R = np.array([[r]])
    slopes = []
    for z in y.fillna(method='ffill').values:
        # predict
        x = F @ x
        P = F @ P @ F.T + Q
        # update
        yk = z - (H @ x)
        S = H @ P @ H.T + R
        K = (P @ H.T) @ np.linalg.inv(S)
        x = x + (K @ yk).ravel()
        P = (np.eye(2) - K @ H) @ P
        slopes.append(x[1])
    return pd.Series(slopes, index=y.index)

# ========= главный конструктор фич =========
def build_advanced_features(df: pd.DataFrame,
                            price_col: str = 'close',
                            w_fast: int = 14,
                            w_slow: int = 20) -> pd.DataFrame:
    """
    Ожидает df с колонками: open, high, low, close, volume (индекс — datetime).
    Возвращает df с новыми фичами (без нормировки).
    """
    df = df.copy()
    for col in ['open','high','low','close','Volume']:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")
    df = df.sort_index()

    # 0) базовые ретёрны
    df['ret'] = np.log(df['close']/df['close'].shift(1))
    df['ret_overnight'] = np.log(df['open']/df['close'].shift(1))
    df['ret_intraday']  = np.log(df['close']/df['open'])

    # 1) волатильности (улучшенные)
    df['gk_sigma']  = garman_klass_sigma(df, w=w_slow)
    df['rs_sigma']  = rogers_satchell_sigma(df, w=w_slow)
    df['yz_sigma']  = yang_zhang_sigma(df, w=w_slow)

    # 2) режим/тренд
    df['adx'], df['di_plus'], df['di_minus'] = adx(df, n=w_fast)
    df['chop'] = choppiness(df, n=w_fast)
    df['kalm_slope'] = kalman_slope(df[price_col])

    # 3) объёмы/ликвидность
    # df['amihud'] = amihud_illiquidity(df['ret'], df['Volume'], w=w_slow)
    # df['roll_sigma_spread'] = roll_measure(df['close'], w=w_slow)

    # 4) вола-волы и связки
    df['rv20'] = realized_vol(df['ret'], w=w_slow)
    df['vol_of_vol'] = vol_of_vol(df['rv20'], w=w_fast)

    # 5) Bollinger в безразмерном виде
    bb_low, bb_mid, bb_up, pct_b, bw = bbands(df['close'], w=w_slow, k=2.0)
    df['bb_pct_b'] = pct_b
    df['bb_bandwidth'] = bw

    # 6) взаимодействие цены и объёма
    dlog_vol = np.log1p(df['Volume']).diff()
    df['corr_ret_dlogvol'] = df['ret'].rolling(w_slow, min_periods=w_slow).corr(dlog_vol)

    # аккуратный старт: удалим строки до прогрева окон (потом они удаляются)
    # warmup = max(w_fast, w_slow) + 1
    # return df.iloc[warmup:].copy()
    return df.copy()

### Дополнение архивного файла свежими данными.

In [319]:
# Set variables
ticker = "BAYN.DE"
start_date = "2020-09-04"
end_date = "2020-09-27"

In [None]:
# Download new stock data
data = yf.download(ticker, start=start_date, end=end_date, interval='1h', auto_adjust=True)
data.columns = [f'{Price}' for Price, Ticker in data.columns]
data['date'] = data.index.date
data = data.reset_index().drop(columns=['Datetime'])
data.columns = data.columns.str.lower()
data

In [None]:
# load old data from data_archiv
data_old = pd.read_csv('../data_archiv/MBG/MBG_to12_06_2025.csv')
data_old.tail(10)

In [None]:
# load old data from data_archiv
data_new = pd.read_csv('../data_archiv/MBG/MB_13_06_2025to12_08_2025.csv')
data_new.head(10)

In [None]:
# Объединяем с новыми данными
data_fresh = pd.concat([data_old, data_new], ignore_index=True)
data_fresh.tail(20)

In [59]:
# Save
data_fresh.to_csv('../data_archiv/MBG/MBG_fresh.csv', index=False)

##### Загружаем свежие данные с Yahoo Finance

In [54]:
# Set variables
ticker = "DTG.DE"
start_date = "2025-11-13"
end_date = "2025-11-14"

In [55]:
# Download new stock data
data = yf.download(ticker, start=start_date, end=end_date, interval='1h', auto_adjust=True)
data.columns = [f'{Price}' for Price, Ticker in data.columns]
data['date'] = data.index.date
data = data.reset_index().drop(columns=['Datetime'])
data.columns = data.columns.str.lower()
data

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,close,high,low,open,volume,date
0,36.25,36.369999,36.150002,36.189999,0,2025-11-13
1,36.150002,36.310001,36.040001,36.25,138735,2025-11-13
2,35.98,36.130001,35.970001,36.130001,19781,2025-11-13
3,36.02,36.02,35.91,35.98,31517,2025-11-13
4,35.959999,36.049999,35.939999,35.990002,10612,2025-11-13
5,35.91,36.049999,35.869999,35.939999,20074,2025-11-13
6,35.959999,36.0,35.860001,35.900002,32631,2025-11-13
7,35.779999,35.950001,35.740002,35.950001,42725,2025-11-13
8,35.810001,35.919998,35.73,35.77,70158,2025-11-13


In [62]:
# load old data from data_archiv
data_old = pd.read_csv('../data_archiv/DTG/DTG_fresh.csv')
data_old.tail(10)

Unnamed: 0,close,high,low,open,volume,date
8689,35.98,35.990002,35.830002,35.849998,27649,2025-11-11
8690,36.240002,36.279999,35.779999,36.240002,0,2025-11-12
8691,36.369999,36.389999,36.130001,36.240002,83456,2025-11-12
8692,36.34,36.459999,36.23,36.360001,55272,2025-11-12
8693,36.369999,36.389999,36.23,36.349998,25211,2025-11-12
8694,36.529999,36.66,36.349998,36.349998,77557,2025-11-12
8695,36.470001,36.59,36.439999,36.52,55861,2025-11-12
8696,36.34,36.459999,36.150002,36.43,66320,2025-11-12
8697,36.09,36.380001,35.970001,36.349998,58665,2025-11-12
8698,36.029999,36.139999,36.029999,36.099998,37288,2025-11-12


In [66]:
data_old.isna().value_counts()

close  high   low    open   volume  date 
False  False  False  False  False   False    8708
dtype: int64

In [65]:
# Объединяем с новыми данными
data_fresh = pd.concat([data_old, data], ignore_index=True)
data_fresh.tail(20)

Unnamed: 0,close,high,low,open,volume,date
8688,35.860001,35.990002,35.720001,35.779999,67239,2025-11-11
8689,35.98,35.990002,35.830002,35.849998,27649,2025-11-11
8690,36.240002,36.279999,35.779999,36.240002,0,2025-11-12
8691,36.369999,36.389999,36.130001,36.240002,83456,2025-11-12
8692,36.34,36.459999,36.23,36.360001,55272,2025-11-12
8693,36.369999,36.389999,36.23,36.349998,25211,2025-11-12
8694,36.529999,36.66,36.349998,36.349998,77557,2025-11-12
8695,36.470001,36.59,36.439999,36.52,55861,2025-11-12
8696,36.34,36.459999,36.150002,36.43,66320,2025-11-12
8697,36.09,36.380001,35.970001,36.349998,58665,2025-11-12


In [67]:
# Save
data_fresh.to_csv('../data_archiv/DTG/DTG_fresh.csv', index=False)

#### START DATA PREP

In [68]:
data_mb = pd.read_csv('../data_archiv/DTG/DTG_fresh.csv')

In [70]:
data_mb[['open','high','low','close']] = data_mb[['open','high','low','close']].apply(pd.to_numeric, errors='coerce')
# И datetime — это pandas.Timestamp с часовыми барами
df = data_mb.copy()
df['datetime'] = pd.to_datetime(df['date'])
df = df.set_index('datetime')

# === Ресемплинг на дневную частоту ===
df_daily = pd.DataFrame()

df_daily['open']  = df['open'].resample('1D').first()
df_daily['high']  = df['high'].resample('1D').max()
df_daily['low']   = df['low'].resample('1D').min()
df_daily['close'] = df['close'].resample('1D').last()
df_daily['Volume']= df['volume'].resample('1D').sum()

df_daily['DayAvgPrice']= df[['open', 'high', 'low', 'close']].resample('1D').mean().mean(axis=1)

df_daily['IntradayStd'] = df.resample('1D').apply(
    lambda day: day[['open','high','low','close']].to_numpy(dtype=float).flatten().std(ddof=1)
).rename('IntradayStd').to_frame()

# df_daily['IntradayStd'] = df.resample('1D').apply(
#     lambda x: np.std(x[['open','high','low','close']].values.flatten(), ddof=1)
# ).to_frame(name='IntradayStd')

# === Убираем пустые дни (например, выходные) ===
df_daily = df_daily.dropna(subset=['open', 'high', 'low', 'close'])

# === Сбрасываем индекс, если нужно для модели ===
df_daily = df_daily.reset_index().rename(columns={'datetime': 'date'})

df_daily.tail()

Unnamed: 0,date,open,high,low,close,Volume,DayAvgPrice,IntradayStd
965,2025-11-07,35.57,36.040001,33.459999,34.610001,1068926,34.328333,0.542284
966,2025-11-10,35.169998,36.130001,34.98,35.48,827914,35.753889,0.282107
967,2025-11-11,35.560001,36.279999,35.529999,35.98,496435,35.886111,0.159271
968,2025-11-12,36.240002,36.66,35.779999,36.029999,459630,36.299444,0.184235
969,2025-11-13,36.189999,36.369999,35.73,35.810001,366233,35.998056,0.155321


In [None]:
# df_daily = fill_missing_dates_daily_2(df_daily, date_col='date') # 'DayAvgPrice' и 'IntradayStd'=0 в неторговые дни
# df_daily.tail(10)

In [None]:
# !!! Только в случае попадания выходных или праздников в конец датасета дополняем датасет вручную !!!

# last_index = df_daily.index.max()
# df_daily.loc[last_index + 1] = [df_daily['date'][last_index] + pd.DateOffset(days=1), df_daily['open'][last_index], df_daily['high'][last_index], df_daily['low'][last_index], df_daily['close'][last_index], df_daily['DayAvgPrice'][last_index], 0., 0,]
# df_daily.loc[last_index + 2] = [df_daily['date'][last_index] + pd.DateOffset(days=2), df_daily['open'][last_index], df_daily['high'][last_index], df_daily['low'][last_index], df_daily['close'][last_index], df_daily['DayAvgPrice'][last_index], 0., 0,]
# df_daily.tail()

### Генерация признаков

In [71]:
# Размер окна для поиска паттернов
window_size = 21

##### Старые признаки

In [72]:
# Стандартные
df_daily['day_of_week'] = df_daily['date'].apply(lambda x: x.day_of_week)
df_daily['day_of_year'] = df_daily['date'].apply(lambda x: x.day_of_year)
df_daily['Log_Profit'] = np.log(df_daily['DayAvgPrice'].shift(1) / df_daily['DayAvgPrice'])
df_daily['DayAvgPrice_diff'] = df_daily['DayAvgPrice'] - df_daily['DayAvgPrice'].shift(1)
df_daily['DayAvgPrice_2diff'] = df_daily['DayAvgPrice_diff'] - df_daily['DayAvgPrice_diff'].shift(1)

In [73]:
# Полиномиальные
df_daily['POLY_1'] = np.sqrt(df_daily['DayAvgPrice']) + df_daily['IntradayStd'] * 100.
df_daily['POLY_2'] = df_daily['DayAvgPrice'] + (df_daily['IntradayStd'] * 10.) ** 2
df_daily['POLY_3'] = df_daily['DayAvgPrice'] ** 2 / 1000. + df_daily['Volume'] / 100000.

In [74]:
# === Комплексные
df_daily['real_time'], df_daily['imag_time'] = spiral_time_indices(df_daily['date'], df_daily['day_of_week'])

# === Комплексные циклы ===
t = np.arange(len(df_daily))
df_daily['c_week_real'] = np.cos(2 * np.pi * t / 7)
df_daily['c_week_imag'] = np.sin(2 * np.pi * t / 7)

df_daily['c_month_real'] = np.cos(2 * np.pi * t / 30.44)
df_daily['c_month_imag'] = np.sin(2 * np.pi * t / 30.44)

df_daily['c_quarter_real'] = np.cos(2 * np.pi * t / 91.31)
df_daily['c_quarter_imag'] = np.sin(2 * np.pi * t / 91.31)

df_daily['c_year_real'] = np.cos(2 * np.pi * t / 365.25)
df_daily['c_year_imag'] = np.sin(2 * np.pi * t / 365.25)

##### Новые признаки

In [75]:
# добавляем фичи на волатильность Паркинсона
df_daily = add_parkinson_features(df_daily, high_col='high', low_col='low', window=20)

In [76]:
# Сглаживание DayAvgPrice: rolling (5, 10, 20) и EMA
df_daily['DayAvgPrice_roll5']  = df_daily['DayAvgPrice'].rolling(5,  min_periods=5).mean()
df_daily['DayAvgPrice_roll10'] = df_daily['DayAvgPrice'].rolling(10, min_periods=10).mean()
df_daily['DayAvgPrice_roll20'] = df_daily['DayAvgPrice'].rolling(20, min_periods=20).mean()

df_daily['DayAvgPrice_ema5']   = df_daily['DayAvgPrice'].ewm(span=5,  adjust=False, min_periods=5).mean()
df_daily['DayAvgPrice_ema20']  = df_daily['DayAvgPrice'].ewm(span=20, adjust=False, min_periods=20).mean()

In [77]:
# Сглаживание IntradayStd: rolling (5, 10, 20) и EMA
df_daily['IntradayStd_roll5']  = df_daily['IntradayStd'].rolling(5,  min_periods=5).mean()
df_daily['IntradayStd_roll10'] = df_daily['IntradayStd'].rolling(10, min_periods=10).mean()
df_daily['IntradayStd_roll20'] = df_daily['IntradayStd'].rolling(20, min_periods=20).mean()

df_daily['IntradayStd_ema5']   = df_daily['IntradayStd'].ewm(span=5,  adjust=False, min_periods=5).mean()
df_daily['IntradayStd_ema20']  = df_daily['IntradayStd'].ewm(span=20, adjust=False, min_periods=20).mean()

In [78]:
# Новые признаки

w = window_size

# Лаги/доходности
df_daily['log_ret_1'] = np.log(df_daily['DayAvgPrice']).diff()
for k in [1,2,3,4,5,6,7,10]:
    df_daily[f'DAP_{k}'] = df_daily['DayAvgPrice'].shift(k)
    df_daily[f'log_ret_{k}']  = df_daily['log_ret_1'].rolling(k).sum()  # k-step cum return

# Оконные агрегаты
df_daily['mean_w'] = df_daily['DayAvgPrice'].rolling(w, min_periods=w).mean()
df_daily['std_w']  = df_daily['DayAvgPrice'].rolling(w, min_periods=w).std()
df_daily['z_w']    = (df_daily['DayAvgPrice'] - df_daily['mean_w']) / (df_daily['std_w'] + 1e-9)
df_daily['q10_w']  = df_daily['DayAvgPrice'].rolling(w, min_periods=w).quantile(0.10)
df_daily['q90_w']  = df_daily['DayAvgPrice'].rolling(w, min_periods=w).quantile(0.90)

# Наклон тренда (скользящая линрегрессия через cov/var)
def rolling_slope(s, w):
    x = np.arange(len(s))
    mx = pd.Series(x).rolling(w, min_periods=w).mean().values
    my = s.rolling(w, min_periods=w).mean().values
    cov = (pd.Series(x)*s).rolling(w, min_periods=w).mean().values - mx*my
    var = pd.Series(x).rolling(w, min_periods=w).var().values + 1e-9
    return cov/var
df_daily['slope_w'] = rolling_slope(df_daily['DayAvgPrice'], w)

# Волатильность
# df_daily['parkinson_vol'] = (np.log(df_daily['high']/df_daily['low'])**2).rolling(w, min_periods=w).mean()
df_daily['vol_w'] = df_daily['log_ret_1'].rolling(w, min_periods=w).std()

# Объём
df_daily['vol_z'] = (df_daily['Volume'] - df_daily['Volume'].rolling(w).mean()) / (df_daily['Volume'].rolling(w).std() + 1e-9)

# Календарь (синусы – без утечек)
d = pd.to_datetime(df_daily['date'])
df_daily['dow_sin'] = np.sin(2*np.pi*d.dt.dayofweek/7)
df_daily['dow_cos'] = np.cos(2*np.pi*d.dt.dayofweek/7)
df_daily['moy_sin'] = np.sin(2*np.pi*(d.dt.month-1)/12)
df_daily['moy_cos'] = np.cos(2*np.pi*(d.dt.month-1)/12)

In [79]:
# Техиндикаторы с pandas_ta:
df_daily['rsi14'] = ta.rsi(df_daily['DayAvgPrice'], length=14)
macd = ta.macd(df_daily['DayAvgPrice'], fast=12, slow=26, signal=9)
df_daily[['macd','macd_signal','macd_hist']] = macd[['MACD_12_26_9','MACDs_12_26_9','MACDh_12_26_9']]
bb = ta.bbands(df_daily['DayAvgPrice'], length=20)
df_daily[['bb_low','bb_mid','bb_up']] = bb[['BBL_20_2.0','BBM_20_2.0','BBU_20_2.0']]
df_daily['atr14'] = ta.atr(df_daily['high'], df_daily['low'], df_daily['DayAvgPrice'], length=14)

In [80]:
# Hilbert-фаза/амплитуда (сигнал в комплексной форме)
from scipy.signal import hilbert
analytic = pd.Series(df_daily['DayAvgPrice']).rolling(w, min_periods=w).apply(
    lambda x: np.nan if x.isna().any() else np.angle(hilbert(x.values))[-1], raw=False
)
df_daily['phi_hilbert'] = analytic  # фаза конца окна
# при желании — дельта фазы:
df_daily['dphi_hilbert'] = df_daily['phi_hilbert'].diff()

In [81]:
# Wavelet/STFT энергия (устойчивые спектральные фичи)(эскиз на STFT; для продакшена лучше кэшировать)

prices = df_daily['DayAvgPrice'].values
window_stft = 128  # длина окна для rolling STFT (≈ полгода дневных данных, например)

energy_low = []
energy_mid = []
energy_high = []

for i in range(len(prices)):
    x = prices[:i+1]  # берём префикс до текущего момента
    e = stft_energy_tail(x, w=window_stft)
    energy_low.append(e[0])
    energy_mid.append(e[1])
    energy_high.append(e[2])

df_daily['stft_energy_low']  = energy_low
df_daily['stft_energy_mid']  = energy_mid
df_daily['stft_energy_high'] = energy_high

# Вариант для коротких df (len(df) >= 128)
df_daily[['stft_energy_low', 'stft_energy_mid', 'stft_energy_high']] = (
    df_daily[['stft_energy_low', 'stft_energy_mid', 'stft_energy_high']].fillna(0)
)

# !!!Вариант для длинных df (len(df) >> 128) - лучше сделать dropna после создания всех фич!!!
#df_daily = df_daily.dropna(subset=['stft_energy_low', 'stft_energy_mid', 'stft_energy_high'])

In [82]:
len(df_daily)

970

In [83]:
# advanced_features

# df — ваш OHLCV с datetime индексом (часовой/дневной — не важно)
df_daily = build_advanced_features(df_daily, price_col='DayAvgPrice', w_fast=14, w_slow=20)

# Дальше: добавляйте нужные колонки в real_cols и нормируйте по train-окну
new_cols = [
    'gk_sigma','rs_sigma','yz_sigma','adx','chop','kalm_slope',
    'rv20','vol_of_vol','bb_pct_b','bb_bandwidth',
    'ret_overnight','ret_intraday','corr_ret_dlogvol'
]

In [84]:
# Спектральные признаки(собственные значения TOEPLITZ и CORR матриц) + на весь df.dropna()!!!
df_daily = lambdas_C_T(df_daily, window_size)

In [None]:
# df_daily.isna().sum()

##### Создание целевой переменной Target

In [86]:
# Создаём Target, пустые последние значения заменяются предыдущим непустым ffill
time_shift = 1 # другие сдвиги реализуем в рабочем блоке
df_daily['Target'] = df_daily['DayAvgPrice'].shift( - time_shift).fillna(method='ffill')   # fffill
#df_daily['DayAvgPrice'] = df_daily['DayAvgPrice_bfill']                                   # b?fill
#df_daily['Target'] = df_daily['DayAvgPrice_bfill'].shift( - time_shift).fillna(0)
#df_daily = df_daily.drop(['DayAvgPrice_bfill'], axis=1)                                   # ?bfill
#df_daily[['date', 'DayAvgPrice', 'Target', 'DayAvgPrice_bfill']].tail(10)
df_daily[['date', 'DayAvgPrice', 'Target']].tail(10)

Unnamed: 0,date,DayAvgPrice,Target
922,2025-10-31,34.751945,34.606944
923,2025-11-03,34.606944,33.886111
924,2025-11-04,33.886111,34.891389
925,2025-11-05,34.891389,35.163611
926,2025-11-06,35.163611,34.328333
927,2025-11-07,34.328333,35.753889
928,2025-11-10,35.753889,35.886111
929,2025-11-11,35.886111,36.299444
930,2025-11-12,36.299444,35.998056
931,2025-11-13,35.998056,35.998056


In [87]:
# Сглаженный таргет (не меняем исходный Target, а создаём Target_smooth)
ema_fast: int = 5
ema_slow: Optional[int] = None
df_daily["Target_smooth"] = smooth_target_ema(df_daily['Target'], span_fast=ema_fast, span_slow=ema_slow)

In [88]:
# Добавляем количество дней между наблюдениями
df_daily['days_since_prev'] = df_daily['date'].diff().dt.days.fillna(1)

In [89]:
len(list(df_daily.columns))

107

In [90]:
df_daily.describe()

Unnamed: 0,open,high,low,close,Volume,DayAvgPrice,IntradayStd,day_of_week,day_of_year,Log_Profit,DayAvgPrice_diff,DayAvgPrice_2diff,POLY_1,POLY_2,POLY_3,real_time,imag_time,c_week_real,c_week_imag,c_month_real,c_month_imag,c_quarter_real,c_quarter_imag,c_year_real,c_year_imag,parkinson_vol,parkinson_vol_ma5,parkinson_vol_ma20,parkinson_vol_diff1,parkinson_vol_lag1,DayAvgPrice_roll5,DayAvgPrice_roll10,DayAvgPrice_roll20,DayAvgPrice_ema5,DayAvgPrice_ema20,IntradayStd_roll5,IntradayStd_roll10,IntradayStd_roll20,IntradayStd_ema5,IntradayStd_ema20,log_ret_1,DAP_1,DAP_2,log_ret_2,DAP_3,log_ret_3,DAP_4,log_ret_4,DAP_5,log_ret_5,DAP_6,log_ret_6,DAP_7,log_ret_7,DAP_10,log_ret_10,mean_w,std_w,z_w,q10_w,q90_w,slope_w,vol_w,vol_z,dow_sin,dow_cos,moy_sin,moy_cos,rsi14,macd,macd_signal,macd_hist,bb_low,bb_mid,bb_up,atr14,phi_hilbert,dphi_hilbert,stft_energy_low,stft_energy_mid,stft_energy_high,ret,ret_overnight,ret_intraday,gk_sigma,rs_sigma,yz_sigma,adx,di_plus,di_minus,chop,kalm_slope,rv20,vol_of_vol,bb_pct_b,bb_bandwidth,corr_ret_dlogvol,lambda_C3,lambda_C2,lambda_C1,lambda_T3,lambda_T2,lambda_T1,Target,Target_smooth,days_since_prev
count,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0,932.0
mean,33.662296,34.074045,33.249753,33.66573,1626297.0,33.667936,0.199217,2.01073,185.790773,-0.000424,0.012611,-5.7e-05,25.705225,39.020258,17.426012,0.986939,4.707082,-0.000967,0.0004655405,-0.009396,-0.002459,-0.018809,-0.00168,-0.088947,0.08511,0.016371,0.016411,0.016542,-2e-05,0.016391,33.642978,33.613724,33.555042,33.643414,33.568139,0.199251,0.199268,0.20031,0.199248,0.20013,0.000424,33.655326,33.642658,0.000846,33.630475,0.001254,33.618496,0.001656,33.607812,0.002024,33.595238,0.002459,33.584138,0.002836,33.550379,0.004006,33.550258,1.130109,0.044703,32.171729,34.925562,0.010797,0.017137,-0.021025,0.354926,-0.087022,-0.01419748,-0.08973326,51.678324,0.054979,0.046804,0.008175,31.414595,33.555042,35.69549,0.944181,0.002746,0.000312,90.115342,0.231197,0.113292,0.000422,0.000334,8.7e-05,0.016431,0.022729,0.020004,35.43649,24.694092,23.59896,47.702435,0.011684,0.01862,0.002164,0.511871,0.132962,-0.068028,0.003314,0.017377,0.979309,0.122445,0.339217,0.538338,33.680459,33.658637,1.429185
std,5.433288,5.479246,5.374738,5.437765,1126751.0,5.435745,0.117688,1.408846,99.323773,0.017977,0.619459,0.841541,11.872482,11.004661,11.23276,0.019283,1.321569,0.707722,0.7072496,0.706336,0.708569,0.712454,0.702229,0.701554,0.702655,0.005403,0.005426,0.005347,0.000714,0.005438,5.421464,5.410312,5.385909,5.406436,5.286285,0.077548,0.068785,0.063219,0.076724,0.061975,0.017977,5.44396,5.451532,0.026459,5.459195,0.032476,5.466775,0.037566,5.475109,0.042002,5.484981,0.046026,5.492817,0.049545,5.519772,0.059922,5.381132,0.569103,1.357297,5.225023,5.646875,0.155847,0.006122,1.000671,0.51423,0.776501,0.716959,0.6919403,14.037166,0.783897,0.75119,0.237845,5.153992,5.385909,5.812702,0.289409,0.075761,0.017167,38.584766,0.100155,0.049034,0.019517,0.01258,0.01684,0.005421,0.009943,0.0081,17.846903,11.060428,11.610968,12.099615,0.172151,0.006889,0.002188,0.334964,0.05903,0.266285,0.003096,0.013405,0.015905,0.07298,0.051604,0.08116,5.427641,5.393916,0.85358
min,22.95,23.335,22.475,22.7,0.0,22.799722,0.0,0.0,2.0,-0.146723,-2.993889,-6.231945,5.077893,24.870843,0.664866,0.953698,3.0,-0.900969,-0.9749279,-0.999991,-0.999998,-0.999997,-0.999997,-0.999998,-0.999999,0.008612,0.00921,0.009645,-0.005184,0.008612,23.355028,23.820159,24.220792,23.456731,24.458399,0.068789,0.085861,0.104822,0.073593,0.098615,-0.086465,22.799722,22.799722,-0.109183,22.799722,-0.123245,22.799722,-0.140571,22.799722,-0.139858,22.799722,-0.153691,22.799722,-0.158116,22.799722,-0.179765,24.339418,0.279353,-2.994168,22.90875,25.257083,-0.43113,0.007089,-2.282547,-0.433884,-0.900969,-1.0,-1.0,20.95221,-2.13622,-2.07368,-0.64026,20.464927,24.220792,25.777201,0.501537,-0.231769,-0.077337,0.0,0.0,0.0,-0.089562,-0.152844,-0.08796,0.009058,0.010089,0.009788,8.030653,1.187951,0.479616,-8.766098,-0.537462,0.007339,0.000125,-0.266926,0.036085,-0.700877,0.000263,0.001549,0.884734,0.007208,0.171565,0.35031,22.799722,23.456731,1.0
25%,29.8,30.1,29.43,29.71,931325.8,29.759375,0.124482,1.0,104.0,-0.01017,-0.287986,-0.430625,18.074679,32.263118,10.429959,0.970185,4.0,-0.900969,-0.7818315,-0.712192,-0.71292,-0.73877,-0.69521,-0.784601,-0.602557,0.012257,0.012294,0.012469,-0.000217,0.012257,29.685153,29.749865,29.722615,29.700519,29.823414,0.143168,0.14513,0.144255,0.142018,0.146084,-0.00881,29.749444,29.742778,-0.012831,29.727639,-0.015446,29.717083,-0.02067,29.685208,-0.022758,29.678264,-0.025589,29.665937,-0.029938,29.604931,-0.038547,29.75228,0.751654,-1.141638,28.631111,30.647639,-0.097873,0.012295,-0.694729,0.0,-0.900969,-0.8660254,-0.8660254,40.93559,-0.456392,-0.467999,-0.139774,27.987236,29.722615,31.125688,0.702414,-0.050796,-0.009626,68.47734,0.1816,0.088987,-0.009427,-0.003514,-0.009464,0.012328,0.015974,0.014452,20.98704,16.566616,14.349909,40.71195,-0.106367,0.013865,0.000982,0.221104,0.090415,-0.244802,0.00127,0.008421,0.972676,0.064716,0.30194,0.476763,29.766042,29.70807,1.0
50%,33.36,33.665,33.02,33.325,1316239.0,33.307361,0.171802,2.0,188.0,-0.000878,0.030764,0.026806,22.984981,36.261378,14.271063,0.986923,4.5,-0.222521,-1.151044e-14,-0.018576,-0.004128,-0.030788,-0.007053,-0.159513,0.151016,0.015196,0.01511,0.015141,-1e-05,0.015196,33.358139,33.333944,33.121924,33.360368,33.076241,0.189906,0.194883,0.196252,0.190054,0.197989,0.000878,33.305,33.298194,0.001243,33.292083,0.000725,33.285694,0.001715,33.277222,0.001344,33.268889,0.001815,33.250139,0.001916,33.216389,0.003221,33.096488,1.037796,0.112934,32.180833,34.075833,0.000641,0.016288,-0.282178,0.433884,-0.222521,1.224647e-16,-1.83697e-16,50.785421,0.02993,0.051792,0.006658,31.34739,33.121924,34.76109,0.936332,-0.001713,0.000401,88.229479,0.239,0.11695,0.000513,0.000958,0.000626,0.015117,0.021194,0.018714,33.219272,23.732241,23.873053,47.973767,0.00345,0.017037,0.001581,0.535947,0.124426,-0.060069,0.002464,0.013449,0.98422,0.106011,0.33615,0.554847,33.31125,33.366169,1.0
75%,37.562501,38.119999,37.2325,37.6225,1997846.0,37.699375,0.233215,3.0,269.25,0.00881,0.330833,0.417431,29.242951,42.878202,20.906853,1.00341,5.0,0.62349,0.7818315,0.696076,0.706377,0.694404,0.694466,0.596103,0.785599,0.018903,0.018972,0.019078,0.000192,0.018933,37.670778,37.699726,37.696271,37.708101,37.673627,0.233507,0.235981,0.232235,0.233777,0.230891,0.01017,37.699375,37.699375,0.016486,37.699375,0.019324,37.699375,0.024096,37.699375,0.027779,37.699375,0.03395,37.699375,0.036879,37.699375,0.046888,37.682462,1.350927,1.162325,36.421944,39.138056,0.125597,0.020563,0.381122,0.781831,0.62349,0.8660254,0.5,61.319489,0.565815,0.544994,0.173048,35.630188,37.696271,39.911182,1.082304,0.060865,0.010244,126.745048,0.308873,0.15201,0.01063,0.004958,0.009502,0.019111,0.026756,0.02279,44.609671,30.788865,32.284413,56.44285,0.12514,0.021755,0.002394,0.797153,0.161024,0.117066,0.004196,0.02264,0.989889,0.172166,0.371581,0.612071,37.699375,37.708101,1.0
max,47.41,47.64,46.87,47.64,7742448.0,47.315,0.971442,4.0,365.0,0.086465,5.962778,5.611945,103.330032,132.634413,79.331036,1.020248,7.0,1.0,0.9749279,1.0,0.999998,0.999988,1.0,0.999991,0.999995,0.03739,0.036736,0.034672,0.006287,0.03739,46.958333,46.875639,46.480736,46.815664,45.520425,0.617681,0.553631,0.43086,0.623092,0.459443,0.146723,47.315,47.315,0.156065,47.315,0.158446,47.315,0.170473,47.315,0.178261,47.315,0.180421,47.315,0.19308,47.315,0.208193,46.44086,4.146942,3.588842,45.68,47.115,0.60766,0.045895,3.870368,0.974928,1.0,1.0,1.0,93.924552,3.033822,2.802738,0.787132,45.410689,46.480736,50.282102,2.106821,0.21582,0.11607,139.584155,0.42385,0.204655,0.166402,0.109666,0.107444,0.037407,0.065589,0.054778,91.02493,71.447824,56.726908,85.211578,0.62861,0.049307,0.015854,1.404195,0.397732,0.488045,0.025001,0.090876,0.997991,0.31257,0.484139,0.669826,47.315,46.815664,5.0


In [91]:
df_daily.head()

Unnamed: 0,date,open,high,low,close,Volume,DayAvgPrice,IntradayStd,day_of_week,day_of_year,Log_Profit,DayAvgPrice_diff,DayAvgPrice_2diff,POLY_1,POLY_2,POLY_3,real_time,imag_time,c_week_real,c_week_imag,c_month_real,c_month_imag,c_quarter_real,c_quarter_imag,c_year_real,c_year_imag,parkinson_vol,parkinson_vol_ma5,parkinson_vol_ma20,parkinson_vol_diff1,parkinson_vol_lag1,DayAvgPrice_roll5,DayAvgPrice_roll10,DayAvgPrice_roll20,DayAvgPrice_ema5,DayAvgPrice_ema20,IntradayStd_roll5,IntradayStd_roll10,IntradayStd_roll20,IntradayStd_ema5,IntradayStd_ema20,log_ret_1,DAP_1,DAP_2,log_ret_2,DAP_3,log_ret_3,DAP_4,log_ret_4,DAP_5,log_ret_5,DAP_6,log_ret_6,DAP_7,log_ret_7,DAP_10,log_ret_10,mean_w,std_w,z_w,q10_w,q90_w,slope_w,vol_w,vol_z,dow_sin,dow_cos,moy_sin,moy_cos,rsi14,macd,macd_signal,macd_hist,bb_low,bb_mid,bb_up,atr14,phi_hilbert,dphi_hilbert,stft_energy_low,stft_energy_mid,stft_energy_high,ret,ret_overnight,ret_intraday,gk_sigma,rs_sigma,yz_sigma,adx,di_plus,di_minus,chop,kalm_slope,rv20,vol_of_vol,bb_pct_b,bb_bandwidth,corr_ret_dlogvol,lambda_C3,lambda_C2,lambda_C1,lambda_T3,lambda_T2,lambda_T1,Target,Target_smooth,days_since_prev
0,2022-03-23,24.5,25.02,23.815,24.01,2801649,24.327222,0.29575,2,82,-0.003386,0.082222,0.330417,34.507218,33.074001,28.608304,0.953698,4.5,-0.900969,0.4338837,0.01032,0.999947,-0.864444,0.502729,0.793844,0.608121,0.035833,0.035265,0.028623,0.000535,0.035298,24.437222,24.212694,24.681417,24.328609,25.359565,0.230573,0.294872,0.328787,0.245714,0.293269,0.003386,24.245,24.493194,-0.006799,24.531528,-0.008363,24.589167,-0.01071,24.371389,-0.001814,23.444167,0.036974,24.546111,-0.008957,24.202778,0.005129,25.016647,2.608575,-0.264292,22.90875,28.249167,-0.224987,0.04353,-0.133557,0.974928,-0.222521,0.866025,0.5,39.317772,-1.731032,-2.07368,0.342649,20.464927,24.681417,28.897907,1.381981,-0.225221,0.025716,0.0,0.0,0.0,-0.006849,0.013354,-0.020203,0.036117,0.054492,0.045344,55.157418,18.83842,24.025815,57.892966,-0.099165,0.045768,0.004629,0.427604,0.346201,-0.185959,0.001135,0.006319,0.992545,0.280706,0.3237,0.395594,25.668611,25.668611,1.0
1,2022-03-24,24.89,26.33,24.79,25.71,3962871,25.668611,0.32174,3,83,-0.053673,1.341389,1.259167,37.240414,36.02027,40.287588,0.953748,5.0,-0.900969,-0.4338837,-0.194837,0.980836,-0.896965,0.442102,0.783266,0.621687,0.036619,0.035636,0.029578,0.000786,0.035833,24.653111,24.415125,24.396187,24.775277,25.388998,0.247598,0.281665,0.337159,0.271056,0.29598,0.053673,24.327222,24.245,0.057058,24.493194,0.046874,24.531528,0.04531,24.589167,0.042963,24.371389,0.051859,23.444167,0.090647,23.644306,0.082147,24.728426,2.119223,0.443646,22.90875,27.011389,-0.125704,0.045812,0.300435,0.433884,-0.900969,0.866025,0.5,47.988322,-1.523342,-1.963613,0.44027,21.447917,24.396187,27.344458,1.426324,-0.184684,0.040537,0.0,0.0,0.0,0.06841,0.035996,0.032414,0.036931,0.056858,0.046866,51.498936,24.691654,15.743654,50.177115,-0.029243,0.049307,0.004367,0.731407,0.241921,-0.13821,0.001708,0.009562,0.98873,0.273773,0.357145,0.369082,25.734861,25.690694,1.0
2,2022-03-25,26.915,27.025,25.225,25.41,3796605,25.734861,0.428828,4,84,-0.002578,0.06625,-1.275139,47.955768,44.124218,38.628333,0.953798,7.0,-0.222521,-0.9749279,-0.391723,0.920083,-0.92524,0.379383,0.772456,0.635068,0.03739,0.036074,0.030618,0.000771,0.036619,24.893778,24.595125,24.270472,25.095138,25.421937,0.270339,0.27481,0.344724,0.323647,0.308632,0.002578,25.668611,24.327222,0.056251,24.245,0.059636,24.493194,0.049451,24.531528,0.047887,24.589167,0.045541,24.371389,0.054437,23.934861,0.072511,24.459934,1.502801,0.848368,22.90875,26.032361,-0.022851,0.045895,0.172308,-0.433884,-0.900969,0.866025,0.5,48.380618,-1.337978,-1.838486,0.500508,21.81723,24.270472,26.723715,1.453015,-0.068614,0.11607,0.0,0.0,0.0,-0.011737,0.045804,-0.057541,0.037285,0.051794,0.044117,48.858523,28.949336,9.633145,58.71449,0.027795,0.043961,0.003442,0.751516,0.197458,-0.037884,0.003672,0.019513,0.976815,0.263474,0.284839,0.451687,26.017778,25.799722,1.0
3,2022-03-28,25.79,26.43,25.57,26.34,2415723,26.017778,0.210701,0,87,-0.010934,0.282917,0.216667,26.170864,30.45727,24.834155,0.953946,3.0,0.62349,-0.7818315,-0.571978,0.820269,-0.949135,0.314868,0.761418,0.648262,0.036679,0.036364,0.031555,-0.000711,0.03739,25.198694,24.742292,24.220792,25.402685,25.478684,0.281537,0.269767,0.339301,0.285998,0.299306,0.010934,25.734861,25.668611,0.013511,24.327222,0.067184,24.245,0.07057,24.493194,0.060385,24.531528,0.058821,24.589167,0.056474,24.546111,0.058227,24.353677,1.284517,1.295507,22.90875,26.017778,0.047972,0.040491,-0.458201,0.0,1.0,0.866025,0.5,50.111127,-1.154933,-1.701775,0.546842,21.958797,24.220792,26.482786,1.410657,-0.016667,0.051947,0.0,0.0,0.0,0.035946,0.014844,0.021102,0.036282,0.050844,0.043172,46.270344,24.513816,9.877178,64.916895,0.079267,0.044223,0.002112,0.996097,0.18136,-0.025662,0.005125,0.027656,0.967219,0.20043,0.315417,0.484153,26.711944,26.103796,3.0
4,2022-03-29,26.71,27.71,25.66,27.040001,4744336,26.711944,0.568963,1,88,-0.026331,0.694167,0.41125,62.064668,59.083843,48.156888,0.953996,4.0,1.0,-1.469576e-15,-0.72795,0.68563,-0.968539,0.248863,0.750154,0.661263,0.03648,0.0366,0.03236,-0.000199,0.036679,25.692083,25.069069,24.254771,25.839105,25.596137,0.365196,0.300395,0.342451,0.38032,0.324987,0.026331,26.017778,25.734861,0.037264,25.668611,0.039842,24.327222,0.093515,24.245,0.0969,24.493194,0.086716,24.531528,0.085152,23.444167,0.130489,24.339418,1.254859,1.890672,22.90875,26.017778,0.1133,0.03987,0.55904,0.781831,0.62349,0.866025,0.5,54.17082,-0.942985,-1.550017,0.607032,21.867971,24.254771,26.64157,1.456324,0.024491,0.041158,0.0,0.0,0.0,0.026229,0.013949,0.012279,0.037314,0.051853,0.043829,44.750014,25.656718,9.844431,58.600822,0.136447,0.040681,0.002138,1.053098,0.207057,-0.028054,0.006298,0.029763,0.963939,0.168533,0.386454,0.445013,26.410694,26.206096,1.0


##### Записываем сформированный датасет

In [92]:
# Записываем сформированный датасет
df_daily.to_csv('../data_archiv/DTG/DTG_new_fea_to_13_11_2025_1d_w21_noweekend.csv', index=False)

In [95]:
# Проверка на утечку данных в будущее. 
# Сравниваются датафреймы, созданные в разные даты.
df_1 = df_daily.copy()

In [96]:
df_2 = pd.read_csv('../data_archiv/DTG/DTG_new_fea_to_29_10_2025_1d_w92.csv')

In [97]:
cols = [c for c in df_1.columns if c not in ('date',)]
df_1 = df_1[:len(df_2)]

In [98]:
for col in cols:
    diff = round((df_1[col] - df_2[col]).abs().max(), 6)
    print("max abs diff on tail:", col, diff)

max abs diff on tail: open 0.0
max abs diff on tail: high 0.0
max abs diff on tail: low 0.0
max abs diff on tail: close 0.0
max abs diff on tail: DayAvgPrice 0.0
max abs diff on tail: IntradayStd 0.0
max abs diff on tail: Volume 0.0
max abs diff on tail: day_of_week 0
max abs diff on tail: day_of_year 0
max abs diff on tail: Log_Profit 0.0
max abs diff on tail: DayAvgPrice_diff 0.0
max abs diff on tail: DayAvgPrice_2diff 0.0
max abs diff on tail: POLY_1 0.0
max abs diff on tail: POLY_2 0.0
max abs diff on tail: POLY_3 0.0
max abs diff on tail: real_time 0.0
max abs diff on tail: imag_time 0.0
max abs diff on tail: c_week_real 0.0
max abs diff on tail: c_week_imag 0.0
max abs diff on tail: c_month_real 0.0
max abs diff on tail: c_month_imag 0.0
max abs diff on tail: c_quarter_real 0.0
max abs diff on tail: c_quarter_imag 0.0
max abs diff on tail: c_year_real 0.0
max abs diff on tail: c_year_imag 0.0
max abs diff on tail: parkinson_vol 0.0
max abs diff on tail: parkinson_vol_ma5 0.0
max 