In [4]:
import pandas as pd
import os

# Build one wide table of daily close prices: one row per business day,
# one column per ticker, read from the per-ticker CSV files in `data_dir`,
# then save the result to 'df_prices.csv'.
data_dir = 'dataset'
start_date = '2008-01-01'
end_date = '2024-12-31'

# Business-day calendar (freq='B') shared by every ticker column.
date_index = pd.date_range(start=start_date, end=end_date, freq='B')
df_all = pd.DataFrame(index=date_index)

for filename in os.listdir(data_dir):
    if not filename.endswith('.csv'):
        continue

    # The column name is the file name with '_D1.csv' removed
    # (presumably files are named '<TICKER>_D1.csv' -- verify against the dataset).
    ticker = filename.replace('_D1.csv', '')
    file_path = os.path.join(data_dir, filename)

    # Keep only the close price, indexed by its timestamp and labelled with
    # the ticker. Chained instead of `inplace=True` so no frame is mutated.
    df = (
        pd.read_csv(file_path, parse_dates=['datetime'])[['datetime', 'close']]
        .set_index('datetime')
        .rename(columns={'close': ticker})
    )

    df = df.loc[(df.index >= start_date) & (df.index <= end_date)]

    # Left join keeps exactly the rows of the business-day calendar; dates
    # missing from this ticker's file become NaN in its column.
    df_all = df_all.join(df, how='left')

# Fill gaps: carry the last known price forward, then fill any leading gaps
# with the first available price. `.ffill()` / `.bfill()` replace the
# deprecated `fillna(method=...)` form, which emits a FutureWarning.
# NOTE(review): the back-fill copies later prices onto earlier dates, i.e. it
# introduces look-ahead for tickers whose data starts after `start_date` --
# confirm this is acceptable for the downstream analysis.
df_all = df_all.ffill().bfill()

df_all.index.name = 'Date'

print("Объединённый датафрейм:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
df_all.info()
print(df_all.head())

df_all.to_csv('df_prices.csv')
print("Сохранено в df_prices.csv")

  df_all = df_all.fillna(method='ffill').fillna(method='bfill')


Объединённый датафрейм:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4436 entries, 2008-01-01 to 2024-12-31
Freq: B
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   TRMK    4436 non-null   float64
 1   PLZL    4436 non-null   float64
 2   SBER    4436 non-null   float64
 3   ROSN    4436 non-null   float64
 4   TATN    4436 non-null   float64
 5   TTLK    4436 non-null   float64
 6   UTAR    4436 non-null   float64
 7   KMAZ    4436 non-null   float64
 8   LSRG    4436 non-null   float64
 9   MTSS    4436 non-null   float64
 10  GMKN    4436 non-null   float64
 11  DVEC    4436 non-null   float64
 12  IRAO    4436 non-null   float64
 13  GLTR    4436 non-null   float64
 14  IRKT    4436 non-null   float64
 15  MRKC    4436 non-null   float64
 16  AFKS    4436 non-null   float64
 17  LKOH    4436 non-null   float64
 18  RTKM    4436 non-null   float64
 19  MAGN    4436 non-null   float64
 20  GAZP    4436 non-null   

In [6]:
# Count NaNs per column of df_prices, then the grand total over all columns.
missing_values = df_prices.isna().sum()
total_missing = missing_values.sum()

print(f"\nПропущенные значения: всего {total_missing}")
# Show only the columns that actually contain gaps, largest counts first
# (at most five rows, via head()).
print(
    missing_values
    .loc[missing_values.gt(0)]
    .sort_values(ascending=False)
    .head()
)


🔧 Пропущенные значения: всего 0
Series([], dtype: int64)


In [7]:
# Per-column summary of df_prices: pick the four statistics of interest from
# describe(), flip so each column of df_prices becomes a row, round to 2 dp.
print("\nСтатистика по ценам закрытия:")
print(
    df_prices
    .describe()
    .loc[['mean', 'std', 'min', 'max']]
    .T
    .round(2)
)


📊 Статистика по ценам закрытия:
          mean      std     min       max
TRMK     94.63    48.43   13.97    275.70
PLZL   5230.69  4808.99  357.00  18592.00
SBER    151.99    83.78   14.16    387.60
ROSN    327.36   124.54   82.92    655.25
TATN    369.96   212.41   27.50    837.00
TTLK      0.30     0.26    0.04      1.61
UTAR     11.77     5.45    2.25     25.26
KMAZ     71.39    45.87   12.30    267.40
LSRG    732.34   271.79  108.00   2100.00
MTSS    256.90    46.39   93.16    380.00
GMKN    108.14    60.84   12.45    281.30
DVEC      1.13     0.76    0.15      5.40
IRAO      2.17     2.02    0.00      6.50
GLTR    116.71   222.58    0.22    844.95
IRKT     22.11    21.37    2.74    147.50
MRKC      0.49     0.29    0.19      1.47
AFKS     21.06     9.01    3.60     48.00
LKOH   3510.05  1876.27  740.30   8152.00
RTKM    112.25    57.33   52.03    334.99
MAGN     30.99    16.89    4.42     79.21
GAZP    175.94    54.69   86.60    389.82
NVTK    724.75   466.14   45.80   1993.00
S