In [1]:
import pandas as pd
import os

# Root folder of the project data (contains `macro/` and the per-sector folders).
# Set the ANDAN_PROJECT_PATH environment variable to run the notebook on another
# machine; the author's original local path is kept as the fallback.
PATH = os.environ.get('ANDAN_PROJECT_PATH', r"/Users/andrewnizov/Desktop/АнДан/Проект")

## GDP RU

In [2]:
def process_gdp_ru(path):
    """Expand the quarterly Russian GDP file into one row per calendar day.

    Reads ``macro/GDP_processed.csv`` under ``path``. Each input row must carry
    the columns ``year``, ``quater`` (spelled this way in the file),
    ``gdp in 2021 prices`` and ``gdp without seasonality in 2021 prices``.
    Both GDP figures are spread evenly over the days of the quarter.

    Parameters
    ----------
    path : str
        Project root folder that contains the ``macro`` directory.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``daily_gdp`` and ``daily_gdp_no_season``; one row per
        day of every quarter in the input, in input order. Empty input gives an
        empty frame with these three columns.

    Raises
    ------
    ValueError
        If a row holds a quarter outside 1..4.
    """
    df = pd.read_csv(os.path.join(path, r'macro/GDP_processed.csv'))

    daily_data = []
    for _, row in df.iterrows():
        year = int(row['year'])
        quarter = int(row['quater'])
        if quarter not in (1, 2, 3, 4):
            raise ValueError(f"quarter must be in 1..4, got {quarter!r} for year {year}")

        # Q1 starts on 1 January, Q2 on 1 April, Q3 on 1 July, Q4 on 1 October.
        start_date = f"{year}-{3 * (quarter - 1) + 1:02d}-01"
        end_date = pd.Period(start_date, freq='Q').end_time.normalize()
        dates = pd.date_range(start=start_date, end=end_date, freq='D')

        # Divide by the number of days actually emitted rather than a hand-made
        # leap-year table, so the daily values always add up to the quarterly
        # figure (a plain `year % 4` rule is wrong for years such as 1900 or 2100).
        num_days = len(dates)
        daily_gdp = row['gdp in 2021 prices'] / num_days
        daily_gdp_no_season = row['gdp without seasonality in 2021 prices'] / num_days

        daily_data.extend(
            {'date': date, 'daily_gdp': daily_gdp, 'daily_gdp_no_season': daily_gdp_no_season}
            for date in dates
        )

    # Naming the columns keeps the schema stable even when there are no rows.
    return pd.DataFrame(daily_data, columns=['date', 'daily_gdp', 'daily_gdp_no_season'])

## Macro RU

Функция универсальна: подходит и для monetary_agg_processed.csv, и для households_bm_processed.csv.

In [3]:
def process_macro_monthtly(path, file):
    """Expand a monthly macro file into one row per calendar day.

    Reads ``macro/<file>`` under ``path``. The file must have a ``date`` column
    in ``%Y-%m-%d`` format; every other column is treated as a value and is
    repeated unchanged for each day from ``date`` to the end of that month.

    Parameters
    ----------
    path : str
        Project root folder that contains the ``macro`` directory.
    file : str
        File name inside ``macro``.

    Returns
    -------
    pandas.DataFrame
        ``date`` followed by the value columns in file order. Empty input gives
        an empty frame with the same columns.
    """
    df = pd.read_csv(os.path.join(path, f'macro/{file}'))

    # Pick value columns by name, not by position, so the result is correct
    # wherever the `date` column sits in the file.
    value_cols = [col for col in df.columns if col != 'date']

    def get_last_date(date):
        '''
        date: TimeStamp

        Returns the last day of the month `date` belongs to.
        '''
        return (date + pd.DateOffset(months=1)).replace(day=1) - pd.Timedelta(days=1)

    daily_data = []
    for _, row in df.iterrows():
        start_date = pd.to_datetime(row['date'], format='%Y-%m-%d')
        values = {col: row[col] for col in value_cols}
        for date in pd.date_range(start=start_date, end=get_last_date(start_date), freq='D'):
            daily_data.append({'date': date, **values})

    return pd.DataFrame(daily_data, columns=['date', *value_cols])

In [4]:
# Preview the first rows returned for the monetary aggregates file.
process_macro_monthtly(PATH, 'monetary_agg_processed.csv').head()

Unnamed: 0,date,Денежный агрегат М0,Денежный агрегат М1,Денежный агрегат М2,Широкая денежная масса
0,2001-01-01,418.9,869.4,1150.6,1573.8
1,2001-01-02,418.9,869.4,1150.6,1573.8
2,2001-01-03,418.9,869.4,1150.6,1573.8
3,2001-01-04,418.9,869.4,1150.6,1573.8
4,2001-01-05,418.9,869.4,1150.6,1573.8


In [5]:
# Load the raw debt file to inspect its layout.
df = pd.read_csv(os.path.join(PATH, r'macro/nonfinancial_sector_households_debt_extended_processed.csv'))

In [6]:
# Show the first rows of the frame currently bound to `df`.
df.head()

Unnamed: 0,"Долг нефинансового сектора и домашних хозяйств, итого",Нефинансовый сектор,Внутренние заимствования,Кредиты,Долговые ценные бумаги в портфеле резидентов,Внешние заимствования,"Кредиты и займы, полученные от нерезидентов","Долговые ценные бумаги в портфеле нерезидентов, включая векселя",Домашние хозяйства и НКООДХ,Внутренние заимствования.1,Внешние заимствования.1,date
0,39267862,30555941,20979755,19340212,1639543,9576186,9450744,125442,8711921,8492165,219756,2013-01-01
1,41641441,32464355,21415828,19693260,1722568,11048527,10915324,133203,9177086,8905818,271268,2013-04-01
2,44286477,34347540,22429633,20657017,1772616,11917907,11790084,127823,9938937,9645301,293636,2013-07-01
3,46181943,35591204,23590436,21754043,1836393,12000768,11895100,105668,10590739,10284975,305764,2013-10-01
4,47752644,36600506,24100233,22196461,1903772,12500273,12380763,119510,11152138,10877194,274944,2014-01-01


In [7]:
def process_debt(path):
    """Expand the quarterly debt file into one row per calendar day.

    Reads ``macro/nonfinancial_sector_households_debt_extended_processed.csv``
    under ``path``. The file must have a ``date`` column; every other column is
    treated as a value and is repeated unchanged for each day from ``date`` to
    the last day of the month two months later (a full quarter when ``date`` is
    the first day of the quarter).

    Parameters
    ----------
    path : str
        Project root folder that contains the ``macro`` directory.

    Returns
    -------
    pandas.DataFrame
        ``date`` followed by the value columns in file order. Empty input gives
        an empty frame with the same columns.
    """
    df = pd.read_csv(os.path.join(path, r'macro/nonfinancial_sector_households_debt_extended_processed.csv'))

    # Pick value columns by name, not by position, so the result is correct
    # wherever the `date` column sits in the file.
    value_cols = [col for col in df.columns if col != 'date']

    def get_last_date(date):
        '''
        date: TimeStamp

        Returns the day before the first day of the month three months after `date`.
        '''
        return (date + pd.DateOffset(months=3)).replace(day=1) - pd.Timedelta(days=1)

    # Expand the frame to daily data
    daily_data = []
    for _, row in df.iterrows():
        start_date = pd.to_datetime(row['date'])
        values = {col: row[col] for col in value_cols}
        for date in pd.date_range(start=start_date, end=get_last_date(start_date), freq='D'):
            daily_data.append({'date': date, **values})

    return pd.DataFrame(daily_data, columns=['date', *value_cols])

In [8]:
# Preview the first two rows returned by process_debt.
process_debt(PATH).head(2)

Unnamed: 0,date,"Долг нефинансового сектора и домашних хозяйств, итого",Нефинансовый сектор,Внутренние заимствования,Кредиты,Долговые ценные бумаги в портфеле резидентов,Внешние заимствования,"Кредиты и займы, полученные от нерезидентов","Долговые ценные бумаги в портфеле нерезидентов, включая векселя",Домашние хозяйства и НКООДХ,Внутренние заимствования.1,Внешние заимствования.1
0,2013-01-01,39267862,30555941,20979755,19340212,1639543,9576186,9450744,125442,8711921,8492165,219756
1,2013-01-02,39267862,30555941,20979755,19340212,1639543,9576186,9450744,125442,8711921,8492165,219756


In [9]:
# imoex: the dates are loaded into the 'Unnamed: 0' column, so it has to be
# renamed with .rename({'Unnamed: 0': 'date'}, axis=1) before merging on 'date'.
pd.read_csv(os.path.join(PATH, r'macro/imoex.csv'))

Unnamed: 0.1,Unnamed: 0,Open,High,Low,Close,Volume
0,2014-01-06,1504.14,1504.33,1464.97,1466.15,1.690029e+10
1,2014-01-08,1465.99,1470.25,1460.41,1462.67,1.602738e+10
2,2014-01-09,1462.67,1470.65,1460.81,1465.14,2.005612e+10
3,2014-01-10,1465.74,1470.73,1454.75,1469.19,2.927489e+10
4,2014-01-13,1469.19,1482.45,1466.51,1477.56,2.959673e+10
...,...,...,...,...,...,...
2586,2024-04-29,3468.92,3480.47,3467.25,3478.08,3.173054e+10
2587,2024-04-30,3481.29,3486.30,3464.86,3469.83,2.546404e+10
2588,2024-05-02,3470.87,3477.42,3439.58,3442.83,5.677735e+10
2589,2024-05-03,3443.49,3450.43,3424.63,3441.77,4.971335e+10


In [10]:
# Interbank rates: same fix needed - rename 'Unnamed: 0' to 'date'
# with .rename({'Unnamed: 0': 'date'}, axis=1).
pd.read_csv(os.path.join(PATH, r'macro/data_interbank_rates.csv'))

Unnamed: 0.1,Unnamed: 0,Interbank credit market rates
0,2014-01-04,7.9700
1,2014-01-07,8.2300
2,2014-01-08,7.6100
3,2014-01-09,8.1200
4,2014-01-10,7.8800
...,...,...
2539,2024-11-04,15.6794
2540,2024-12-01,15.1452
2541,2024-12-02,15.9739
2542,2024-12-03,15.9884


In [11]:
# Precious metals: same fix needed - rename 'Unnamed: 0' to 'date'
# with .rename({'Unnamed: 0': 'date'}, axis=1).
pd.read_csv(os.path.join(PATH, r'macro/data_precious_metals.csv'))

Unnamed: 0.1,Unnamed: 0,gold_rates,silver_rates,platinum_rates,palladium_rates
0,2014-01-01,1261.58,20.63,1425.90,746.55
1,2014-01-02,1409.87,21.95,1564.26,798.53
2,2014-01-03,1544.66,24.87,1679.90,862.05
3,2014-01-04,1481.29,22.56,1627.81,889.46
4,2014-01-05,1483.88,22.20,1632.03,922.25
...,...,...,...,...,...
2543,2024-11-01,5848.46,66.46,2701.63,2830.97
2544,2024-11-04,7061.43,83.81,2943.14,3197.89
2545,2024-12-01,5785.30,65.55,2654.59,2845.84
2546,2024-12-03,6326.15,71.37,2671.83,3038.95


## US indicators

### GDP

In [12]:
def process_gdp_usa(path):
    """Spread quarterly US GDP evenly over the days of each quarter.

    Reads ``macro/GDP_USA.csv`` under ``path`` (columns ``date`` and ``value``).
    For every row a daily range is built from ``date`` up to the day before the
    first day of the month three months later, and ``value`` is divided by the
    number of days in that range.

    Returns a DataFrame with columns ``date`` and ``real_gdp_usa``, one row per
    day, in input order.
    """
    quarterly = pd.read_csv(os.path.join(path, r'macro/GDP_USA.csv'))

    def quarter_end(first_day):
        # First day of the month three months ahead, minus one day.
        following = first_day + pd.DateOffset(months=3)
        return following.replace(day=1) - pd.Timedelta(days=1)

    records = []
    for _, obs in quarterly.iterrows():
        first_day = pd.to_datetime(obs['date'])
        days = pd.date_range(start=first_day, end=quarter_end(first_day), freq='D')
        per_day = obs['value'] / len(days)
        records.extend({'date': day, 'real_gdp_usa': per_day} for day in days)

    return pd.DataFrame(records)

In [13]:
# Preview the last rows returned by process_gdp_usa.
process_gdp_usa(PATH).tail()

Unnamed: 0,date,real_gdp_usa
8121,2002-03-27,38.901311
8122,2002-03-28,38.901311
8123,2002-03-29,38.901311
8124,2002-03-30,38.901311
8125,2002-03-31,38.901311


### S&P500

In [14]:
# The only thing to do here is rename the columns and make sure the date is in the right format.
df = pd.read_csv(os.path.join(PATH, r'macro/S&P500.csv'))
df.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume'], dtype='object')

### EFFR

In [15]:
# Load the raw EFFR file to inspect its layout.
df = pd.read_csv(os.path.join(PATH, r'macro/EFFR.csv'))

In [16]:
# Show the first 20 rows of the frame currently bound to `df`.
df.head(20)

Unnamed: 0,DATE,FEDFUNDS
0,1954-07-01,0.8
1,1954-08-01,1.22
2,1954-09-01,1.07
3,1954-10-01,0.85
4,1954-11-01,0.83
5,1954-12-01,1.28
6,1955-01-01,1.39
7,1955-02-01,1.29
8,1955-03-01,1.35
9,1955-04-01,1.43


In [17]:
def process_effr(path):
    """Repeat each monthly federal funds rate for every day of its month.

    Reads ``macro/EFFR.csv`` under ``path`` (columns ``DATE`` in ``%Y-%m-%d``
    format and ``FEDFUNDS``). For every row a daily range is built from ``DATE``
    to the last day of that month and the rate is copied to each day.

    Returns a DataFrame with columns ``date`` and ``effr``, in input order.
    """
    monthly = pd.read_csv(os.path.join(path, r'macro/EFFR.csv'))

    records = []
    for _, obs in monthly.iterrows():
        month_start = pd.to_datetime(obs['DATE'], format = '%Y-%m-%d')
        # First day of the next month minus one day = last day of this month.
        next_month_start = (month_start + pd.DateOffset(months=1)).replace(day=1)
        month_end = next_month_start - pd.Timedelta(days=1)
        rate = obs['FEDFUNDS']
        records += [
            {'date': day, 'effr': rate}
            for day in pd.date_range(start=month_start, end=month_end, freq='D')
        ]

    return pd.DataFrame(records)

In [37]:
# Preview the first rows returned by process_effr.
process_effr(PATH).head()

Unnamed: 0,date,effr
0,1954-07-01,0.8
1,1954-07-02,0.8
2,1954-07-03,0.8
3,1954-07-04,0.8
4,1954-07-05,0.8


In [19]:
# USD/RUB: rename 'Unnamed: 0' to 'date' with .rename({'Unnamed: 0': 'date'}, axis=1).
pd.read_csv(os.path.join(PATH, r'macro/data_usdrub.csv'))

Unnamed: 0.1,Unnamed: 0,usdrub
0,2010-01-01,30.1851
1,2010-01-04,29.4956
2,2010-01-05,29.1537
3,2010-01-06,30.7400
4,2010-01-07,31.2554
...,...,...
3536,2024-11-01,89.3939
3537,2024-11-04,93.2198
3538,2024-12-01,88.7818
3539,2024-12-03,90.6252


## Котировки

In [20]:
# Columns needed from the *_RU ticker files: CLOSE, VOLUME, TRADEDATE.
df1 = pd.read_csv(os.path.join(PATH, r'renewable_energy/IRAO_RU.csv'))

In [21]:
# Columns needed from the *_USA ticker files: volume, close, date;
# only the calendar day has to be kept from `date`.
pd.read_csv(os.path.join(PATH, r'renewable_energy/ALTO_USA.csv')).tail()

Unnamed: 0.1,Unnamed: 0,date,open,close,volume
1254,1254,2019-05-15,1.11,1.17,144560
1255,1255,2019-05-14,1.115,1.1,258629
1256,1256,2019-05-13,1.23,1.11,403926
1257,1257,2019-05-10,1.26,1.24,394551
1258,1258,2019-05-09,1.25,1.25,445669


In [22]:
# Peek at one file name in the renewable_energy folder to check the naming pattern.
os.listdir(os.path.join(PATH, 'renewable_energy'))[0]

'UPRO_RU.csv'

In [38]:
def merge_stocks(path, path_total):
    """Build one wide daily table of sector quotes plus macro indicators.

    Every ``.csv`` file in ``path`` is read as a ticker file. The ticker name is
    the file name without its extension, and its suffix selects the layout:

    * ``...USA`` - columns ``volume``, ``close``, ``date`` are kept; ``date`` is
      truncated to the calendar day.
    * ``...RU``  - columns ``TRADEDATE``, ``VOLUME``, ``CLOSE`` are kept and
      ``TRADEDATE`` is renamed to ``date``.

    In both cases the value columns are cast to ``float32`` and renamed to
    ``volume_<ticker>`` / ``close_<ticker>``. A CSV with neither suffix is merged
    unchanged, as before. Ticker frames are outer-joined on ``date``, rows up to
    and including 2018 are dropped, and the macro series found under
    ``path_total`` are outer-joined one after another in a fixed order.

    Parameters
    ----------
    path : str
        Folder with the ticker CSV files of one sector.
    path_total : str
        Project root folder that contains the ``macro`` directory. All macro
        files, including precious metals, are read from here.

    Returns
    -------
    pandas.DataFrame
        The merged table, keyed by ``date``.

    Raises
    ------
    ValueError
        If ``path`` contains no CSV files.
    """
    def merge_dfs(left, right):
        # Outer join: a date present in only one of the sources is kept.
        return pd.merge(left, right, on='date', how='outer')

    def since_2019(frame):
        # Parse `date` and keep only the rows whose year is after 2018.
        frame = frame.assign(date=pd.to_datetime(frame['date']))
        return frame[frame['date'].dt.year > 2018]

    def read_macro_csv(file_name, date_in_unnamed=False):
        # Some macro files were saved with the dates as an unnamed first column,
        # which pandas loads as 'Unnamed: 0'.
        frame = pd.read_csv(os.path.join(path_total, 'macro', file_name))
        if date_in_unnamed:
            frame = frame.rename(columns={'Unnamed: 0': 'date'})
        return frame

    # --- Ticker files -------------------------------------------------------
    final_df = None
    # Sorted so the column order of the result does not depend on the order in
    # which the file system happens to list the directory.
    for file_name in sorted(os.listdir(path)):
        if not file_name.lower().endswith('.csv'):
            # Skip anything that is not a CSV (e.g. hidden system files).
            continue
        df = pd.read_csv(os.path.join(path, file_name))
        ticker_name = file_name[:-4]
        if ticker_name.endswith('USA'):
            # .copy() so the assignments below write to an independent frame.
            df = df[['volume', 'close', 'date']].copy()
            # Drop the time-of-day part, keeping only the calendar day.
            df['date'] = df['date'].apply(lambda x: pd.to_datetime(x).strftime('%Y-%m-%d'))
            df['date'] = pd.to_datetime(df['date'])
            df[['volume', 'close']] = df[['volume', 'close']].astype('float32')
            df = df.rename(columns={'volume': f'volume_{ticker_name}',
                                    'close': f'close_{ticker_name}'})
        elif ticker_name.endswith('RU'):
            df = df[['TRADEDATE', 'VOLUME', 'CLOSE']].copy()
            df['TRADEDATE'] = pd.to_datetime(df['TRADEDATE'])
            df[['VOLUME', 'CLOSE']] = df[['VOLUME', 'CLOSE']].astype('float32')
            df = df.rename(columns={'TRADEDATE': 'date',
                                    'VOLUME': f'volume_{ticker_name}',
                                    'CLOSE': f'close_{ticker_name}'})
        final_df = df if final_df is None else merge_dfs(final_df, df)

    if final_df is None:
        raise ValueError(f"no CSV files found in {path!r}")

    final_df = final_df[final_df['date'].dt.year > 2018]

    # --- Macro data ---------------------------------------------------------
    # Loaders are callables so each source is read only when it is merged and
    # can be released right after. The order fixes the column order of the result.
    macro_sources = [
        lambda: read_macro_csv('data_usdrub.csv', date_in_unnamed=True),        # USD/RUB rate
        lambda: process_effr(path_total),                                       # EFFR
        lambda: read_macro_csv('S&P500.csv')[['date', 'close', 'volume']],      # S&P500
        lambda: process_debt(path_total),                                       # debt
        lambda: process_gdp_usa(path_total),                                    # GDP USA
        lambda: process_macro_monthtly(path_total, 'monetary_agg_processed.csv'),   # money supply
        lambda: process_macro_monthtly(path_total, 'households_bm_processed.csv'),  # households bm
        lambda: process_gdp_ru(path_total),                                     # GDP RU
        lambda: read_macro_csv('imoex.csv', date_in_unnamed=True),              # IMOEX
        lambda: read_macro_csv('data_interbank_rates.csv', date_in_unnamed=True),   # interbank rates
        # Read from `path_total` like every other source (previously the global PATH).
        lambda: read_macro_csv('data_precious_metals.csv', date_in_unnamed=True),   # precious metals
    ]
    for load in macro_sources:
        final_df = merge_dfs(final_df, since_2019(load()))

    return final_df

In [39]:
# Build the merged table for the renewable_energy sector and write it to the working directory.
df = merge_stocks(os.path.join(PATH, 'renewable_energy'), PATH)
df.to_csv('renewable_energy.csv', index = False , encoding = 'utf-8')

In [40]:
# Build the merged table for the fintech sector and write it to the working directory.
df = merge_stocks(os.path.join(PATH, 'fintech'), PATH)
df.to_csv('fintech.csv', index = False , encoding = 'utf-8')

In [41]:
# Build the merged table for the industrial_goods sector and write it to the working directory.
df = merge_stocks(os.path.join(PATH, 'industrial_goods'), PATH)
df.to_csv('industrial_goods.csv', index = False , encoding = 'utf-8')

In [42]:
# Build the merged table for the healthcare_services sector and write it to the working directory.
df = merge_stocks(os.path.join(PATH, 'healthcare_services'), PATH)
df.to_csv('healthcare_services.csv', index = False , encoding = 'utf-8')