In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import metrics as mtr
import operations as op
from operations import neutralize
from operations import normalize
pd.options.display.expand_frame_repr = False

In [3]:
data_close = pd.read_csv('../data/Close.csv', index_col='date')
data_close.index = data_close.index.astype('datetime64')
data_close.drop(columns='Unnamed: 0', inplace=True)
data_close = data_close.astype('float64')

data_open = pd.read_csv('../data/Open.csv', index_col='date')
data_open.index = data_open.index.astype('datetime64')
data_open.drop(columns='Unnamed: 0', inplace=True)
data_open = data_open.astype('float64')

data_high = pd.read_csv('../data/High.csv', index_col='date')
data_high.index = data_high.index.astype('datetime64')
data_high.drop(columns='Unnamed: 0', inplace=True)
data_high = data_high.astype('float64')

data_low = pd.read_csv('../data/Low.csv', index_col='date')
data_low.index = data_low.index.astype('datetime64')
data_low.drop(columns='Unnamed: 0', inplace=True)
data_low = data_low.astype('float64')

data_volume = pd.read_csv('../data/Volume.csv').T.iloc[1:].reset_index()
data_volume.index = data_volume['index'].astype('datetime64')
data_volume.drop(data_volume.columns [0], axis=1, inplace=True)
data_volume = data_volume.astype('float64')
data_volume.columns = data_volume.columns.astype(str)

data_returns = pd.read_csv('../data/returns.csv', index_col='date')
data_returns.index = data_returns.index.astype('datetime64')
data_returns = data_returns.astype('float64')

ts_rank_close_20 = pd.read_csv('../data/ts_rank_volume_20.csv', index_col='date')
ts_rank_close_20.index = ts_rank_close_20.index.astype('datetime64')
ts_rank_close_20 = ts_rank_close_20.astype('float64')

ts_rank_volume_20 = pd.read_csv('../data/ts_rank_volume_20.csv', index_col='date')
ts_rank_volume_20.index = ts_rank_volume_20.index.astype('datetime64')
ts_rank_volume_20 = ts_rank_volume_20.astype('float64')

ts_rank_returns_20 = pd.read_csv('../data/ts_rank_returns_20.csv', index_col='date')
ts_rank_returns_20.index = ts_rank_returns_20.index.astype('datetime64')
ts_rank_returns_20 = ts_rank_returns_20.astype('float64')

In [4]:
def alpha1(n=1, data_close=0, data_open=0, data_high=0, data_low=0, data_volume=0):
    kernel = (data_close - data_close.shift(n)) / (data_close.shift(n) + data_close)
    alpha = -kernel
    return normalize(neutralize(alpha)).fillna(0)

In [5]:
def alpha2(n=1, data_close=0, data_open=0, data_high=0, data_low=0, data_volume=0):
    kernel = (data_high - data_low) * data_close  
    alpha = kernel / data_close.shift(n)
    return normalize(neutralize(alpha)).fillna(0)

In [6]:
def alpha31(n=1, data_close=0, data_open=0, data_high=0, data_low=0, data_volume=0):
    kernel = data_high + data_low - 2 * data_close
    alpha = kernel / data_open.shift(n)
    return normalize(neutralize(alpha)).fillna(0)

In [7]:
def alpha32(n=1, data_close=0, data_open=0, data_high=0, data_low=0, data_volume=0):
    kernel = (data_high + data_low < 2 * data_close.shift(n)).replace({True: 1, False: -1})
    alpha = kernel
    return normalize(neutralize(alpha)).fillna(0)

In [8]:
def alpha33(n=1, data_close=0, data_open=0, data_high=0, data_low=0, data_volume=0):
    kernel = ((data_high * data_low) / data_close.shift(n) ** 2).copy()
    alpha = kernel
    return normalize(neutralize(alpha)).fillna(0)

In [9]:
def alpha41(n=1, data_close=0, data_open=0, data_high=0, data_low=0, data_volume=0):
    kernel = data_open / data_close.shift(n)
    alpha = kernel
    return normalize(neutralize(alpha)).fillna(0)

In [10]:
def alpha42(n=1, data_close=0, data_open=0, data_high=0, data_low=0, data_volume=0):
    kernel = (data_high - data_low) / data_close.shift(n)
    alpha = kernel
    return normalize(neutralize(alpha)).fillna(0)

In [11]:
def alpha51(n=1, data_close=0, data_open=0, data_high=0, data_low=0, data_volume=0):
    kernel = (data_close - data_low.shift(n)) / (data_high - data_low.shift(n))
    alpha = kernel
    return normalize(neutralize(alpha)).fillna(0)

In [12]:
def alpha52(n=1, data_close=0, data_open=0, data_high=0, data_low=0, data_volume=0):
    kernel = ((data_close - data_low.shift(n)) / (data_high - data_low.shift(n)) < 0.5).replace({True: 1, False: -1})
    alpha = kernel
    return normalize(neutralize(alpha)).fillna(0)

In [13]:
def alpha61(n=1, data_close=0, data_open=0, data_high=0, data_low=0, data_volume=0):
    kernel = (data_low - data_high)
    alpha = (data_close.shift(n) - data_open) / (kernel + 0.001)
    return normalize(neutralize(alpha)).fillna(0)

In [13]:
def generate_alphas(data_returns, algh_alpha, max_n=1, data_close=0, data_open=0, data_high=0, data_low=0, data_volume=0):
    pnls_matrix = pd.DataFrame()
    pnls_matrix[0] = mtr.get_pnl(data_returns,  (alpha61(2, data_close, data_open, data_high, data_low, data_volume)), 1).fillna(0)
    alpha_number = 1        # будем запоминать номер альфы, которая есть в pnls_matrix (матрице корреляций) 
    pnl_init = 0
    for n in tqdm(range(1, max_n + 1)):
        alpha_base = algh_alpha(n, data_close, data_open, data_high, data_low, data_volume)     # alpha_base - альфа без усиляющих множителей, "просто ядро"
        days_pnl = mtr.get_pnl(data_returns, alpha_base, 1).fillna(0)['pnl']
        if abs(days_pnl.sum()) > 0.4:        # рассматриваем только те альфы, у которых суммарный за все время pnl > 0.4
            if stats.pearsonr(pnls_matrix[alpha_number - 1], days_pnl)[0] < 0.6:
                if type(pnl_init) is type(0):       # если тип данных pnl_init == int, то запоминаем первую альфу, у которой pnl > 0.4
                    pnl_init = days_pnl
                    pnls_matrix[alpha_number] = days_pnl
                    alpha_number += 1
                    print('alpha_base', 'n =', n, alpha_number)
                    print('\n')
            else:       # если есть альфа, у которой pnl > 0.5, то считаем корреляцию между новой альфой и первой
                if stats.pearsonr(pnls_matrix[alpha_number - 1], days_pnl)[0] < 0.6:        #если нашлась альфа с pnl > 0.4 и corr < 0.6 (между нынешней и последней добавленной), то запоминаем
                    pnls_matrix[alpha_number] = days_pnl
                    alpha_number += 1
                    print('alpha_base', 'n =', n, alpha_number)
                    print('\n')

        days_pnl = mtr.get_pnl(data_returns, op.get_alpha_mul_ts_tank_data(alpha_base, ts_rank_volume_20), 1).fillna(0)['pnl']
        if abs(days_pnl.sum()) > 0.4:
            if stats.pearsonr(pnls_matrix[alpha_number - 1], days_pnl)[0] < 0.6:
                if type(pnl_init) is type(0):
                    pnl_init = days_pnl
                    pnls_matrix[alpha_number] = days_pnl
                    alpha_number += 1
                    print('alpha_ts_rank_volume', 'n =', n)
                    print('\n')
            else:
                if stats.pearsonr(pnls_matrix[alpha_number - 1], days_pnl)[0] < 0.6:
                    pnls_matrix[alpha_number] = days_pnl
                    alpha_number += 1
                    print('alpha_ts_rank_volume', 'n =', n, alpha_number)
                    print('\n')

        days_pnl = mtr.get_pnl(data_returns, op.get_alpha_mul_ts_tank_data(alpha_base, ts_rank_close_20), 1).fillna(0)['pnl']
        if abs(days_pnl.sum()) > 0.4:
            if stats.pearsonr(pnls_matrix[alpha_number - 1], days_pnl)[0] < 0.6:
                if type(pnl_init) is type(0):
                    pnl_init = days_pnl
                    pnls_matrix[alpha_number] = days_pnl
                    alpha_number += 1
                    print('alpha_ts_rank_close', 'n =', n)
                    print('\n')
            else:
                if stats.pearsonr(pnls_matrix[alpha_number - 1], days_pnl)[0] < 0.6:
                    pnls_matrix[alpha_number] = days_pnl
                    alpha_number += 1
                    print('alpha_ts_rank_close', 'n =', n, alpha_number)
                    print('\n')

        days_pnl = mtr.get_pnl(data_returns, op.get_alpha_mul_ts_tank_data(alpha_base, ts_rank_returns_20), 1).fillna(0)['pnl']
        if abs(days_pnl.sum()) > 0.4:
            if stats.pearsonr(pnls_matrix[alpha_number - 1], days_pnl)[0] < 0.6:
                if type(pnl_init) is type(0):
                    pnl_init = days_pnl
                    pnls_matrix[alpha_number] = days_pnl
                    alpha_number += 1
                    print('alpha_ts_rank_returns', 'n =', n)
                    print('\n')
            else:
                if stats.pearsonr(pnls_matrix[alpha_number - 1], days_pnl)[0] < 0.6:
                    pnls_matrix[alpha_number] = days_pnl
                    alpha_number += 1
                    print('alpha_ts_rank_returns', 'n =', n, alpha_number)
                    print('\n')


        days_pnl = mtr.get_pnl(data_returns, op.ranking(alpha_base), 1).fillna(0)['pnl']
        if abs(days_pnl.sum()) > 0.4:
            if stats.pearsonr(pnls_matrix[alpha_number - 1], days_pnl)[0] < 0.6:
                if type(pnl_init) is type(0):
                    pnl_init = days_pnl
                    pnls_matrix[alpha_number] = days_pnl
                    alpha_number += 1
                    print('alpha_rank', 'n =', n)
                    print('\n')
            else:
                if stats.pearsonr(pnls_matrix[alpha_number - 1], days_pnl)[0] < 0.6:
                    pnls_matrix[alpha_number] = days_pnl
                    alpha_number += 1
                    print('alpha_rank', 'n =', n, alpha_number)
                    print('\n')
        
        for j in range(1, 11):
            days_pnl = mtr.get_pnl(data_returns, op.get_decay_alpha(alpha_base, j + 1, j), 1).fillna(0)['pnl']
            if abs(days_pnl.sum()) > 0.4:
                if stats.pearsonr(pnls_matrix[alpha_number - 1], days_pnl)[0] < 0.6:
                    if type(pnl_init) is type(0):
                        pnl_init = days_pnl
                        pnls_matrix[alpha_number] = days_pnl
                        alpha_number += 1
                        print('alpha_dec', 'n =', n,  alpha_number, 'd =', j + 1, 'k =', j)
                        print('\n')
                else:
                    if stats.pearsonr(pnls_matrix[alpha_number - 1], days_pnl)[0] < 0.6:
                        pnls_matrix[alpha_number] = days_pnl
                        alpha_number += 1
                        print('alpha_dec', 'n =', n,  alpha_number, 'd =', j + 1, 'k =', j)
                        print('\n')

    return pnls_matrix

In [14]:
a61 = generate_alphas(data_returns, alpha61, max_n=20, data_close=data_close, data_open=data_open, data_high=data_high, data_low=data_low, data_volume=data_volume)

  5%|▌         | 1/20 [02:13<42:14, 133.39s/it]

alpha_dec n = 2 2 d = 8 k = 7




100%|██████████| 20/20 [42:22<00:00, 127.10s/it]


In [15]:
a2 = generate_alphas(data_returns, alpha2, max_n=20, data_close=data_close, data_open=data_open, data_high=data_high, data_low=data_low, data_volume=data_volume)

  0%|          | 0/20 [00:00<?, ?it/s]

alpha_base n = 1 2




100%|██████████| 20/20 [44:37<00:00, 133.89s/it]


In [16]:
a31 = generate_alphas(data_returns, alpha31, max_n=20, data_close=data_close, data_open=data_open, data_high=data_high, data_low=data_low, data_volume=data_volume)

  0%|          | 0/20 [00:00<?, ?it/s]

alpha_base n = 1 2




100%|██████████| 20/20 [44:11<00:00, 132.58s/it]


In [17]:
a32 = generate_alphas(data_returns, alpha32, max_n=20, data_close=data_close, data_open=data_open, data_high=data_high, data_low=data_low, data_volume=data_volume)

100%|██████████| 20/20 [43:13<00:00, 129.70s/it]


In [18]:
a33 = generate_alphas(data_returns, alpha33, max_n=20, data_close=data_close, data_open=data_open, data_high=data_high, data_low=data_low, data_volume=data_volume)

 10%|█         | 2/20 [04:14<38:05, 126.97s/it]

alpha_ts_rank_volume n = 3




100%|██████████| 20/20 [42:31<00:00, 127.58s/it]


In [19]:
a41 = generate_alphas(data_returns, alpha41, max_n=20, data_close=data_close, data_open=data_open, data_high=data_high, data_low=data_low, data_volume=data_volume)

 10%|█         | 2/20 [04:14<38:07, 127.11s/it]

alpha_ts_rank_volume n = 3




100%|██████████| 20/20 [31:23<00:00, 94.18s/it]


In [20]:
a42 = generate_alphas(data_returns, alpha42, max_n=20, data_close=data_close, data_open=data_open, data_high=data_high, data_low=data_low, data_volume=data_volume)

100%|██████████| 20/20 [25:41<00:00, 77.06s/it]


In [21]:
a51 = generate_alphas(data_returns, alpha51, max_n=20, data_close=data_close, data_open=data_open, data_high=data_high, data_low=data_low, data_volume=data_volume)

100%|██████████| 20/20 [29:18<00:00, 87.90s/it]


In [22]:
a52 = generate_alphas(data_returns, alpha52, max_n=20, data_close=data_close, data_open=data_open, data_high=data_high, data_low=data_low, data_volume=data_volume)

  0%|          | 0/20 [00:00<?, ?it/s]

alpha_dec n = 1 2 d = 6 k = 5




100%|██████████| 20/20 [26:24<00:00, 79.21s/it]


In [23]:
a61 = generate_alphas(data_returns, alpha61, max_n=20, data_close=data_close, data_open=data_open, data_high=data_high, data_low=data_low, data_volume=data_volume)

  5%|▌         | 1/20 [01:14<23:38, 74.68s/it]

alpha_dec n = 2 2 d = 8 k = 7




100%|██████████| 20/20 [25:02<00:00, 75.13s/it]
