In [1]:
import csv
import numpy as np
import pandas as pd

In [2]:
HIGH_KEY = ' High'
LOW_KEY = ' Low'
CLOSE_KEY =  ' Close/Last'
DATE_KEY = 'Date' 

**Exponential Moving Average**

$EMA = price(today) * a + EMA(yesterday) * (1-a)$

$a = \frac{2}{N+1}$

first EMA equals SMA of time period N

In [3]:
def EMA(n, p):
  p_len = len(p)
  a = 2 / (n + 1)

  emas = []
  emas.append(np.average(p[p_len-n:p_len])) # SMA as first value
              
  for i in range(n, p_len):
    ema = a*p[p_len-i-1] + (1-a)*emas[0]
    emas.insert(0, ema)

  return emas



**RoR**

$RoR = \ln{\frac{p_t}{p_{t-n}}}$

In [4]:
def RoR (pt, pt_):
  return np.log(pt/pt_)

**Gradient a of the price trend**

$a= \frac{\sum_ {t=1}^n (t-\bar{t}) (p_{t}-\bar{p})}{\sum_ {t=1}^n (t-\bar{t})^2}$

where is the index of the time instant ,
are the corresponding stock prices and are average values of t and p, respectively

In [5]:
def grad_price_trend(p):
  n = len(p)
  n_avg = np.average(range(1, n+1))
  p_avg = np.average(p)

  numerator = 0
  denominator = 0
  for i in range(1, n+1):
    numerator += (i - n_avg)*(p[p.index[0] + i-1] - p_avg)
    denominator += (i - n_avg)**2

  return numerator/denominator

**Relative Strength Index**

$RSI = 100 - \frac{100}{1 + RS}$

$RS = \frac{EMA(U)}{EMA(D)}$

In [6]:
def RSI(n, p):
  U = [] # gain
  D = [] # loss
  for i in range(len(p) - 1):
    if p[i] == p[i+1]:
      U.append(0); D.append(0)
    else:
      if p[i] > p[i+1]:
        U.append (p[i] - p[i+1]); D.append(0)
      else:
        U.append(0); D.append(p[i+1] - p[i])


  if not D:
    return 100;
  if not U:
    return 0;

  U_ema = EMA(n,U)
  D_ema = EMA(n,D)

  RS = np.divide(U_ema,D_ema)

  return 100 - 100/(1 + RS)

**MACD**

MACD = EMA for 12 days - EMA for 26 days

In [7]:
def MACD(p):
  ema26 = EMA(26, p)
  ema12 = EMA(12, p)[0:len(ema26)] 

  return np.subtract(ema12,ema26)


**Commodity Channel Index**

$CCI = \frac{Typical Price - MA}{0.015 * Mean Deviation}$

$Typical Price = \sum_{i=1}^N (High + Low + Close)/3$

N - number of periods (20)
$MA = Moving Average = (\sum_{i=1}^N Typical Price)/N$

$Mean Deviation = (\sum_{i=1}^N |Typical Price - MA|)/N$

In [8]:
def CCI(p, h, l):
  typical_price = np.add(np.add(p, h), l)/3
  MA = np.average(typical_price)
  mean_deviation = np.average(np.abs(typical_price - MA))
  result = (typical_price[typical_price.index[0]] - MA)/(0.015 * mean_deviation)
  return result

**Calculate all features**

In [9]:
def calculate_features(d):
  d.add
  
  features = []
  for i in range(17):
    features.append([])
  print(features)

  # Relative Strength Index for 14 days
  features[14] = RSI(14, d.get(CLOSE_KEY))[0:len(d.get(CLOSE_KEY)) - 26]
  features[15] = MACD(d.get(CLOSE_KEY))[0:len(d.get(CLOSE_KEY)) - 26]

  for i in range(len(d.get(CLOSE_KEY)) - 26):
    features[1].append(RoR (d.get(CLOSE_KEY)[i], d.get(CLOSE_KEY)[i+1]))
    features[2].append(RoR (d.get(CLOSE_KEY)[i+1], d.get(CLOSE_KEY)[i+2]))
    features[3].append(RoR (d.get(CLOSE_KEY)[i+2], d.get(CLOSE_KEY)[i+3]))
    features[4].append(RoR (d.get(CLOSE_KEY)[i+3], d.get(CLOSE_KEY)[i+4]))

    features[5].append(RoR (d.get(CLOSE_KEY)[i], d.get(CLOSE_KEY)[i+2]))
    features[6].append(RoR (d.get(CLOSE_KEY)[i+1], d.get(CLOSE_KEY)[i+3]))

    # gradient of 5-day price trend
    features[7].append(grad_price_trend(d.get(CLOSE_KEY).take(range(i,i+5))))
    features[8].append(grad_price_trend(d.get(CLOSE_KEY).take(range(i+5,i+10))))
    # gradient of 10-day price trend
    features[9].append(grad_price_trend(d.get(CLOSE_KEY).take(range(i,i+10))))

    features[10].append(features[1][i] - features[2][i])
    features[11].append(features[1][i] - features[3][i])

    # f12 = RSI[i]
    # f13 = MACD[i]

    features[14].append(d.get(CLOSE_KEY)[i] - np.average(d.get(CLOSE_KEY).take(range(i+1,i+13))))

    # 14-day rate of change 
    features[15].append((d.get(CLOSE_KEY)[i] - d.get(CLOSE_KEY)[i+14])/d.get(CLOSE_KEY)[i+14])

    features[16].append(CCI(d.get(CLOSE_KEY).take(range(i,i+20)), d.get(HIGH_KEY).take(range(i,i+20)), d.get(LOW_KEY).take(range(i,i+20))))

  result = {
      DATE_KEY: d.get(DATE_KEY).take(range(0,len(d.get(CLOSE_KEY)) - 26)),
      DATE_KEY: d.get(DATE_KEY).take(range(0,len(d.get(CLOSE_KEY)) - 26)),
      CLOSE_KEY: d.get(CLOSE_KEY).take(range(0,len(d.get(CLOSE_KEY)) - 26)),
      HIGH_KEY: d.get(HIGH_KEY).take(range(0,len(d.get(CLOSE_KEY)) - 26)),
      LOW_KEY: d.get(LOW_KEY).take(range(0,len(d.get(CLOSE_KEY)) - 26))
  }

  for i in range(1,17):
    result['f'+str(i)] = features[i]
  print(type(result))
  print(pd.DataFrame(data=result))
  return pd.DataFrame(data=result)
    


In [191]:
def read_and_extract_features(input, output):
  df = pd.read_csv (input)
  df[CLOSE_KEY] = df[CLOSE_KEY].copy().apply(lambda x: float(x[2:]))
  df[HIGH_KEY] = df[HIGH_KEY].copy().apply(lambda x: float(x[2:]))
  df[LOW_KEY] = df[LOW_KEY].copy().apply(lambda x: float(x[2:]))
  print(df)

  features = calculate_features(df)
  features.to_csv(output)

read_and_extract_features('HistoricalQuotes.csv', 'features-BIDU-5y.csv')

            Date   Close/Last   Volume      Open      High       Low
0     12/11/2020       161.24  6179815   $158.03  163.4000  157.6800
1     12/10/2020       158.32  5265865   $154.12  158.8000  151.7300
2     12/09/2020       151.59  6953750      $148  156.5631  147.0100
3     12/08/2020       144.99  3886697   $146.61  147.0700  143.5000
4     12/07/2020       144.30  2302886   $143.73  145.2000  142.5400
...          ...          ...      ...       ...       ...       ...
1254  12/18/2015       191.92  3482011   $195.54  196.3753  190.8800
1255  12/17/2015       196.94  2744170   $200.72  201.2610  196.8200
1256  12/16/2015       199.34  3274416   $201.07  201.3000  197.1111
1257  12/15/2015       199.13  3640095      $196  200.8900  195.2400
1258  12/14/2015       193.89  3226387   $193.89  196.0900  193.3200

[1259 rows x 6 columns]
[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]


  


<class 'dict'>
            Date   Close/Last      High  ...        f16       f17         f18
0     12/11/2020       161.24  163.4000  ...  17.886667  0.184630  223.835410
1     12/10/2020       158.32  158.8000  ...  17.073333  0.160023  208.812402
2     12/09/2020       151.59  156.5631  ...  11.633333  0.067009  172.894826
3     12/08/2020       144.99  147.0700  ...   5.742500  0.007435   76.016080
4     12/07/2020       144.30  145.2000  ...   5.238333 -0.023945   54.284318
...          ...          ...       ...  ...        ...       ...         ...
1228  01/28/2016       156.94  162.7800  ... -10.569167 -0.099960 -126.986665
1229  01/27/2016       158.27  167.9100  ... -10.328333 -0.147298 -105.018416
1230  01/26/2016       167.08  168.3600  ...  -2.125833 -0.108288  -74.144197
1231  01/25/2016       167.54  173.7400  ...  -3.171667 -0.089605  -55.403515
1232  01/22/2016       171.17  174.7700  ...  -0.891667 -0.094530  -47.575141

[1233 rows x 22 columns]
