# RTSI Index EDA

## TOC

## Imports

In [1]:
import pandas as pd
import numpy as np
from pandas_datareader.data import DataReader
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from pylab import rcParams
import time
# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn below
plt.style.use('fivethirtyeight')

ModuleNotFoundError: No module named 'pandas_datareader'

## Read RTSI Index data

In [None]:
def read_csv_rtsi(url="https://raw.githubusercontent.com/DmitryPukhov/stock-predictor/master/data/RI.RTSI_180101_180313.csv"):
    '''Load RTSI OHLCV bars from a CSV export.

    Moex from datareader doesn't provide intraday data, so the bars are read
    from a downloaded csv instead.

    Parameters
    ----------
    url : str, path or file-like, optional
        Anything ``pandas.read_csv`` accepts. Defaults to the copy of
        ``RI.RTSI_180101_180313.csv`` hosted on GitHub; pass a local path or
        an open buffer to work offline.

    Returns
    -------
    pandas.DataFrame
        Columns ``open``, ``high``, ``low``, ``close``, ``vol`` (in that
        order), indexed by a ``DatetimeIndex`` named ``datetime`` that is
        built from the ``<DATE>`` and ``<TIME>`` columns.

    Raises
    ------
    ValueError
        Raised by pandas when a required column is missing or a timestamp
        cannot be parsed.
    '''
    # Source header -> output name; the dict order is the output column order.
    renames = {
        '<OPEN>': 'open',
        '<HIGH>': 'high',
        '<LOW>': 'low',
        '<CLOSE>': 'close',
        '<VOL>': 'vol',
    }
    raw = pd.read_csv(
        url,
        usecols=['<DATE>', '<TIME>', *renames],
        # Keep date/time as text so leading zeros survive until parsing.
        dtype={'<DATE>': str, '<TIME>': str},
    )
    # Join the two fields with a space and let pandas infer the format.
    # The dict form of parse_dates used to do this but is deprecated.
    stamps = pd.to_datetime(raw['<DATE>'] + ' ' + raw['<TIME>'])

    # Select by header name: read_csv returns usecols in *file* order, so a
    # positional rename would mislabel an export with reordered columns.
    ohlcv = raw[list(renames)].rename(columns=renames)
    ohlcv.index = pd.DatetimeIndex(stamps, name='datetime')
    return ohlcv

# Load the bars from the CSV, then keep only the bar-to-bar change of the
# close price (the first bar has no predecessor, so its NaN is dropped)
ohlcv = read_csv_rtsi()
df = ohlcv['close'].diff().dropna().to_frame()
df.tail()


In [None]:
# Index range, column dtype and non-null count of the close-diff frame
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, extremes) of the close diffs
df.describe()

## Price plot


In [None]:
# Default figure size for this and the following figures
rcParams['figure.figsize'] = (11, 9)
# Close price on the primary axis, traded volume on a secondary y-axis
ohlcv[['close', 'vol']].plot(
    title='RTSI price and volume',
    secondary_y='vol',
)
plt.show()

In [None]:
# Close-price differences over time, drawn as tiny dots with no connecting line
ax = df.plot(
    title='RTSI',
    fontsize=6,
    linestyle='none',
    marker='.',
    markersize=1,
)
plt.show()

In [None]:
# Distribution of the close-price differences held in df:
# boxplot, kernel density estimate and histogram on one figure.
fig = plt.figure()

# Boxplot of the differences
ax1 = fig.add_subplot(221, title='Price diff boxplot', ylabel='Price diff')
df.boxplot(ax=ax1)

# Density plot of the differences
ax2 = fig.add_subplot(222)
df.plot(kind='density', ax=ax2, title='Price diff density')

# Histogram with the mean marked. A plain matplotlib histogram replaces the
# deprecated seaborn distplot(kde=False) call.
ax3 = fig.add_subplot(223)
ax3.hist(df['close'], bins=50, alpha=0.4)
ax3.set_xlabel("Price diff")
ax3.set_title("Price diff distribution")
ax3.axvline(df['close'].mean(), ls='--', label='mean')

ax3.legend()
plt.tight_layout()
# Push the right edge of the subplot area beyond the figure width
# (same layout tweak as the other multi-panel figures in this notebook)
plt.subplots_adjust(right=2)
plt.show()

## Average diff, aggregated by day of week and hour of day


In [None]:
# Mean close-price difference, grouped by day of week and by hour of day.
fig = plt.figure()

# Left panel: average diff per day of week.
# pandas counts dayofweek from Monday = 0, so +1 makes Monday = 1.
ax1 = fig.add_subplot(121)
df.groupby(df.index.dayofweek+1).mean().plot.bar(ax=ax1)
# NOTE(review): pandas bar plots attach their labels by tick position;
# confirm that swapping in MaxNLocator keeps the labels aligned with the bars.
ax1.xaxis.set_major_locator(MaxNLocator(integer=True))
ax1.set_xlabel("Day of week")
ax1.set_ylabel("Diff")
ax1.set_title("Avg diff by day of week")

# Right panel: average diff per hour of the day.
ax2 = fig.add_subplot(122)
df.groupby(df.index.hour).mean().plot.bar(ax=ax2)
ax2.xaxis.set_major_locator(MaxNLocator(integer=True))
ax2.set_xlabel("Day hour")
# Previously set on ax1 a second time, which left this panel without a y-label
ax2.set_ylabel("Diff")
ax2.set_title("Avg diff by hour of the day")
plt.subplots_adjust(right=2)
plt.show()


## Autocorrelation and partial autocorrelation
ToDo: normalize the data

In [None]:
# Correlograms of the close-price differences, up to 60 lags, side by side
fig, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: autocorrelation
tsaplots.plot_acf(df, ax=ax1, lags=60)

# Right panel: partial autocorrelation
tsaplots.plot_pacf(df, ax=ax2, lags=60)

plt.subplots_adjust(right=2)
plt.show()

## Seasonality, trend and noise
Todo: detect seasonality properly

In [None]:
rcParams['figure.figsize'] = 11,9
# Split the close-price differences into trend, seasonal and residual parts,
# assuming one seasonal cycle spans 1200 observations.
# NOTE(review): the original comment read "1200 minutes in a day", but a
# calendar day has 1440 minutes; confirm 1200 matches the bars per cycle.
try:
    # Newer statsmodels releases name the cycle length `period`
    decomposition = sm.tsa.seasonal_decompose(df, period=1200)
except TypeError:
    # Older releases only accept the previous keyword name `freq`
    decomposition = sm.tsa.seasonal_decompose(df, freq=1200)
decomposition.plot()
plt.show()