### Setting the Desired Frequency

In [1]:
import pandas as pd
import numpy as np

In [93]:
raw_data = pd.read_csv('Index2018.csv')
df_comp = raw_data.copy()

In [81]:
df_comp = df_comp.asfreq('b') #d = diarily, m = monthly, a = annual

In [82]:
df_comp.head()

Unnamed: 0,date,spx,dax,ftse,nikkei
1970-01-01,,,,,


### Handling Missing Values

In [83]:
df_comp.isna().sum()

date      1
spx       1
dax       1
ftse      1
nikkei    1
dtype: int64

In [85]:
df_comp.spx = df_comp.spx.fillna(method= 'ffill')

In [86]:
# Front filling(ffill) copies the last known value, where as backfilling(bfill) uses the next known value
df_comp.ftse = df_comp.ftse.fillna(method= 'bfill')

In [88]:
df_comp.dax = df_comp.dax.fillna(value = df_comp.dax.mean())

### Simplying the Dataset

In [94]:
# Creating a column
df_comp['market_value'] = df_comp.spx

In [95]:
# Cheking results
df_comp.describe()

Unnamed: 0,spx,dax,ftse,nikkei,market_value
count,6269.0,6269.0,6269.0,6269.0,6269.0
mean,1288.127542,6080.063363,5422.713545,14597.0557,1288.127542
std,487.586473,2754.361032,1145.572428,4043.122953,487.586473
min,438.92,1911.7,2876.6,7054.98,438.92
25%,990.671905,4069.35,4486.1,10709.29,990.671905
50%,1233.42,5773.34,5662.43,15028.17,1233.42
75%,1459.987747,7443.07,6304.25,17860.47,1459.987747
max,2872.867839,13559.6,7778.637689,24124.15,2872.867839


In [96]:
# Delet column
del df_comp['spx']

In [97]:
df_comp.describe()

Unnamed: 0,dax,ftse,nikkei,market_value
count,6269.0,6269.0,6269.0,6269.0
mean,6080.063363,5422.713545,14597.0557,1288.127542
std,2754.361032,1145.572428,4043.122953,487.586473
min,1911.7,2876.6,7054.98,438.92
25%,4069.35,4486.1,10709.29,990.671905
50%,5773.34,5662.43,15028.17,1233.42
75%,7443.07,6304.25,17860.47,1459.987747
max,13559.6,7778.637689,24124.15,2872.867839


In [99]:
# Revoming the time series for the other market indices.
# Removing redundant columns drecreases the size of the dataset and makes going through it faster
del df_comp['dax'], df_comp['ftse'], df_comp['nikkei']

In [100]:
df_comp.describe()

Unnamed: 0,market_value
count,6269.0
mean,1288.127542
std,487.586473
min,438.92
25%,990.671905
50%,1233.42
75%,1459.987747
max,2872.867839


### Splitting the Data

In [101]:
# Splitting the data into 2 sets: training set and testing set
size  = int(len(df_comp) * 0.8)

In [102]:
df = df_comp.iloc[:size] #iloc[:size] is a slice of dataframe containing all the values from the start up to the size index

In [103]:
df_test = df_comp.iloc[size:]

In [105]:
df.tail()

Unnamed: 0,date,market_value
5010,28/03/2013,1569.185872
5011,29/03/2013,1569.185872
5012,01/04/2013,1562.173837
5013,02/04/2013,1570.252238
5014,03/04/2013,1553.686978


In [106]:
df_test.head()

Unnamed: 0,date,market_value
5015,04/04/2013,1559.979316
5016,05/04/2013,1553.27893
5017,08/04/2013,1563.071269
5018,09/04/2013,1568.607909
5019,10/04/2013,1587.731827
