## This notebook imputes the missing values using regression

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import preprocessing

#### Choose whether the imputation should be done on the data set with daily or weekly granularity

In [2]:
# Base paths without file extension: '.csv' is appended to INPUT when loading,
# and '_training.csv' / '_test.csv' are appended to OUTPUT when saving.
INPUT = 'output/CompleteWeeklyIndexes'
OUTPUT = 'output/RegressionWeeklyImputed'

In [3]:
# Read the full data set; the first CSV column is used as the index and
# parse_dates=True asks pandas to parse that index as dates.
df = pd.read_csv(f'{INPUT}.csv', index_col=0, parse_dates=True)

# Test split: every row from 2015-01-02 onwards (taken before `df` is narrowed).
df_test = df.loc['2015-01-02':]

# Training split: rows from 2000-01-01 up to and including 2015-01-01.
df = df.loc['2000-01-01':'2015-01-01']

In [4]:
# Standardise every column to zero mean and unit variance.
# The scaler is fitted on the training split only and then applied to both
# splits, so the test data does not influence the scaling parameters.
scaler = preprocessing.StandardScaler().fit(df)

array = scaler.transform(df)
array_test = scaler.transform(df_test)

# Wrap the scaled arrays back into DataFrames with the original index/columns.
df_scaled = pd.DataFrame(array, index=df.index, columns=df.columns)
df_test_scaled = pd.DataFrame(array_test, index=df_test.index, columns=df_test.columns)

df_scaled.head()

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,0.516845,-1.173684,0.458946,0.0,-0.097348,-0.398262,1.741088,-0.867106,2.026643,-1.613284,...,-0.301548,-1.182157,1.292603,-1.108292,,-0.478968,,,,
2000-01-14,0.622367,-1.195971,0.084189,0.0,0.052938,-0.219366,1.822917,-0.834847,2.142257,-1.613284,...,-0.393082,-1.182157,1.340506,-1.108292,,-0.540299,,,,
2000-01-21,0.625514,-1.144887,0.057589,0.0,-0.024923,-0.201874,2.060511,-0.411409,2.225496,-1.613284,...,-0.459789,-1.182157,1.345545,-1.108292,,-0.706121,,,,
2000-01-28,0.421959,-1.149579,0.340176,0.0,-0.223574,-0.227466,1.828617,-0.353183,2.244522,-1.613284,...,-0.425608,-1.182157,1.284557,-1.108292,0.365881,-0.101894,,0.508212,0.367186,1.245141
2000-02-04,0.487425,-1.191452,0.233777,0.0,-0.212513,-0.472543,1.981936,-0.916105,2.400759,-1.613284,...,-0.389843,-1.182157,1.418557,-1.108292,,0.297896,-0.796131,,,


In [5]:
# Fit the iterative imputer on the scaled training split only
# (at most 10 imputation rounds, fixed random_state for reproducibility).
imp = IterativeImputer(random_state=0, max_iter=10).fit(df_scaled)

# Fill the missing values of both splits with the fitted imputer.
df_imputed = imp.transform(df_scaled)
df_test_imputed = imp.transform(df_test_scaled)

# Restore the original index and column labels on the imputed arrays.
df_final = pd.DataFrame(df_imputed, index=df.index, columns=df.columns)
df_test_final = pd.DataFrame(df_test_imputed, index=df_test.index, columns=df_test.columns)



In [7]:
# Undo the standardisation so the imputed frames are expressed on the
# original (unscaled) value ranges again.
array_2 = scaler.inverse_transform(df_final)
array_test_2 = scaler.inverse_transform(df_test_final)

# Re-attach the index and column labels of the respective split.
df_final2 = pd.DataFrame(array_2, index=df.index, columns=df.columns)
df_test_final2 = pd.DataFrame(array_test_2, index=df_test.index, columns=df_test.columns)

df_final2.head()

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,1420.333984,1068760000.0,25.016,0.0,11250.781836,182562000.0,3542.894043,1598166000.0,18476.772461,0.0,...,16169.60625,0.0,949.868006,0.0,27.15336,0.252857,21.676217,73.345882,83.963067,0.026355
2000-01-14,1448.648023,1033940000.0,21.684,0.0,11587.958008,196256000.0,3611.343994,1609134000.0,18829.544922,0.0,...,15720.128125,0.0,957.357996,0.0,24.79857,0.214286,14.866689,69.843595,82.320136,0.026134
2000-01-21,1449.492493,1113750000.0,21.4475,0.0,11413.272461,197595000.0,3810.092468,1753105000.0,19083.530078,0.0,...,15392.563867,0.0,958.146008,0.0,24.08658,0.11,15.07024,70.800366,76.412735,0.025803
2000-01-28,1394.874023,1106420000.0,23.96,0.0,10967.58789,195636000.0,3616.111963,1772902000.0,19141.585938,0.0,...,15560.411914,0.0,948.61001,0.0,24.412899,0.49,22.936025,71.914247,117.577146,0.033573
2000-02-04,1412.43999,1041000000.0,23.014001,0.0,10992.404102,176876000.0,3744.364014,1581506000.0,19618.308203,0.0,...,15736.035157,0.0,969.562,0.0,27.587744,0.741429,34.391162,65.715073,62.443245,0.02919


In [8]:
# Persist the imputed training and test splits next to each other,
# keeping the date index as the first CSV column.
df_final2.to_csv(f'{OUTPUT}_training.csv', index=True)
df_test_final2.to_csv(f'{OUTPUT}_test.csv', index=True)