# Multiple LInear Regression Model
## Mimicking the process of building our trading model of SPY, base on the historical data of different stock markets. 

In [2]:
import pandas as pd
import statsmodels.formula.api as smf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import warnings 
warnings.filterwarnings("ignore")

In [4]:
#Import all stock market data into DataFrame
aord = pd.read_csv('data/indice/ALLOrdinary.csv')
nikkei = pd.read_csv('data/indice/Nikkei225.csv')
hsi = pd.read_csv('data/indice/HSI.csv')
daxi = pd.read_csv('data/indice/DAXI.csv')
cac40 = pd.read_csv('data/indice/CAC40.csv')
sp500 = pd.read_csv('data/indice/SP500.csv')
dji = pd.read_csv('data/indice/DJI.csv')
nasdaq = pd.read_csv('data/indice/nasdaq_composite.csv')
spy = pd.read_csv('data/indice/SPY.csv')

In [5]:
nasdaq.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2008-01-02,2653.909912,2661.5,2597.810059,2609.629883,2609.629883,2076690000
1,2008-01-03,2611.959961,2624.27002,2592.179932,2602.679932,2602.679932,1970200000
2,2008-01-04,2571.080078,2571.080078,2502.679932,2504.649902,2504.649902,2516310000
3,2008-01-07,2514.149902,2521.620117,2471.22998,2499.459961,2499.459961,2600100000
4,2008-01-08,2506.969971,2527.419922,2440.51001,2440.51001,2440.51001,2566480000


Showing the first five rows of the nasdaq data including adjusted close and number of stock traded on that day

## Step 1: Data Munging

In [6]:
# Due to the timezone issues, we extract and calculate appropriate stock market data for analysis
# Indicepanel is the DataFrame of our trading model
indicepanel=pd.DataFrame(index=spy.index)

indicepanel['spy']=spy['Open'].shift(-1)-spy['Open']
indicepanel['spy_lag1']=indicepanel['spy'].shift(1)
indicepanel['sp500']=sp500["Open"]-sp500['Open'].shift(1)
indicepanel['nasdaq']=nasdaq['Open']-nasdaq['Open'].shift(1)
indicepanel['dji']=dji['Open']-dji['Open'].shift(1)

indicepanel['cac40']=cac40['Open']-cac40['Open'].shift(1)
indicepanel['daxi']=daxi['Open']-daxi['Open'].shift(1)

indicepanel['aord']=aord['Close']-aord['Open']
indicepanel['hsi']=hsi['Close']-hsi['Open']
indicepanel['nikkei']=nikkei['Close']-nikkei['Open']
indicepanel['Price']=spy['Open']

In [7]:
indicepanel.head()

Unnamed: 0,spy,spy_lag1,sp500,nasdaq,dji,cac40,daxi,aord,hsi,nikkei,Price
0,-1.619995,,,,,,,15.5,-71.679688,-464.320313,146.529999
1,-1.570008,-1.619995,-20.419922,-41.949951,-217.70019,-71.779785,-104.450195,-50.100097,-162.75,-48.830078,144.910004
2,-1.529998,-1.570008,-3.540039,-40.879883,2.43945,5.489746,-27.990235,-2.300293,515.349609,99.370117,143.339996
3,0.270004,-1.529998,-29.940064,-56.930176,-245.40918,-111.689941,-102.709961,-117.399903,216.951171,234.450195,141.809998
4,-2.990006,0.270004,1.640015,-7.179931,19.75,44.509766,33.680176,-27.5,-354.060547,-158.209961,142.080002


In [8]:
#Checking whether we have NaN values in the indicepanel
indicepanel.isnull().sum()

spy          1
spy_lag1     1
sp500        1
nasdaq       1
dji          1
cac40        3
daxi        11
aord         2
hsi         57
nikkei      57
Price        0
dtype: int64

Based on the above, each variable has NaN apart from price