In [33]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Data Preparation

In [2]:
# Check dataset
DATA_DIR = 'data'
os.listdir(DATA_DIR)

['SupportData.csv', 'YieldCurve.txt']

In [3]:
# Load Datasets
yc_path = os.path.join(DATA_DIR, 'YieldCurve.txt')
yc_df = pd.read_csv(yc_path, sep='\t')
yc_df.head()

Unnamed: 0,Date,1 Mo,3 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,30 Yr
0,01/02/90,,7.83,7.89,7.81,7.87,7.9,7.87,7.98,7.94,,8.0
1,01/03/90,,7.89,7.94,7.85,7.94,7.96,7.92,8.04,7.99,,8.04
2,01/04/90,,7.84,7.9,7.82,7.92,7.93,7.91,8.02,7.98,,8.04
3,01/05/90,,7.79,7.85,7.79,7.9,7.94,7.92,8.03,7.99,,8.06
4,01/08/90,,7.79,7.88,7.81,7.9,7.95,7.92,8.05,8.02,,8.09


In [4]:
# Load Datasets
sd_path = os.path.join(DATA_DIR, 'SupportData.csv')
sd_df = pd.read_csv(sd_path)
sd_df.head()

Unnamed: 0,Year,Month,Day,Federal Funds Target Rate,Federal Funds Upper Target,Federal Funds Lower Target,Effective Federal Funds Rate,Real GDP (Percent Change),Unemployment Rate,Inflation Rate
0,1954,7,1,,,,0.8,4.6,5.8,
1,1954,8,1,,,,1.22,,6.0,
2,1954,9,1,,,,1.06,,6.1,
3,1954,10,1,,,,0.85,8.0,5.7,
4,1954,11,1,,,,0.83,,5.3,


In [5]:
sd_df['Date'] = ''
sd_df.head()

Unnamed: 0,Year,Month,Day,Federal Funds Target Rate,Federal Funds Upper Target,Federal Funds Lower Target,Effective Federal Funds Rate,Real GDP (Percent Change),Unemployment Rate,Inflation Rate,Date
0,1954,7,1,,,,0.8,4.6,5.8,,
1,1954,8,1,,,,1.22,,6.0,,
2,1954,9,1,,,,1.06,,6.1,,
3,1954,10,1,,,,0.85,8.0,5.7,,
4,1954,11,1,,,,0.83,,5.3,,


In [6]:
for i in range(len(sd_df)):
    yy = '{}'.format(sd_df['Year'][i]).replace('19', '').replace('20', '')
    mm = '{0: >2}'.format(sd_df['Month'][i]).replace(' ', '0')
    dd = '{0: >2}'.format(sd_df['Day'][i]).replace(' ', '0')
    sd_df['Date'][i] = '{}/{}/{}'.format(mm, dd, yy)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [7]:
sd_df.head()

Unnamed: 0,Year,Month,Day,Federal Funds Target Rate,Federal Funds Upper Target,Federal Funds Lower Target,Effective Federal Funds Rate,Real GDP (Percent Change),Unemployment Rate,Inflation Rate,Date
0,1954,7,1,,,,0.8,4.6,5.8,,07/01/54
1,1954,8,1,,,,1.22,,6.0,,08/01/54
2,1954,9,1,,,,1.06,,6.1,,09/01/54
3,1954,10,1,,,,0.85,8.0,5.7,,10/01/54
4,1954,11,1,,,,0.83,,5.3,,11/01/54


In [8]:
data_df = pd.merge(sd_df, yc_df, how='inner', on=['Date'])
data_df.head()

Unnamed: 0,Year,Month,Day,Federal Funds Target Rate,Federal Funds Upper Target,Federal Funds Lower Target,Effective Federal Funds Rate,Real GDP (Percent Change),Unemployment Rate,Inflation Rate,...,3 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,30 Yr
0,1990,2,1,8.25,,,8.24,,5.3,4.6,...,8.02,8.13,8.09,8.28,8.35,8.35,8.38,8.42,,8.44
1,1990,3,1,8.25,,,8.28,,5.2,4.9,...,8.08,8.19,8.21,8.53,8.53,8.53,8.62,8.59,,8.61
2,1990,5,1,8.25,,,8.18,,5.4,4.8,...,8.19,8.49,8.56,9.02,9.09,9.08,9.09,9.08,,9.04
3,1990,6,1,8.25,,,8.29,,5.2,4.9,...,7.94,8.01,8.06,8.33,8.36,8.38,8.47,8.44,,8.43
4,1990,7,13,8.0,,,,,,,...,7.85,7.91,7.89,8.13,8.25,8.32,8.43,8.45,,8.46


In [9]:
data_df = data_df.fillna(method='backfill').fillna(method='pad')

In [10]:
data_df.head()

Unnamed: 0,Year,Month,Day,Federal Funds Target Rate,Federal Funds Upper Target,Federal Funds Lower Target,Effective Federal Funds Rate,Real GDP (Percent Change),Unemployment Rate,Inflation Rate,...,3 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,30 Yr
0,1990,2,1,8.25,0.25,0.0,8.24,-3.4,5.3,4.6,...,8.02,8.13,8.09,8.28,8.35,8.35,8.38,8.42,6.12,8.44
1,1990,3,1,8.25,0.25,0.0,8.28,-3.4,5.2,4.9,...,8.08,8.19,8.21,8.53,8.53,8.53,8.62,8.59,6.12,8.61
2,1990,5,1,8.25,0.25,0.0,8.18,-3.4,5.4,4.8,...,8.19,8.49,8.56,9.02,9.09,9.08,9.09,9.08,6.12,9.04
3,1990,6,1,8.25,0.25,0.0,8.29,-3.4,5.2,4.9,...,7.94,8.01,8.06,8.33,8.36,8.38,8.47,8.44,6.12,8.43
4,1990,7,13,8.0,0.25,0.0,8.13,-3.4,5.7,5.5,...,7.85,7.91,7.89,8.13,8.25,8.32,8.43,8.45,6.12,8.46


In [11]:
data_df.columns

Index(['Year', 'Month', 'Day', 'Federal Funds Target Rate',
       'Federal Funds Upper Target', 'Federal Funds Lower Target',
       'Effective Federal Funds Rate', 'Real GDP (Percent Change)',
       'Unemployment Rate', 'Inflation Rate', 'Date', '1 Mo', '3 Mo', '6 Mo',
       '1 Yr', '2 Yr', '3 Yr', '5 Yr', '7 Yr', '10 Yr', '20 Yr', '30 Yr'],
      dtype='object')

In [12]:
drop_list = ['Year', 'Month', 'Day', '1 Mo', '3 Mo', '6 Mo', '2 Yr', '3 Yr', 
             '5 Yr', '7 Yr', '10 Yr', '20 Yr', '30 Yr']
for col in drop_list:
    data_df = data_df.drop(col, 'columns')

In [13]:
data_df.head()

Unnamed: 0,Federal Funds Target Rate,Federal Funds Upper Target,Federal Funds Lower Target,Effective Federal Funds Rate,Real GDP (Percent Change),Unemployment Rate,Inflation Rate,Date,1 Yr
0,8.25,0.25,0.0,8.24,-3.4,5.3,4.6,02/01/90,8.09
1,8.25,0.25,0.0,8.28,-3.4,5.2,4.9,03/01/90,8.21
2,8.25,0.25,0.0,8.18,-3.4,5.4,4.8,05/01/90,8.56
3,8.25,0.25,0.0,8.29,-3.4,5.2,4.9,06/01/90,8.06
4,8.0,0.25,0.0,8.13,-3.4,5.7,5.5,07/13/90,7.89


# Data Cleaning

In [14]:
data_mat = np.array(data_df)[:, 0: -2]
data_mat = data_mat.astype(float)
data_mat, data_mat.shape

(array([[ 8.25,  0.25,  0.  , ..., -3.4 ,  5.3 ,  4.6 ],
        [ 8.25,  0.25,  0.  , ..., -3.4 ,  5.2 ,  4.9 ],
        [ 8.25,  0.25,  0.  , ..., -3.4 ,  5.4 ,  4.8 ],
        ...,
        [ 1.  ,  0.75,  0.5 , ...,  3.5 ,  4.7 ,  2.2 ],
        [ 1.  ,  0.75,  0.5 , ...,  3.5 ,  4.7 ,  2.2 ],
        [ 1.  ,  1.  ,  0.75, ...,  3.5 ,  4.7 ,  2.2 ]]), (287, 7))

# Normalization

In [15]:
data_mat = normalize(data_mat, 'max', axis=1)

In [16]:
data_mat

array([[ 1.        ,  0.03030303,  0.        , ..., -0.41212121,
         0.64242424,  0.55757576],
       [ 0.99637681,  0.03019324,  0.        , ..., -0.41062802,
         0.62801932,  0.59178744],
       [ 1.        ,  0.03030303,  0.        , ..., -0.41212121,
         0.65454545,  0.58181818],
       ...,
       [ 0.21276596,  0.15957447,  0.10638298, ...,  0.74468085,
         1.        ,  0.46808511],
       [ 0.21276596,  0.15957447,  0.10638298, ...,  0.74468085,
         1.        ,  0.46808511],
       [ 0.21276596,  0.21276596,  0.15957447, ...,  0.74468085,
         1.        ,  0.46808511]])

# PCA: Singular Value Decomposition

In [17]:
pca = PCA(n_components=3)
pca.fit(data_mat) 

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [18]:
np.set_printoptions(precision=5, suppress=True)
print('explained variance ratio:', pca.explained_variance_ratio_)  
print('singular values:', pca.singular_values_)

explained variance ratio: [0.65002 0.29853 0.03511]
singular values: [8.80571 5.96756 2.04651]


In [19]:
trans_mat = pca.fit_transform(data_mat)

In [27]:
trans_mat[0:5], trans_mat.shape

(array([[-0.54186,  0.92675, -0.14558],
        [-0.54853,  0.92755, -0.12736],
        [-0.53925,  0.92792, -0.11876],
        [-0.54781,  0.92688, -0.12831],
        [-0.54426,  0.94031, -0.01612]]), (287, 3))

In [28]:
trans_mat = normalize(trans_mat, 'max', axis=1)
trans_mat[0:5], trans_mat.shape

(array([[-0.58468,  1.     , -0.15708],
        [-0.59138,  1.     , -0.13731],
        [-0.58114,  1.     , -0.12799],
        [-0.59102,  1.     , -0.13844],
        [-0.57881,  1.     , -0.01714]]), (287, 3))

# Linear Regression

In [30]:
fea_mat = []
label_mat = []
for i in range(51, len(trans_mat)):
    feature = trans_mat[i - 51: i].reshape([-1])
    fea_mat.append(feature.tolist())
    label_mat.append(float(data_df['1 Yr'][i]))
fea_mat = np.array(fea_mat)
label_mat = np.array(label_mat)
fea_mat[0:5], fea_mat.shape, label_mat[0:5], label_mat.shape

(array([[-0.58468,  1.     , -0.15708, -0.59138,  1.     , -0.13731,
         -0.58114,  1.     , -0.12799, -0.59102,  1.     , -0.13844,
         -0.57881,  1.     , -0.01714, -0.57881,  1.     , -0.01714,
         -0.57247,  1.     , -0.01937, -4.81608,  1.     ,  0.61388,
         -4.81608,  1.     ,  0.61388, -2.75041,  0.55716,  1.     ,
         -2.40696,  0.42871,  1.     , -2.14315,  0.33005,  1.     ,
         -1.93417,  0.25189,  1.     , -1.72051,  0.19535,  1.     ,
         -1.85976,  0.23781,  1.     , -1.76742,  0.19956,  1.     ,
         -1.76742,  0.19956,  1.     , -1.52537,  0.86511,  1.     ,
         -1.52537,  0.86511,  1.     , -1.76569,  0.9494 ,  1.     ,
         -1.59488,  1.     ,  0.88168, -1.26677,  1.     ,  0.81833,
         -1.18403,  1.     ,  0.83816, -1.18403,  1.     ,  0.83816,
         -1.17105, -0.75209,  1.     , -1.17105, -0.75209,  1.     ,
         -0.401  , -1.17722,  1.     , -0.24389, -1.19154,  1.     ,
          0.06207, -1.21942,  1.  

In [32]:
X_train, X_test, y_train, y_test = \
    fea_mat[0:200], fea_mat[200:], label_mat[0:200], label_mat[200:]

In [36]:
reg = LinearRegression().fit(X_train, y_train)

In [37]:
reg.score(X_train, y_train)

0.9867969116777805

In [38]:
reg.coef_

array([ 0.02905, -0.1052 ,  0.10827,  0.0325 , -0.11537, -0.31399,
       -0.00266,  0.02023, -0.12379,  0.02266, -0.07715, -0.17471,
        0.03074, -0.10796, -0.2697 ,  0.01458, -0.05503,  0.32127,
       -0.00389,  0.01611, -0.58591,  0.02676, -0.0938 , -0.41479,
        0.03077, -0.11572,  0.30665, -0.01036,  0.04457,  0.00215,
        0.01377, -0.03679, -0.03059, -0.01081,  0.06227,  0.07543,
        0.00996, -0.02821, -0.15019,  0.00485, -0.01189,  0.04903,
        0.00561, -0.01094, -0.0498 , -0.01546,  0.06174, -0.20011,
        0.00447, -0.0009 , -0.26384,  0.0505 , -0.15332, -0.27886,
        0.02097, -0.03755,  0.01253,  0.02   , -0.06151,  0.3542 ,
        0.0122 ,  0.00136,  0.37607,  0.03783, -0.10204,  0.01243,
        0.03276, -0.12843, -0.11454,  0.00921, -0.03788,  0.30057,
        0.01518, -0.0132 , -0.13231,  0.02653, -0.06803, -0.0819 ,
       -0.00524,  0.01355,  0.76418,  0.00005,  0.03147,  0.84272,
       -0.00019,  0.01611,  0.30292,  0.01957, -0.06837, -0.07

In [39]:
reg.intercept_ 

2.1747740897359114

In [40]:
reg.score(X_test, y_test)

-50.51518164431626

In [42]:
reg.predict(X_test)

array([-0.05803, -0.09737,  0.11151,  0.10342,  0.23368,  0.97081,
        0.93675,  1.0901 ,  1.37167,  1.56284,  1.67239,  1.59918,
        1.73321,  1.72546,  1.80998,  2.03575,  2.29352,  2.44422,
        2.53797,  2.64693,  2.71927,  2.79606,  2.78192,  3.01572,
        3.11789,  3.10161,  3.19201,  3.37853,  3.49213,  3.4058 ,
        3.40729,  3.45426,  3.5357 ,  3.5758 ,  3.60981,  3.67218])

In [43]:
y_test

array([0.18, 0.15, 0.16, 0.14, 0.11, 0.15, 0.13, 0.1 , 0.1 , 0.13, 0.1 ,
       0.11, 0.13, 0.1 , 0.13, 0.27, 0.25, 0.26, 0.28, 0.39, 0.31, 0.51,
       0.7 , 0.47, 0.68, 0.62, 0.7 , 0.45, 0.5 , 0.6 , 0.65, 0.82, 0.92,
       0.83, 0.92, 1.01])

# Kernal PCA: Linear

In [21]:
transformer = KernelPCA(n_components=6, kernel='linear')
X_transformed = transformer.fit_transform(data_mat)

In [22]:
X_transformed.shape

(287, 6)

In [23]:
X_transformed

array([[-0.54186, -0.92675, -0.14558,  0.16047, -0.017  , -0.00091],
       [-0.54853, -0.92755, -0.12736,  0.19154, -0.02358, -0.00386],
       [-0.53925, -0.92792, -0.11876,  0.16663, -0.01313, -0.00362],
       ...,
       [ 0.47131,  0.38694,  0.14735,  0.10834,  0.01393,  0.12878],
       [ 0.47131,  0.38694,  0.14735,  0.10834,  0.01393,  0.12878],
       [ 0.47174,  0.38787,  0.15195,  0.11477,  0.02336,  0.20239]])

# Prepare data for linear regression

# Check Data

In [24]:
len(sd_df)

904

In [25]:
sd_df[500: 503]

Unnamed: 0,Year,Month,Day,Federal Funds Target Rate,Federal Funds Upper Target,Federal Funds Lower Target,Effective Federal Funds Rate,Real GDP (Percent Change),Unemployment Rate,Inflation Rate,Date
500,1990,2,1,8.25,,,8.24,,5.3,4.6,02/01/90
501,1990,3,1,8.25,,,8.28,,5.2,4.9,03/01/90
502,1990,4,1,8.25,,,8.26,1.6,5.4,4.8,04/01/90
