In [57]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import datetime
import csv
import math
import time
from ProgressBar import ProgressBar
import scipy
import pickle
import cPickle


import nltk
import string
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import Lasso
from sklearn.cross_validation import train_test_split

In [7]:
def save_sparse_csr(filename, array):
    """Store the CSR components of ``array`` in an ``.npz`` archive.

    The archive written by ``np.savez`` holds four entries -- ``data``,
    ``indices``, ``indptr`` and ``shape`` -- each read from the attribute
    of the same name on ``array``.
    """
    csrParts = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **csrParts)

def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an ``.npz`` archive.

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``np.load``. The archive must contain the
        entries ``data``, ``indices``, ``indptr`` and ``shape`` (the layout
        written by ``save_sparse_csr``).

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix assembled from the stored components.
    """
    # Import the submodule explicitly: a bare ``import scipy`` is not
    # guaranteed to expose ``scipy.sparse`` on every SciPy release.
    from scipy.sparse import csr_matrix

    # The NpzFile keeps the underlying archive open until it is closed, so
    # read everything inside a context manager to release the handle.
    with np.load(filename) as loader:
        components = (loader['data'], loader['indices'], loader['indptr'])
        # Hand csr_matrix a plain tuple of ints rather than a stored array.
        shape = tuple(int(dim) for dim in loader['shape'])
    return csr_matrix(components, shape=shape)

# Load CCI data

In [77]:
# Keep only the TIME and Value columns, key the frame by TIME, and name the
# single remaining column "CCI".
CCI = (
    pd.read_csv('CCI.csv')[['TIME', 'Value']]
    .set_index('TIME')
    .rename(columns={'Value': 'CCI'})
)
CCI.head()

Unnamed: 0_level_0,CCI
TIME,Unnamed: 1_level_1
1960-01,101.584
1960-02,101.3255
1960-03,101.1023
1960-04,100.9791
1960-05,101.0102


In [82]:
# Positional slice from the first row labelled '1990-01' through the first
# row labelled '2016-09' (inclusive, hence the +1 on the stop position).
CCIlimited = CCI.iloc[
    np.flatnonzero(CCI.index == '1990-01')[0]:
    np.flatnonzero(CCI.index == '2016-09')[0] + 1
]

# Load LDA data

In [6]:
# Number of topics handed to the LDA estimator.
num_topics = 10

# NOTE(review): newer scikit-learn releases name this argument
# `n_components`; `n_topics` matches the old release this notebook imports.
lda = LatentDirichletAllocation(
    n_topics=num_topics,
)

In [8]:
# Sparse matrix restored from the saved .npz archive.
wordMatrix = load_sparse_csr(filename='unigramWordMatrix.npz')

In [9]:
# Fit the LDA model on the word matrix and keep the transformed output.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt and
# no visible later cell reads `ldaDocs`; confirm whether it is still needed.
ldaDocs = lda.fit_transform(X=wordMatrix)

KeyboardInterrupt: 

In [11]:
# Array of topic values loaded from disk; it is given a monthly index and
# Topic_* column labels further down.
topicsByMonth = np.load(file='topicsByMonth.npy')

# Assemble the features

In [97]:
# Wrap the topic array in a DataFrame indexed like CCIlimited, with one
# Topic_<k> column per array column. The number of labels is taken from the
# array width instead of a hard-coded 10, so it cannot drift from the data.
allPredictors = pd.DataFrame(
    topicsByMonth,
    index=CCIlimited.index,
    columns=['Topic_{}'.format(i) for i in range(1, topicsByMonth.shape[1] + 1)],
)
allPredictors.head()

Unnamed: 0_level_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10
TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1990-01,0.69509,0.652963,1.61282,0.881739,0.806933,0.768514,1.578718,1.19666,10.205738,1.360887
1990-02,0.680714,0.540261,1.404959,0.809531,0.837775,0.674393,1.405153,1.207867,11.122098,1.293593
1990-03,0.776482,0.697811,1.60287,0.972046,1.211075,0.890776,1.657627,1.389039,8.920756,1.43653
1990-04,0.615586,0.52308,1.297542,0.793232,1.309289,1.286157,1.244166,1.454499,9.202706,1.241686
1990-05,0.650422,0.61745,1.527278,0.869329,1.480203,1.489422,1.615994,1.461718,7.798623,1.339317


In [100]:
# Place the CCI column next to the topic columns (column-wise concat).
allPredictorsCCI = pd.concat(
    [CCIlimited, allPredictors],
    axis=1,
)

In [103]:
def applyShifts(df, shifts):
    """Append shifted copies of every column of ``df`` and drop incomplete rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    shifts : iterable of int
        Each value ``i`` is passed to ``df.shift(periods=i)``; the shifted
        columns are labelled ``<original>_lag_<i>``.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by one group of shifted columns per
        entry of ``shifts`` (in the order given), restricted to rows that
        contain no missing value.
    """
    # Collect all pieces first and concatenate once, rather than copying the
    # growing frame on every loop iteration.
    pieces = [df]
    for i in shifts:
        shifted = df.shift(periods=i)
        # format() instead of `s + ...` so non-string column labels
        # (e.g. a default integer column index) do not raise TypeError.
        shifted.columns = ['{}_lag_{}'.format(s, i) for s in df.columns]
        pieces.append(shifted)
    finaldf = pd.concat(pieces, axis=1)
    # Shifting introduces missing values at the edge of the frame; keep only
    # rows where every original and shifted column is present.
    return finaldf.dropna(how="any", axis=0)

In [139]:
# Topic-only feature set: current values plus shifts of 1, 12 and 13 rows.
allPredictors_withLags = applyShifts(df=allPredictors, shifts=[1, 12, 13])

In [140]:
# Feature set that also includes lagged CCI: shifts of 1, 2, 12, 13 and 14
# rows are applied to the CCI column as well as the topic columns.
allPredictorsCCI_withLags = applyShifts(
    df=allPredictorsCCI,
    shifts=[1, 2, 12, 13, 14],
)

In [141]:
# Preview the first five rows of the lagged frame.
allPredictorsCCI_withLags.head(5)

Unnamed: 0_level_0,CCI,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,...,Topic_1_lag_14,Topic_2_lag_14,Topic_3_lag_14,Topic_4_lag_14,Topic_5_lag_14,Topic_6_lag_14,Topic_7_lag_14,Topic_8_lag_14,Topic_9_lag_14,Topic_10_lag_14
TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1991-03,99.25845,0.651389,0.506475,1.264501,0.685694,0.766073,0.665447,1.24706,1.252197,11.888972,...,0.69509,0.652963,1.61282,0.881739,0.806933,0.768514,1.578718,1.19666,10.205738,1.360887
1991-04,99.46318,0.626874,0.543955,1.354595,0.842407,0.833072,0.643905,1.331632,1.261559,11.486755,...,0.680714,0.540261,1.404959,0.809531,0.837775,0.674393,1.405153,1.207867,11.122098,1.293593
1991-05,99.44612,0.823252,0.676102,1.495341,0.87431,1.164054,0.878346,1.598711,1.347254,9.257493,...,0.776482,0.697811,1.60287,0.972046,1.211075,0.890776,1.657627,1.389039,8.920756,1.43653
1991-06,99.51149,0.788818,0.656855,1.67805,0.992233,1.265443,0.996644,1.635461,1.303222,8.752645,...,0.615586,0.52308,1.297542,0.793232,1.309289,1.286157,1.244166,1.454499,9.202706,1.241686
1991-07,99.57294,0.685575,0.618746,1.439939,0.925509,1.133617,0.978196,1.486801,1.621665,9.170263,...,0.650422,0.61745,1.527278,0.869329,1.480203,1.489422,1.615994,1.461718,7.798623,1.339317


# Modeling

In [168]:
# Target rows from '1991-02' through '2016-09' inclusive, selected by
# position. The start label is the first row that survives the 13-row shift
# applied when building allPredictors_withLags.
y = CCIlimited.iloc[
    np.flatnonzero(CCIlimited.index == '1991-02')[0]:
    np.flatnonzero(CCIlimited.index == '2016-09')[0] + 1
]

In [173]:
# Split features and target with train_size=0.8. A fixed random_state makes
# the shuffled split, and therefore the fitted model and its score,
# reproducible across kernel restarts.
# NOTE(review): rows are consecutive months, so a shuffled split mixes past
# and future observations between train and test; confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    allPredictors_withLags, y, train_size=0.8, random_state=0)

In [174]:
# L1-regularised linear model for the topic-only feature set.
lasso = Lasso(alpha=0.1)

In [175]:
# Fit on the training portion of the split.
lasso.fit(X=X_train, y=y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [176]:
# Score the fitted model on the held-out rows.
lasso.score(X=X_test, y=y_test)

0.06586465897098015

In [109]:
# Target is the current-month CCI column; every other column (topics, their
# lags, and the lagged CCI columns) becomes a predictor.
y = allPredictorsCCI_withLags['CCI']
X = allPredictorsCCI_withLags.drop('CCI', axis=1)

In [113]:
# Split features and target with train_size=0.8. A fixed random_state makes
# the shuffled split, and therefore the fitted model and its score,
# reproducible across kernel restarts.
# NOTE(review): X contains lagged CCI and rows are consecutive months, so a
# shuffled split lets neighbouring months land on both sides; confirm that
# this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=0)

In [114]:
# L1-regularised linear model for the feature set that includes lagged CCI.
lasso = Lasso(alpha=0.01)

In [115]:
# Fit on the training portion of the split.
lasso.fit(X=X_train, y=y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [116]:
# Score the fitted model on the held-out rows.
lasso.score(X=X_test, y=y_test)

0.98229630244570887

In [119]:
# Display the fitted coefficients, one per column of X in column order.
lasso.coef_

array([  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         4.45763009e-03,  -0.00000000e+00,  -3.72963293e-03,
         0.00000000e+00,   1.23185967e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,  -6.71150402e-03,   0.00000000e+00,
        -2.55494248e-01,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         3.15796295e-02,   0.00000000e+00,   0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
        -0.00000000e+00,  -0.00000000e+00,  -6.76699089e-03,
        -0.00000000e+00,  -0.00000000e+00,  -0.00000000e+00,
        -0.00000000e+00,  -0.00000000e+00,   1.60872249e-04,
        -0.00000000e+00,   0.00000000e+00,  -0.00000000e+00,
        -0.00000000e+00,  -0.00000000e+00,  -0.00000000e+00,
         0.00000000e+00,