In [45]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import datetime
import csv
import math
import time
from ProgressBar import ProgressBar
import scipy
import pickle
import cPickle


import nltk
import string
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import Lasso
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import r2_score
from pandas.tools.plotting import scatter_matrix

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings('ignore')

# Load the data

In [4]:
CCI = pd.read_csv('CCI.csv')[['TIME','Value']]
CCI.set_index('TIME', inplace=True)
CCI.columns = ['CCI']
CCIlimited = CCI.iloc[np.where(CCI.index=='1990-01')[0][0]:(np.where(CCI.index=='2016-09')[0]+1)[0]]
CCIlimited.head()

Unnamed: 0_level_0,CCI
TIME,Unnamed: 1_level_1
1990-01,100.6027
1990-02,100.5847
1990-03,100.6537
1990-04,100.7061
1990-05,100.5763


In [5]:
topicsByMonthBigrams = np.load('topicsByMonthBigrams.npy')

In [6]:
topicsDF = pd.DataFrame(topicsByMonthBigrams)
topicsDF.columns = ['Topic_{}'.format(i) for i in range(1,(10+1))]
topicsDF.index = CCIlimited.index
topicsDF.head()

Unnamed: 0_level_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10
TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1990-01,1.970367,1.713751,13.720927,1.045895,0.964309,1.910242,0.944947,0.984133,4.037934,1.466336
1990-02,1.729297,1.475177,14.796426,0.927578,0.857963,1.864422,0.911159,0.830229,4.407042,1.361076
1990-03,2.007861,1.850794,11.473599,1.073358,1.093758,2.080598,1.439441,0.99172,4.51455,1.487123
1990-04,1.669137,1.466621,11.93781,0.890469,0.834877,1.712407,1.781014,0.799703,5.274281,1.255967
1990-05,2.046094,1.669316,9.873064,1.026934,0.923531,1.909158,2.11314,0.914692,5.249045,1.398266


In [7]:
topicsCCI = pd.concat([CCIlimited, topicsDF], axis=1)

In [8]:
def applyShifts(df, shifts):
    finaldf = df
    for i in shifts:
        newdf = df.shift(periods=i)
        newdf.columns = [s + '_lag_{}'.format(i) for s in df.columns]
        finaldf = pd.concat([finaldf, newdf], axis=1)
    finaldf = finaldf.dropna(how="any", axis=0)
    return finaldf

In [9]:
topicsCCI_withLags = applyShifts(topicsCCI, [1,2,12,13,14]) # Uses lagged CCI

In [10]:
topicsCCI_withLags.head()

Unnamed: 0_level_0,CCI,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,CCI_lag_1,Topic_1_lag_1,Topic_2_lag_1,Topic_3_lag_1,Topic_4_lag_1,Topic_5_lag_1,Topic_6_lag_1,Topic_7_lag_1,Topic_8_lag_1,Topic_9_lag_1,Topic_10_lag_1,CCI_lag_2,Topic_1_lag_2,Topic_2_lag_2,Topic_3_lag_2,Topic_4_lag_2,Topic_5_lag_2,Topic_6_lag_2,Topic_7_lag_2,Topic_8_lag_2,Topic_9_lag_2,Topic_10_lag_2,CCI_lag_12,Topic_1_lag_12,Topic_2_lag_12,Topic_3_lag_12,Topic_4_lag_12,Topic_5_lag_12,Topic_6_lag_12,Topic_7_lag_12,Topic_8_lag_12,Topic_9_lag_12,Topic_10_lag_12,CCI_lag_13,Topic_1_lag_13,Topic_2_lag_13,Topic_3_lag_13,Topic_4_lag_13,Topic_5_lag_13,Topic_6_lag_13,Topic_7_lag_13,Topic_8_lag_13,Topic_9_lag_13,Topic_10_lag_13,CCI_lag_14,Topic_1_lag_14,Topic_2_lag_14,Topic_3_lag_14,Topic_4_lag_14,Topic_5_lag_14,Topic_6_lag_14,Topic_7_lag_14,Topic_8_lag_14,Topic_9_lag_14,Topic_10_lag_14
TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1
1991-03,99.25845,1.599829,1.335454,15.911069,1.001997,0.766385,1.565196,0.759154,0.749767,4.595518,1.240409,98.53362,2.012058,1.672801,14.355098,0.997745,0.96805,1.981923,1.038405,1.044541,3.90734,1.352808,97.89799,2.118595,1.927273,11.921826,1.111641,1.110824,2.082108,1.031599,1.041817,3.701967,1.740024,100.6537,2.007861,1.850794,11.473599,1.073358,1.093758,2.080598,1.439441,0.99172,4.51455,1.487123,100.5847,1.729297,1.475177,14.796426,0.927578,0.857963,1.864422,0.911159,0.830229,4.407042,1.361076,100.6027,1.970367,1.713751,13.720927,1.045895,0.964309,1.910242,0.944947,0.984133,4.037934,1.466336
1991-04,99.46318,1.626117,1.482205,15.286322,0.853019,0.767486,1.912118,0.857288,0.826584,4.622526,1.310451,99.25845,1.599829,1.335454,15.911069,1.001997,0.766385,1.565196,0.759154,0.749767,4.595518,1.240409,98.53362,2.012058,1.672801,14.355098,0.997745,0.96805,1.981923,1.038405,1.044541,3.90734,1.352808,100.7061,1.669137,1.466621,11.93781,0.890469,0.834877,1.712407,1.781014,0.799703,5.274281,1.255967,100.6537,2.007861,1.850794,11.473599,1.073358,1.093758,2.080598,1.439441,0.99172,4.51455,1.487123,100.5847,1.729297,1.475177,14.796426,0.927578,0.857963,1.864422,0.911159,0.830229,4.407042,1.361076
1991-05,99.44612,1.96885,1.778971,11.988917,1.073547,1.118675,1.960795,1.190443,0.899568,4.439156,1.464889,99.46318,1.626117,1.482205,15.286322,0.853019,0.767486,1.912118,0.857288,0.826584,4.622526,1.310451,99.25845,1.599829,1.335454,15.911069,1.001997,0.766385,1.565196,0.759154,0.749767,4.595518,1.240409,100.5763,2.046094,1.669316,9.873064,1.026934,0.923531,1.909158,2.11314,0.914692,5.249045,1.398266,100.7061,1.669137,1.466621,11.93781,0.890469,0.834877,1.712407,1.781014,0.799703,5.274281,1.255967,100.6537,2.007861,1.850794,11.473599,1.073358,1.093758,2.080598,1.439441,0.99172,4.51455,1.487123
1991-06,99.51149,2.071117,1.817834,11.251383,1.022682,1.078583,2.098459,1.615626,0.979567,4.495524,1.545568,99.44612,1.96885,1.778971,11.988917,1.073547,1.118675,1.960795,1.190443,0.899568,4.439156,1.464889,99.46318,1.626117,1.482205,15.286322,0.853019,0.767486,1.912118,0.857288,0.826584,4.622526,1.310451,100.2907,3.842707,3.044917,0.605528,1.549831,1.933248,3.946499,2.035643,1.988706,0.506042,2.930557,100.5763,2.046094,1.669316,9.873064,1.026934,0.923531,1.909158,2.11314,0.914692,5.249045,1.398266,100.7061,1.669137,1.466621,11.93781,0.890469,0.834877,1.712407,1.781014,0.799703,5.274281,1.255967
1991-07,99.57294,1.758529,1.621444,11.665783,0.966891,0.985065,1.923761,1.51894,0.88375,5.275947,1.374664,99.51149,2.071117,1.817834,11.251383,1.022682,1.078583,2.098459,1.615626,0.979567,4.495524,1.545568,99.44612,1.96885,1.778971,11.988917,1.073547,1.118675,1.960795,1.190443,0.899568,4.439156,1.464889,99.8083,2.248798,1.954451,12.243483,1.10916,1.038092,1.989577,1.534963,0.92225,3.608705,1.630049,100.2907,3.842707,3.044917,0.605528,1.549831,1.933248,3.946499,2.035643,1.988706,0.506042,2.930557,100.5763,2.046094,1.669316,9.873064,1.026934,0.923531,1.909158,2.11314,0.914692,5.249045,1.398266


In [11]:
y = topicsCCI_withLags.CCI
X = topicsCCI_withLags.drop(['CCI'], axis=1)

# Windowed Leave-one-out cross validation
The idea is the same as normal LOOCV but we exclude a region near the test datapoint from our training set because the data is correlated in time.

In [46]:
n = len(topicsCCI_withLags)
n_lags = 14
RSS = 0
for i in range(n):
    X_test = X.iloc[i]
    y_test = y.iloc[i]
    
    train_indexes = list(range(i-n_lags))
    train_indexes.extend(range(i+n_lags,n))
    
    X_train = X.iloc[train_indexes]
    y_train = y.iloc[train_indexes]
    
    lasso = Lasso(alpha=0.001)
    lasso.fit(X_train, y_train)
    RSS += np.square(lasso.predict(X_test)-y_test)

In [47]:
TSS = np.sum(np.square(y-np.mean(y)))

In [54]:
print 'R-squared = {:.4f}'.format((1-RSS/TSS)[0])

R-squared = 0.9891


In [59]:
def evaluateModel_LOOCV(model, X, y, n_lags=14):
    n = len(y)
    RSS = 0
    for i in range(n):
        X_test = X.iloc[i]
        y_test = y.iloc[i]

        train_indexes = list(range(i-n_lags))
        train_indexes.extend(range(i+n_lags,n))

        X_train = X.iloc[train_indexes]
        y_train = y.iloc[train_indexes]

        model.fit(X_train, y_train)
        RSS += np.square(model.predict(X_test)-y_test)
    return((1-RSS/TSS)[0])

In [58]:
lasso = Lasso(alpha=0.001)
evaluateModel_CV(lasso, X, y)

0.98905392442414708