In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import datetime
import csv
import math
import time
from ProgressBar import ProgressBar

import nltk
import string
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import Lasso



The raw output from the NY Times API is stored as a separate CSV file for each year. We currently have all the data from 1997 onwards, but will show only a subset as an example here. The data includes the date of publication, article id, headline, and lead paragraph. We then stem the lead paragraph to increase the accuracy of the bag of words by collapsing meaningless variations of a word.

In [4]:
# Read one headerless CSV per year. range() excludes its end point, so this
# loads the 2000-2004 files.
all_data_list = [
    pd.read_csv('{}_Output.csv'.format(year), header=None)
    for year in range(2000, 2005)
]  # list of dataframes

# ignore_index=True gives the stacked frame a unique 0..n-1 index instead of
# repeating each yearly file's own 0..k labels.
data = pd.concat(all_data_list, axis=0, ignore_index=True)
data.columns = ['id', 'date', 'headline', 'lead']

# Dates look like '2000-01-08T00:00:00Z', so the first seven characters are
# 'YYYY-MM'. The vectorized .str slice yields a real column on both Python 2
# and Python 3, whereas map() is a lazy iterator on Python 3.
data['yearmonth'] = data.date.str[:7]

In [5]:
def getStems(lead):
    """Return the English Snowball stems of the words in a lead paragraph.

    Punctuation listed in ``string.punctuation`` is stripped before the text
    is tokenized with ``nltk.word_tokenize``; each token is then stemmed.

    Parameters
    ----------
    lead : str
        Text of one lead paragraph. A missing value (``None`` or a float such
        as the NaN that pandas uses for empty CSV fields) yields no stems.

    Returns
    -------
    list
        One stem per token, in the order the tokens appear.
    """
    # A missing lead is not iterable; treat it as an empty document instead
    # of raising TypeError in the middle of a long batch.
    if lead is None or isinstance(lead, float):
        return []

    stemmer = nltk.stem.SnowballStemmer("english")
    # Build the lookup set once rather than once per character.
    punctuation = set(string.punctuation)
    tokens = nltk.word_tokenize(''.join(ch for ch in lead if ch not in punctuation))
    # Byte-string tokens (Python 2 str) are decoded before stemming; tokens
    # that are already text (Python 3 str / Python 2 unicode) have nothing to
    # decode. A list is returned so the result can be iterated repeatedly.
    return [
        stemmer.stem(tok.decode('utf-8') if isinstance(tok, bytes) else tok)
        for tok in tokens
    ]

In [6]:
def joinstems(stemlist):
    """Concatenate an iterable of stems into one space-separated string."""
    separator = ' '
    return separator.join(stemlist)

In [7]:
# Stem every lead paragraph. A list (rather than map(), which is a one-shot
# iterator on Python 3) lets later cells be re-run without finding `stems`
# already exhausted.
stems = [getStems(lead) for lead in data.lead]

In [8]:
# Store each document's stems as one space-separated string. Building a list
# hands pandas concrete values on both Python 2 and Python 3 instead of a
# lazy map object.
data['leadstems'] = [joinstems(doc_stems) for doc_stems in stems]

In [9]:
# Preview the first five rows, including the new yearmonth/leadstems columns.
data.head(n=5)

Unnamed: 0,id,date,headline,lead,yearmonth,leadstems
0,4fd233bf8eb7c8105d7c430b,2000-01-08T00:00:00Z,THE MARKETS: COMMODITIES,CRUDE OIL FALLS. Crude oil declined more than ...,2000-01,crude oil fall crude oil declin more than 2 pe...
1,4fd237718eb7c8105d7c9aa8,2000-01-08T00:00:00Z,DIMINISHED BRAZILIAN INFLATION,Concern over Brazilian price increases eased a...,2000-01,concern over brazilian price increas eas after...
2,4fd21c448eb7c8105d79c973,2000-01-08T00:00:00Z,Merrill Lynch Reimburses Client For Loss Linke...,Merrill Lynch & Company said today that it rei...,2000-01,merril lynch compani said today that it reimbu...
3,4fd1f22e8eb7c8105d7496d1,2000-01-08T00:00:00Z,NEW UNICOM-PECO MERGER TERMS INCLUDE STOCK BUY...,The Unicom Corporation and the Peco Energy Com...,2000-01,the unicom corpor and the peco energi compani ...
4,4fd203328eb7c8105d768db0,2000-01-08T00:00:00Z,SYSCO AGREES TO BUY FRUIT AND VEGETABLE DISTRI...,"The Sysco Corporation, North America's largest...",2000-01,the sysco corpor north america largest food se...


## Count vectorizing
We will perform topic modeling as a means of feature reduction. Using individual stems as the features led to a high-dimensional problem in which none of the stems were correlated with CCI beyond the noise. Therefore, we will extract how strongly each document corresponds to each topic and then try using those topic weights as the features of a model.

In [None]:
# Bag-of-words counts over the stemmed leads. Terms appearing in more than 80%
# or fewer than 0.5% of documents are dropped, along with English stop words.
countVec = CountVectorizer(stop_words='english', max_df=0.8, min_df=.005, strip_accents='unicode')
wordMatrix = countVec.fit_transform(data.leadstems)

# get_feature_names() was removed in scikit-learn 1.2; its replacement,
# get_feature_names_out(), exists from 1.0 onwards. Use whichever is
# available and keep the vocabulary as a plain list either way.
if hasattr(countVec, 'get_feature_names_out'):
    unigramVocab = list(countVec.get_feature_names_out())
else:
    unigramVocab = countVec.get_feature_names()

In [None]:
# Fit a 10-topic LDA model on the count matrix. ldaDocs holds one row per
# document with its weight on each topic.
N_TOPICS = 10
LDA_SEED = 0  # fixed seed so the topics are reproducible across kernel restarts

try:
    # scikit-learn >= 0.19 names the topic count `n_components`.
    lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=LDA_SEED)
except TypeError:
    # Older releases only accept the original `n_topics` keyword.
    lda = LatentDirichletAllocation(n_topics=N_TOPICS, random_state=LDA_SEED)
ldaDocs = lda.fit_transform(wordMatrix)

In [None]:
# For every topic, list the vocabulary stems with the largest component
# weights, highest weight first.
num_top_words = 10
topic_words = [
    [unigramVocab[i] for i in np.argsort(topic)[::-1][:num_top_words]]
    for topic in lda.components_
]

In [None]:
# One row per topic, one column per ranked stem. Labels are derived from the
# frame's actual shape so they stay valid if the number of topics or the
# number of top words is changed upstream.
topicDF = pd.DataFrame(topic_words)
topicDF.index = ['Topic {}'.format(i) for i in range(1, topicDF.shape[0] + 1)]
topicDF.columns = ['Stem {}'.format(i) for i in range(1, topicDF.shape[1] + 1)]
topicDF

## Group by month
Then, I'll group the articles by month and find the average topics by no