# Import module

In [6]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from nltk.sentiment import SentimentAnalyzer
import unicodedata
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [15]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /home/bf/nltk_data...


True

# Import CSV file with all technical data and news

In [7]:
# encoding to avoid UnicodeDecodeError
data = pd.read_hdf("data.h5", "data")

In [8]:
data.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,lrets,MACD,stochastics,ATR,News
1990-01-01,,,,,,,,,,,Warm-Weather Sissies?A Proud Beginning to 1990...
1990-01-02,,,,,,,,,,,Guest Supply Inc reports earnings for Qtr to S...
1990-01-03,,,,,,,,,,,"For Judaism's Remnant, Coup Is Mixed BlessingP..."
1990-01-04,358.76001,358.76001,352.890015,355.670013,355.670013,177000000.0,-0.00865,0.0,,5.869995,Group W Sports GainsCooney's Common Denominato...
1990-01-05,355.670013,355.670013,351.350006,352.200012,352.200012,158530000.0,-0.009804,-0.036878,,4.320007,President Wins Bipartisan Praise For Solution ...


# Remove data first day

In [9]:
data = data.ix["1990-01-02":]
data.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,lrets,MACD,stochastics,ATR,News
1990-01-02,,,,,,,,,,,Guest Supply Inc reports earnings for Qtr to S...
1990-01-03,,,,,,,,,,,"For Judaism's Remnant, Coup Is Mixed BlessingP..."
1990-01-04,358.76001,358.76001,352.890015,355.670013,355.670013,177000000.0,-0.00865,0.0,,5.869995,Group W Sports GainsCooney's Common Denominato...
1990-01-05,355.670013,355.670013,351.350006,352.200012,352.200012,158530000.0,-0.009804,-0.036878,,4.320007,President Wins Bipartisan Praise For Solution ...
1990-01-06,,,,,,,,,,,CorrectionsFrom Jewish Eden to EmbarrassmentAw...


# Remove holiday 
Feel free to use forward fill to replace Nan value

In [10]:
data.dropna(axis=0, inplace=True)
data.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,lrets,MACD,stochastics,ATR,News
1990-01-30,325.200012,325.730011,319.829987,322.980011,322.980011,186030000.0,-0.00685,-1.512383,0.0,5.900024,"The Moment, Missed; Mr. Bush on Defense: Too..."
1990-02-01,329.079987,329.859985,327.76001,328.790009,328.790009,154580000.0,-0.000882,-1.185424,18.857509,6.879974,"Samuel C. Phillips, Who Directed Apollo Lunar ..."
1990-02-05,330.920013,332.160004,330.450012,331.850006,331.850006,130950000.0,0.002806,-0.738707,28.78934,3.369995,Costly Pitfalls in Worker RetrainingMemorial o...
1990-02-08,333.75,336.089996,332.0,332.959991,332.959991,176240000.0,-0.00237,-0.328807,39.060605,4.23999,No End to the ThreatTempest Technologies repor...
1990-02-09,333.019989,334.600006,332.410004,333.619995,333.619995,146910000.0,0.00198,0.019442,41.643793,2.190002,LAWYER CONVICTED IN ABDUCTION PLOTOil Tanker i...


# Removing open high low close

In [11]:
data.drop(['Open', 'High', 'Low', 'Close'], axis=1, inplace=True)
data.head()

Unnamed: 0,Adj Close,Volume,lrets,MACD,stochastics,ATR,News
1990-01-30,322.980011,186030000.0,-0.00685,-1.512383,0.0,5.900024,"The Moment, Missed; Mr. Bush on Defense: Too..."
1990-02-01,328.790009,154580000.0,-0.000882,-1.185424,18.857509,6.879974,"Samuel C. Phillips, Who Directed Apollo Lunar ..."
1990-02-05,331.850006,130950000.0,0.002806,-0.738707,28.78934,3.369995,Costly Pitfalls in Worker RetrainingMemorial o...
1990-02-08,332.959991,176240000.0,-0.00237,-0.328807,39.060605,4.23999,No End to the ThreatTempest Technologies repor...
1990-02-09,333.619995,146910000.0,0.00198,0.019442,41.643793,2.190002,LAWYER CONVICTED IN ABDUCTION PLOTOil Tanker i...


# Using NLTK sentiment analyzer to generate a polarity score

Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.

https://www.nltk.org/_modules/nltk/sentiment/sentiment_analyzer.html

In [13]:
def sentimentanalysis(df):
    sid = SentimentIntensityAnalyzer()
    for date in df.index:
        try:
            sentence = unicodedata.normalize('NFKD', df.loc[date, 'News'])
            ss = sid.polarity_scores(sentence)
            df.at(date, 'neg') = ss['neg']
            df.at(date, 'neu') = ss['neu']
            df.at(date, 'pos') = ss['pos']
        except TypeError:
            print (df.loc[date, 'News'])
            print (date)
    return df

In [16]:
new_data = sentimentanalysis(data)

  import sys
  
  if __name__ == '__main__':


In [18]:
new_data.drop(['News'], axis=1, inplace=True)

In [19]:
new_data.to_hdf('data2', 'new_data')

# Check if there is missing data

In [20]:
new_data.isnull().any()

Adj Close      False
Volume         False
lrets          False
MACD           False
stochastics    False
ATR            False
neg            False
neu            False
pos            False
dtype: bool