# Import modules

In [2]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from nltk.sentiment import SentimentAnalyzer
import unicodedata
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import seaborn as sns
%matplotlib inline



# Import CSV file with all technical data and news

In [95]:
# Load the combined technical-indicator / news table; the first CSV column
# becomes the index. The file is read as ISO-8859-1 because decoding it with
# the default codec raises UnicodeDecodeError.
data = pd.read_csv(
    "alldata.csv",
    encoding="ISO-8859-1",
    index_col=0,
)

In [96]:
# Preview the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,lrets,MACD,stochastics,ATR,News
1950-01-24,16.860001,16.860001,16.860001,16.860001,16.860001,1250000.0,-0.003552,0.002636,45.238333,0.059999,
1950-01-25,16.74,16.74,16.74,16.74,16.74,1700000.0,-0.007143,-0.003952,16.666667,0.120001,
1950-01-26,16.73,16.73,16.73,16.73,16.73,1150000.0,-0.000598,-0.008599,14.285714,0.01,
1950-01-27,16.82,16.82,16.82,16.82,16.82,1250000.0,0.005365,-0.006366,35.714286,0.09,
1950-01-30,17.02,17.02,17.02,17.02,17.02,1640000.0,0.01182,0.006641,83.333333,0.2,


# Remove rows dated before the first news entry (1990-01-02)

In [97]:
# Keep only rows from 1990-01-02 onward; earlier rows carry no news text.
# `.loc` performs the same inclusive label slice as the old `.ix` indexer,
# which was deprecated in pandas 0.20 and removed in pandas 1.0.
data = data.loc["1990-01-02":]
data.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,lrets,MACD,stochastics,ATR,News
1990-01-02,353.399994,359.690002,351.980011,359.690002,359.690002,162070000.0,0.017642,1.018527,100.0,7.709991,Guest Supply Inc reports earnings for Qtr to S...
1990-01-03,359.690002,360.589996,357.890015,358.76001,358.76001,192330000.0,-0.002589,1.392774,94.602487,2.699981,"For Judaism's Remnant, Coup Is Mixed BlessingP..."
1990-01-04,358.76001,358.76001,352.890015,355.670013,355.670013,177000000.0,-0.00865,1.377343,76.66868,5.869995,Group W Sports GainsCooney's Common Denominato...
1990-01-05,355.670013,355.670013,351.350006,352.200012,352.200012,158530000.0,-0.009804,1.058777,56.529395,4.320007,President Wins Bipartisan Praise For Solution ...
1990-01-06,,,,,,,,,,,CorrectionsFrom Jewish Eden to EmbarrassmentAw...


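# Remove holidays
Rows containing any NaN value (days with news but no market data) are dropped here. Alternatively, feel free to forward fill the NaN values instead of dropping those rows.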

In [98]:
# Drop every row that has at least one NaN (days with news but no prices).
# Reassigning instead of using inplace=True avoids mutating a frame that was
# itself produced by slicing, which can trigger SettingWithCopyWarning.
data = data.dropna(axis=0)
data.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,lrets,MACD,stochastics,ATR,News
1990-01-02,353.399994,359.690002,351.980011,359.690002,359.690002,162070000.0,0.017642,1.018527,100.0,7.709991,Guest Supply Inc reports earnings for Qtr to S...
1990-01-03,359.690002,360.589996,357.890015,358.76001,358.76001,192330000.0,-0.002589,1.392774,94.602487,2.699981,"For Judaism's Remnant, Coup Is Mixed BlessingP..."
1990-01-04,358.76001,358.76001,352.890015,355.670013,355.670013,177000000.0,-0.00865,1.377343,76.66868,5.869995,Group W Sports GainsCooney's Common Denominato...
1990-01-05,355.670013,355.670013,351.350006,352.200012,352.200012,158530000.0,-0.009804,1.058777,56.529395,4.320007,President Wins Bipartisan Praise For Solution ...
1990-01-08,352.200012,354.23999,350.540009,353.790009,353.790009,140110000.0,0.004504,0.898704,65.757462,3.699981,"Reviews/Dance; 'Cortege Hongrois,' Big and C..."


# Remove the Open, High, Low and Close columns

In [99]:
# Discard the raw OHLC price columns in place; 'Adj Close' stays in the frame.
data.drop(
    labels=['Open', 'High', 'Low', 'Close'],
    axis='columns',
    inplace=True,
)
data.head(n=5)

Unnamed: 0,Adj Close,Volume,lrets,MACD,stochastics,ATR,News
1990-01-02,359.690002,162070000.0,0.017642,1.018527,100.0,7.709991,Guest Supply Inc reports earnings for Qtr to S...
1990-01-03,358.76001,192330000.0,-0.002589,1.392774,94.602487,2.699981,"For Judaism's Remnant, Coup Is Mixed BlessingP..."
1990-01-04,355.670013,177000000.0,-0.00865,1.377343,76.66868,5.869995,Group W Sports GainsCooney's Common Denominato...
1990-01-05,352.200012,158530000.0,-0.009804,1.058777,56.529395,4.320007,President Wins Bipartisan Praise For Solution ...
1990-01-08,353.790009,140110000.0,0.004504,0.898704,65.757462,3.699981,"Reviews/Dance; 'Cortege Hongrois,' Big and C..."


# Using the NLTK VADER sentiment analyzer to generate polarity scores

https://www.nltk.org/_modules/nltk/sentiment/sentiment_analyzer.html

In [100]:
def sentimentanalysis(df, analyzer=None):
    """Add polarity-score columns ``neg``, ``neu`` and ``pos`` to ``df``.

    Each value in the ``News`` column is NFKD-normalised and passed to the
    analyzer's ``polarity_scores``; the ``neg``/``neu``/``pos`` entries of the
    returned mapping are stored in same-named columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``News`` column. It is modified in place.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with ``neg``, ``neu`` and ``pos`` keys. Defaults to a fresh
        ``SentimentIntensityAnalyzer``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (not a copy).

    Notes
    -----
    Rows whose news value raises ``TypeError`` (for example NaN instead of
    text) are reported with ``print`` and receive NaN scores.
    """
    sid = SentimentIntensityAnalyzer() if analyzer is None else analyzer
    score_names = ('neg', 'neu', 'pos')
    missing = float('nan')
    scores = dict((name, []) for name in score_names)

    # Walk the rows positionally instead of looking each one up by label:
    # a label lookup returns a Series when index labels repeat, which would
    # make every duplicated date fail with TypeError and be skipped.
    for date, news in zip(df.index, df['News']):
        try:
            sentence = unicodedata.normalize('NFKD', news)
            ss = sid.polarity_scores(sentence)
        except TypeError:
            # Non-text entry: report it and leave its scores as NaN.
            print(news)
            print(date)
            ss = dict.fromkeys(score_names, missing)
        for name in score_names:
            scores[name].append(ss[name])

    # Assign each column once. This replaces DataFrame.set_value (removed in
    # pandas 1.0) and avoids enlarging the frame one cell at a time.
    for name in score_names:
        df[name] = scores[name]
    return df

In [101]:
# Score every news entry. sentimentanalysis returns the frame it was given,
# so new_data and data refer to the same object from here on.
new_data = sentimentanalysis(df=data)

In [102]:
# The raw text is no longer needed once the polarity columns exist;
# remove the 'News' column from the frame in place.
del new_data['News']

In [106]:
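# Optional: uncomment to persist the final feature frame to disk (left disabled so re-running the notebook does not overwrite the file).
#new_data.to_pickle('Final_df.pkl')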

# Check if there is missing data

In [104]:
# Per-column flag: True if the column contains at least one missing value.
# NOTE(review): the recorded output lists a 'Y' column that no visible cell
# creates -- confirm where it is added before relying on Restart & Run All.
pd.isnull(new_data).any()

Adj Close      False
Volume         False
lrets          False
MACD           False
stochastics    False
ATR            False
neg            False
neu            False
pos            False
Y              False
dtype: bool