## Dow Jones Industrial Average (DJIA) index movement prediction using Daily News from Reddit and past Historical Data

### Name: Aristotelis-Angelos Papadopoulos
### USC ID: 3804-2945-23

In [100]:
# Import the required libraries: numerical arrays (numpy), dataframes (pandas),
# plotting (matplotlib) and the ROC / AUC helpers from scikit-learn.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# NOTE(review): `Nan` is not referenced in the cells visible here -- confirm it
# is still needed further down the notebook.
from numpy import nan as Nan
# Suppress all warnings for the rest of the session. Be aware that this also
# hides deprecation notices emitted by the libraries imported above.
import warnings
warnings.filterwarnings("ignore")

## 1. Import Data and Preprocessing

In [82]:
# Load the DJIA price history into a dataframe.
# The file is expected to provide the 'Date', 'Open', 'High', 'Low', 'Close',
# 'Volume' and 'Adj Close' (adjusted close) values of the index, covering the
# period from 2008 up to 2016.
stock_data = pd.read_csv(filepath_or_buffer='DJIA_table.csv', sep=',')

To handle the Reddit news data, we apply preprocessing techniques borrowed from the NLP literature, which allow us to extract the 'Subjectivity' and the 'Objectivity' of the news as well as the 'Positive', 'Neutral' or 'Negative' sentiment that the news carries for the stock price movement. For more details regarding this preprocessing technique, please check the following GitHub [link](https://github.com/AristotelisPap/Stock-Price-Prediction-Model/tree/master/Sentence_Polarity).

In [83]:
# Load the Reddit news together with the sentiment features extracted from it.
# parse_dates=[0] asks pandas to parse the first column of the CSV as dates.
# The 'Unnamed: 0' column is not needed afterwards, so it is discarded.
news_data = pd.read_csv('combined_stock_data_proc.csv', parse_dates=[0])
news_data = news_data.drop('Unnamed: 0', axis=1)

Now, I am going to merge the 'Subjectivity', 'Objectivity', 'Positive', 'Neutral' and 'Negative' features of the news_data dataframe with the stock_data dataframe.

In [240]:
# Create a dataframe by merging the news sentiment features with the stock
# prices on their shared 'Date' column. The join is an inner join, so only the
# dates present in both sources are kept, in the row order of the news data.
news_feature_columns = ["Date", "Subjectivity", "Objectivity", "Positive", "Neutral", "Negative", "Label"]
news_data_sub = news_data.loc[:, news_feature_columns]

# The previous call passed both on='Date' and left_index=True. Current pandas
# rejects that combination with a MergeError. Legacy pandas accepted it and
# labelled every merged row with the index label of the matching stock_data
# row (NOTE(review): verify against the pandas version this notebook was
# authored with). Those labels are reproduced explicitly here so that any
# label-based lookup on `dataset` behaves as before.
stock_data_labelled = stock_data.assign(_stock_row_label=stock_data.index)
dataset = (
    news_data_sub
    .merge(stock_data_labelled, how='inner', on='Date')
    .set_index('_stock_row_label')
)
dataset.index.name = None

Now, for each day, we are going to take into consideration the 'Open', 'High', 'Low' and 'Close' prices of the stock during the past 4 days! Moreover, we will also take into consideration the movement of the index during those days (whether it increased or decreased). Note that we are going to discard the first 4 data points of the dataset in order to avoid missing values when taking into account the prices of the index for the last 4 days! So, let's modify our dataset accordingly!

In [241]:
# Lagged features: for every day we collect the 'Open', 'High', 'Low', 'Close'
# prices and the 'Label' (index movement) of each of the previous 4 rows.
#
# Rows are selected by position instead of by hard-coded index labels. The
# former range(1985, 0, -1) style lookups were only valid for a frame of exactly
# 1989 rows whose labels run from 1988 down to 0; positional slicing yields the
# same rows in that case and keeps working for any other length or labelling.
# `dataset` is assumed to be ordered by ascending date -- TODO confirm.
LAG_COLUMNS = ["Open", "High", "Low", "Close", "Label"]
MAX_LAG = 4

if len(dataset) <= MAX_LAG:
    raise ValueError("dataset needs more than %d rows to build the lagged features" % MAX_LAG)


def _lagged_prices(frame, lag, max_lag=MAX_LAG):
    """Return the LAG_COLUMNS of `frame` taken `lag` rows earlier.

    Row i of the result holds the values found `lag` rows before row
    ``max_lag + i`` of `frame`, so the result lines up one-to-one with
    ``frame.iloc[max_lag:]``.
    """
    return frame.iloc[max_lag - lag:len(frame) - lag][LAG_COLUMNS]


# Prices 1 day ago
prices_1_day_ago = _lagged_prices(dataset, 1)

# Prices 2 days ago
prices_2_days_ago = _lagged_prices(dataset, 2)

# Prices 3 days ago
prices_3_days_ago = _lagged_prices(dataset, 3)

# Prices 4 days ago
prices_4_days_ago = _lagged_prices(dataset, 4)

# Now, we can safely drop the first 4 data points of our dataset, since they
# do not have a complete 4-day history.
dataset = dataset.iloc[MAX_LAG:]

In [242]:
# The current-day frame and the four lagged frames carry different index
# labels, so they are glued together positionally: each frame is turned into a
# numpy array and the arrays are joined column-wise.

dataset_array = dataset.values
(
    prices_1_day_ago_array,
    prices_2_days_ago_array,
    prices_3_days_ago_array,
    prices_4_days_ago_array,
) = (
    lagged_frame.values
    for lagged_frame in (prices_1_day_ago, prices_2_days_ago, prices_3_days_ago, prices_4_days_ago)
)

# Join everything side by side in a single call (current day first, then the
# lags in increasing order).
dataset_array = np.concatenate(
    (
        dataset_array,
        prices_1_day_ago_array,
        prices_2_days_ago_array,
        prices_3_days_ago_array,
        prices_4_days_ago_array,
    ),
    axis=1,
)

# Wrap the combined array back into a dataframe (columns are named later).
dataset = pd.DataFrame(dataset_array)

In [243]:
# Name the columns of the dataframe: first the 13 current-day columns, then,
# for each of the 4 lags, the lagged 'Open', 'High', 'Low', 'Close' and the
# lagged index movement.
current_day_columns = ['Date', 'Subjectivity', 'Objectivity', 'Positive', 'Neutral', 'Negative', 'Label',
                       'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

lagged_columns = []
for lag in range(1, 5):
    # Singular wording for the first lag, plural for the others.
    suffix = '1 day ago' if lag == 1 else '{} days ago'.format(lag)
    for field in ('Open', 'High', 'Low', 'Close', 'Movement'):
        lagged_columns.append('{} {}'.format(field, suffix))

dataset.columns = current_day_columns + lagged_columns

In [244]:
# Final preprocessing steps for the dataset.

# The same-day 'High', 'Low', 'Close', 'Volume' and 'Adj Close' values are not
# known at the time of the prediction, so they are removed from every data
# point. The same-day 'Open' is kept because it is assumed to be known at the
# time of the prediction.
dataset = dataset.drop(['High', 'Low', 'Close', 'Volume', 'Adj Close'], axis=1)

# As a last step, move the target ('Label') into the last column.
cols = list(dataset)
print(cols)  # column order before the move
cols.append(cols.pop(cols.index('Label')))
# DataFrame.ix was deprecated in pandas 0.20 and removed in pandas 1.0;
# .loc performs the same label-based column selection.
dataset = dataset.loc[:, cols]
dataset

Unnamed: 0,Date,Subjectivity,Objectivity,Positive,Neutral,Negative,Label,Open,Open 1 day ago,High 1 day ago,...,Open 3 days ago,High 3 days ago,Low 3 days ago,Close 3 days ago,Movement 3 days ago,Open 4 days ago,High 4 days ago,Low 4 days ago,Close 4 days ago,Movement 4 days ago
0,2008-08-14,45.4545,54.5455,36.3636,54.5455,9.09091,1,11532.1,11632.8,11633.8,...,11729.7,11867.1,11675.5,11782.3,1,11432.1,11760,11388,11734.3,0
1,2008-08-15,70,30,10,30,60,1,11611.2,11532.1,11718.3,...,11781.7,11782.3,11601.5,11642.5,0,11729.7,11867.1,11675.5,11782.3,1
2,2008-08-18,100,0,0,0,100,0,11659.7,11611.2,11709.9,...,11632.8,11633.8,11453.3,11533,0,11781.7,11782.3,11601.5,11642.5,0
3,2008-08-19,22.2222,77.7778,22.2222,77.7778,0,0,11478.1,11659.7,11690.4,...,11532.1,11718.3,11450.9,11615.9,1,11632.8,11633.8,11453.3,11533,0
4,2008-08-20,70,30,10,30,60,1,11345.9,11478.1,11478.2,...,11611.2,11709.9,11599.7,11659.9,1,11532.1,11718.3,11450.9,11615.9,1
5,2008-08-21,50,50,20,50,30,1,11415.2,11345.9,11454.2,...,11659.7,11690.4,11434.1,11479.4,0,11611.2,11709.9,11599.7,11659.9,1
6,2008-08-22,50,50,0,50,50,1,11426.8,11415.2,11476.2,...,11478.1,11478.2,11318.5,11348.5,0,11659.7,11690.4,11434.1,11479.4,0
7,2008-08-25,55.5556,44.4444,22.2222,44.4444,33.3333,0,11626.2,11426.8,11632.1,...,11345.9,11454.2,11290.6,11417.4,1,11478.1,11478.2,11318.5,11348.5,0
8,2008-08-26,66.6667,33.3333,0,33.3333,66.6667,1,11383.6,11626.2,11626.3,...,11415.2,11476.2,11315.6,11430.2,1,11345.9,11454.2,11290.6,11417.4,1
9,2008-08-27,30.7692,69.2308,30.7692,69.2308,0,1,11412.5,11383.6,11436.2,...,11426.8,11632.1,11426.8,11628.1,1,11415.2,11476.2,11315.6,11430.2,1
