# Apple Stock Price Prediction Based on News and Historical Stock Price Data



# 1. Import Libraries



In [None]:
# general usage
import numpy as np
import pandas as pd
from typing import Tuple

# for data preprocessing
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

# for data modelling
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# for evaluation
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 2. Data Understanding

## Apple Stock Data

• 2301 rows and 7 columns

• Dates:
( 08 June, 2008 to 30 June, 2016)

• Contains some missing days
(e.g. days on which the stock market doesn't open)

Apple stock data downloaded from Yahoo Finance - from June 8, 2008 to June 30, 2016

In [None]:
stock_prices = pd.read_csv("AAPL.csv")
stock_prices.head(2)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2008-06-09,6.599643,6.605,6.276786,6.486071,5.545707,1888392800
1,2008-06-10,6.446786,6.670714,6.393571,6.63,5.66877,1140941200


See if there is any null data in the dataset.

In [None]:
stock_prices.isnull().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

## News Data

Date: Time of the news 
08 June, 2008 to 01 January, 2016

News: News headline 

73607 rows and 2 columns 
(Some news are from the same day)


News data downloaded from Canvas

In [None]:
news = pd.read_csv('RedditNews.csv')
news.head(2)

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host


See if there is any null data in the dataset.

In [None]:
news.isnull().sum()

Date    0
News    0
dtype: int64

# 3. Data Preprocessing

## News Data: Merge the news that are posted in the same day into one row.

In [None]:
aggregation_functions = {'News': ' '.join}
news = news.groupby(news['Date']).aggregate(aggregation_functions)

print(news.iloc[0,0])

b'Nim Chimpsky: The tragedy of the chimp who thought he was a boy (and proved that humans were not humane)' b"Canada: Beware slippery slope' to censorship, hearing told " b'EU Vice-President Luisa Morgantini and the Irish Nobel laureate, Mairead Corrigan, have been tear gased and injured by the IDF while attending the "International Conference on Non-violent Resistance"' b"Israeli minister: Israel will attack Iran if it doesn't abandon its nuclear program" b'Albino Killings in Tanzania. At least 19 albinos, including several young children, have been killed in Tanzania in the past year. [video] ' b'Chiapas: army occupies Zapatista communities in "anti-drug" ops' b'Polar bear swims 200 miles, is shot dead upon arrival' b'News is a contraband item in Pakistan now, and it is being sold on the black market,' b'Albinos, Long Shunned, Face Threat in Tanzania where witch doctors are now marketing albino skin, bones and hair as ingredients in potions that are promised to make people rich.' b'T

## Inner join the stock price dataset and the news dataset.

## Drop data that we are not using.

In [None]:
# join the datasets
base_data = pd.merge(stock_prices, news, on='Date', how='inner')

# delete unnecessary data
base_data.drop(columns=['Adj Close', 'Date'], inplace=True)
del news
del stock_prices

base_data.head(2)

Unnamed: 0,Open,High,Low,Close,Volume,News
0,6.599643,6.605,6.276786,6.486071,1888392800,"b'United States quits Human Rights Council' b""..."
1,6.446786,6.670714,6.393571,6.63,1140941200,"b'Oil shortage a myth, says industry insider' ..."


## Build a Label column.
* 0 if the stock price drops
* 1 otherwise

In [None]:
base_data['Label'] = np.where(base_data['Close'].shift(-1) - base_data['Close'] < 0, 0, 1)
base_data.head(2)

Unnamed: 0,Open,High,Low,Close,Volume,News,Label
0,6.599643,6.605,6.276786,6.486071,1888392800,"b'United States quits Human Rights Council' b""...",1
1,6.446786,6.670714,6.393571,6.63,1140941200,"b'Oil shortage a myth, says industry insider' ...",0


## News Data: Remove punctuations and stopwords, and convert to lowercase.

In [None]:
# create a set to store the punctuations and stopwords
tokens_to_be_removed = set(stopwords.words())
tokens_to_be_removed.update(string.punctuation)

# remove punctuations and stopwords froma news, and convert the news to lowercase
def process_news(news: str) -> str:
    # tokenize the words
    tokens = word_tokenize(news)

    # remove punctuations and stopwords
    tokens = [token for token in tokens if token not in tokens_to_be_removed]

    # join the tokens and convert to lowercase
    return ' '.join(tokens).lower()

# apply process_news to the news
base_data['News'] = base_data['News'].apply(process_news)
base_data['News'].head(2)

0    b'united states quits human rights council b '...
1    b'oil shortage myth industry insider b '' isra...
Name: News, dtype: object

## Use CountVectorizer to vectorize the news data.
We create a function that returns the n-gram vectorized version of dataset.

In [None]:
def data_ngram(n: int) -> pd.DataFrame:
    '''
    Function that returns the dataset.
    The parameter n determines the n-gram range.
    '''

    # vectorize the news data
    countvectorizer = CountVectorizer(ngram_range=(1,n), max_features=100000)
    vectors = countvectorizer.fit_transform(base_data['News']).toarray()
    vectors = pd.DataFrame(vectors, columns=[str(i) for i in range(vectors.shape[1])])

    # return the dataframe that contains the neccessary data
    return pd.concat((base_data[['Open', 'High', 'Low', 'Close', 'Label']], vectors), axis=1, join='inner', copy=False)


## Separate data into training datasets and testing datasets that are standardized by StandardScaler.
We create a function to return the training and testing datasets.

The function should be used like this if you want to get the 4-gram version of dataset:
```
X_train, X_test, y_train, y_test = train_test_datasets(4)
```



In [None]:
def train_test_datasets(n: int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    # get the data
    data = data_ngram(n)

    # split the dataset into train and test datasets with ratio 8 : 2
    train_num = int(0.8 * len(data))
    train = data[:train_num]
    test = data[train_num:]

    # standardize the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train.drop('Label', axis=1))
    X_test = scaler.transform(test.drop('Label', axis=1))

    # get the label
    y_train = train['Label'].to_numpy()
    y_test = test['Label'].to_numpy()

    return X_train, X_test, y_train, y_test

# 4. Modelling

We are using logistic regression model together with the 1-gram,2-gram,3-gram and 4-gram model respectively.

Modelling(1-gram)

Logistic Regression

In [None]:
# 1-gram model training
X_train, X_test, y_train, y_test = train_test_datasets(1)

# model 1: logistic regression
log_md = LogisticRegression()
log_md = log_md.fit(X_train, y_train)

# 2-gram model prediction
predictions = log_md.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.52      0.33      0.40       211
           1       0.48      0.67      0.56       196

    accuracy                           0.49       407
   macro avg       0.50      0.50      0.48       407
weighted avg       0.50      0.49      0.48       407



XGBoost

In [None]:
# XGBoost Model
org_gbm = XGBClassifier()
org_gbm.fit(X_train,y_train)
y_pred = org_gbm.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.54      0.41      0.47       211
           1       0.50      0.63      0.56       196

    accuracy                           0.52       407
   macro avg       0.52      0.52      0.51       407
weighted avg       0.52      0.52      0.51       407



Fine-tuned logistic regression

In [None]:
log=LogisticRegression()
logis_reg_cv = GridSearchCV(log,{'penalty': ['l2','13'], 'C': [0.0001,0.001]})
logis_reg_cv.fit(X_train,y_train)

10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    % (all_penalties, penalty)
ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got 13.



GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [0.0001, 0.001], 'penalty': ['l2', '13']})

In [None]:
logis_reg_cv.best_params_

{'C': 0.0001, 'penalty': 'l2'}

In [None]:
logis_reg_para = LogisticRegression(**logis_reg_cv.best_params_) 
logis_reg_para.fit(X_train, y_train)

logis_reg_predictions_para = logis_reg_para.predict(X_test)
print(classification_report(logis_reg_predictions_para, y_test))

              precision    recall  f1-score   support

           0       0.20      0.49      0.28        85
           1       0.78      0.48      0.59       322

    accuracy                           0.48       407
   macro avg       0.49      0.48      0.44       407
weighted avg       0.66      0.48      0.53       407



Model(2-grams)

Logistic Regression

In [None]:
# 2-gram model training
X_train, X_test, y_train, y_test = train_test_datasets(2)

# model 1: logistic regression
log_md = LogisticRegression()
log_md = log_md.fit(X_train, y_train)

# 2-gram model prediction
predictions = log_md.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.53      0.28      0.37       211
           1       0.48      0.73      0.58       196

    accuracy                           0.50       407
   macro avg       0.51      0.50      0.47       407
weighted avg       0.51      0.50      0.47       407



XGBoost

In [None]:
# model 2: XGBoost Model
org_gbm = XGBClassifier()
org_gbm.fit(X_train,y_train)
y_pred = org_gbm.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.55      0.42      0.48       211
           1       0.50      0.62      0.55       196

    accuracy                           0.52       407
   macro avg       0.52      0.52      0.52       407
weighted avg       0.52      0.52      0.51       407



Fine-tuned logistic regression

In [None]:
log=LogisticRegression()
logis_reg_cv = GridSearchCV(log,{'penalty': ['l2','13'], 'C': [0.1,1,10]})
logis_reg_cv.fit(X_train,y_train)

15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    % (all_penalties, penalty)
ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got 13.



GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [0.1, 1, 10], 'penalty': ['l2', '13']})

In [None]:
logis_reg_cv.best_params_

{'C': 0.1, 'penalty': 'l2'}

In [None]:
logis_reg_para = LogisticRegression(**logis_reg_cv.best_params_) 
logis_reg_para.fit(X_train, y_train)

logis_reg_predictions_para = logis_reg_para.predict(X_test)
print(classification_report(logis_reg_predictions_para, y_test))

              precision    recall  f1-score   support

           0       0.28      0.54      0.37       110
           1       0.74      0.49      0.59       297

    accuracy                           0.50       407
   macro avg       0.51      0.51      0.48       407
weighted avg       0.62      0.50      0.53       407



Model(3-grams)

Logistic regression

In [None]:
# 3-gram model training
X_train, X_test, y_train, y_test = train_test_datasets(3)

# model 1: logistic regression
log_md = LogisticRegression()
log_md = log_md.fit(X_train, y_train)

# 2-gram model prediction
predictions = log_md.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.57      0.29      0.39       211
           1       0.50      0.77      0.61       196

    accuracy                           0.52       407
   macro avg       0.54      0.53      0.50       407
weighted avg       0.54      0.52      0.49       407



XGBoost

In [None]:
# XGBoost Model
org_gbm = XGBClassifier()
org_gbm.fit(X_train,y_train)
y_pred = org_gbm.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.55      0.42      0.48       211
           1       0.50      0.62      0.55       196

    accuracy                           0.52       407
   macro avg       0.52      0.52      0.52       407
weighted avg       0.52      0.52      0.51       407



Model(4-gram)

Logistic regression

In [None]:
# 4-gram model training
X_train, X_test, y_train, y_test = train_test_datasets(4)

# model 1: logistic regression
log_md = LogisticRegression()
log_md = log_md.fit(X_train, y_train)

# 2-gram model prediction
predictions = log_md.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.54      0.30      0.39       211
           1       0.49      0.72      0.59       196

    accuracy                           0.51       407
   macro avg       0.52      0.51      0.49       407
weighted avg       0.52      0.51      0.48       407



# Findings
* XGBoost is more accurate than logistic regression in our case.
* 3-gram with XGBoost is better than other models in our case.