In [5]:
!pip install pandas scikit-learn tensorflow --upgrade keras nltk gensim transformers

Defaulting to user installation because normal site-packages is not writeable


In [6]:
import os
import re

import gensim
import numpy as np
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from nltk.tokenize.treebank import TreebankWordDetokenizer
import tensorflow as tf


In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/alexa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
df = pd.read_csv("data/amazon.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1444 entries, 0 to 1443
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          1444 non-null   int64  
 1   Дата                1067 non-null   object 
 2   Время               1067 non-null   object 
 3   Цена до             1067 non-null   float64
 4   Цена после          1067 non-null   float64
 5   Разница в долларах  1067 non-null   float64
 6   Дельта в процентах  1067 non-null   float64
 7   Текст новости       1067 non-null   object 
dtypes: float64(4), int64(1), object(3)
memory usage: 90.4+ KB


In [9]:
for root, _, files in os.walk("data"):
    for filename in files:
        temp_df = pd.read_csv(os.path.join(root, filename))
        df = pd.concat([df, temp_df], axis=0, sort=False)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18447 entries, 0 to 668
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          18447 non-null  int64  
 1   Дата                10873 non-null  object 
 2   Время               10873 non-null  object 
 3   Цена до             10873 non-null  float64
 4   Цена после          10873 non-null  float64
 5   Разница в долларах  10873 non-null  float64
 6   Дельта в процентах  10873 non-null  float64
 7   Текст новости       10873 non-null  object 
dtypes: float64(4), int64(1), object(3)
memory usage: 1.3+ MB


In [10]:
df = df.drop_duplicates(subset=["Текст новости"], keep=False)
df.drop(columns=[df.columns[0], "Разница в долларах", "Дельта в процентах"], axis=1, inplace=True)
df.rename(columns={"Цена до": "price_before", "Цена после": "price_after", "Дата": "date", "Время": "Time", "Текст новости": "news_text"}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3593 entries, 0 to 667
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          3593 non-null   object 
 1   Time          3593 non-null   object 
 2   price_before  3593 non-null   float64
 3   price_after   3593 non-null   float64
 4   news_text     3593 non-null   object 
dtypes: float64(2), object(3)
memory usage: 168.4+ KB


In [11]:
df["absolute_price_difference"] = df["price_after"] - df["price_before"]
df["percentage_price_difference"] = df["absolute_price_difference"] / df["price_before"] * 100
df["price_change_direction"] = np.where(df["absolute_price_difference"] > 0, 1, -1)
df.head()

Unnamed: 0,date,Time,price_before,price_after,news_text,absolute_price_difference,percentage_price_difference,price_change_direction
0,2022-10-14,14:10:32,471.454987,468.269989,Check Out What Whales Are Doing With NOC\n,-3.184998,-0.675568,-1
1,2022-10-14,11:38:17,475.109985,472.109985,What 7 Analyst Ratings Have To Say About North...,-3.0,-0.631433,-1
2,2022-10-14,11:25:32,476.359985,475.109985,7 Analysts Have This to Say About Northrop Gru...,-1.25,-0.262407,-1
3,2022-10-14,10:37:16,476.359985,475.109985,"Benzinga's Top Ratings Upgrades, Downgrades Fo...",-1.25,-0.262407,-1
5,2022-10-10,15:21:32,505.105011,506.994995,Looking Into Northrop Grumman's Recent Short I...,1.889984,0.374176,1


In [12]:
df[df["price_change_direction"] == 1].count()

date                           1793
Time                           1793
price_before                   1793
price_after                    1793
news_text                      1793
absolute_price_difference      1793
percentage_price_difference    1793
price_change_direction         1793
dtype: int64

In [13]:
df[df["price_change_direction"] == -1].count()

date                           1800
Time                           1800
price_before                   1800
price_after                    1800
news_text                      1800
absolute_price_difference      1800
percentage_price_difference    1800
price_change_direction         1800
dtype: int64

In [14]:
news_df = pd.concat([df["news_text"], df["price_change_direction"]], axis=1)
news_df.head()

Unnamed: 0,news_text,price_change_direction
0,Check Out What Whales Are Doing With NOC\n,-1
1,What 7 Analyst Ratings Have To Say About North...,-1
2,7 Analysts Have This to Say About Northrop Gru...,-1
3,"Benzinga's Top Ratings Upgrades, Downgrades Fo...",-1
5,Looking Into Northrop Grumman's Recent Short I...,1


In [15]:
X = news_df["news_text"].values
y = news_df["price_change_direction"].values

In [16]:
processed_features = []

for sentence in range(len(X)):
    processed_feature = re.sub(r'\W', ' ', str(X[sentence]))

    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature)

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to Lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

In [17]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [18]:
data_words = list(sent_to_words(processed_features))

print(data_words[:10])

[['check', 'out', 'what', 'whales', 'are', 'doing', 'with', 'noc'], ['what', 'analyst', 'ratings', 'have', 'to', 'say', 'about', 'northrop', 'grumman'], ['analysts', 'have', 'this', 'to', 'say', 'about', 'northrop', 'grumman'], ['benzinga', 'top', 'ratings', 'upgrades', 'downgrades', 'for', 'october'], ['looking', 'into', 'northrop', 'grumman', 'recent', 'short', 'interest'], ['this', 'is', 'what', 'whales', 'are', 'betting', 'on', 'northrop', 'grumman'], ['here', 'how', 'much', 'invested', 'in', 'northrop', 'grumman', 'years', 'ago', 'would', 'be', 'worth', 'today'], ['uber', 'nextera', 'are', 'cnbc', 'fast', 'money', 'mentions', 'and', 'this', 'struggling', 'sector', 'is', 'tuesday', 'call', 'of', 'the', 'day'], ['what', 'analyst', 'ratings', 'have', 'to', 'say', 'about', 'northrop', 'grumman'], ['northrop', 'grumman', 'ex', 'dividend', 'date', 'is', 'friday', 'here', 'what', 'you', 'need', 'to', 'know']]


In [19]:
def detokenize(text):
    return TreebankWordDetokenizer().detokenize(text)

In [20]:
data = []
for i in range(len(data_words)):
    data.append(detokenize(data_words[i]))
print(data[:5])

['check out what whales are doing with noc', 'what analyst ratings have to say about northrop grumman', 'analysts have this to say about northrop grumman', 'benzinga top ratings upgrades downgrades for october', 'looking into northrop grumman recent short interest']


In [21]:
labels = np.array(news_df['price_change_direction'])
y = []
for i in range(len(labels)):
    if labels[i] == -1:
        y.append(0)
    if labels[i] == 1:
        y.append(1)
y = np.array(y)
labels = tf.keras.utils.to_categorical(y, 2, dtype="float32")
del y

In [22]:
len(labels)

3593

In [23]:
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

max_words = 10000
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
news = pad_sequences(sequences, maxlen=max_len)
print(news)
news

[[   0    0    0 ...  349   16 2867]
 [   0    0    0 ...   47  226  227]
 [   0    0    0 ...   47  226  227]
 ...
 [   0    0    0 ...   24   49  447]
 [   0    0    0 ...  137  820 1089]
 [   0    0    0 ...   49    2  285]]


array([[   0,    0,    0, ...,  349,   16, 2867],
       [   0,    0,    0, ...,   47,  226,  227],
       [   0,    0,    0, ...,   47,  226,  227],
       ...,
       [   0,    0,    0, ...,   24,   49,  447],
       [   0,    0,    0, ...,  137,  820, 1089],
       [   0,    0,    0, ...,   49,    2,  285]], dtype=int32)

In [24]:
print(labels)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [0. 1.]]


In [25]:
#  Split data to train and test sets

X_train, X_test, y_train, y_test = train_test_split(news ,labels, test_size=0.2, shuffle = True, random_state=0)
print (len(X_train),len(X_test),len(y_train),len(y_test))


2874 719 2874 719


In [26]:
from keras.layers import BatchNormalization

# Neural network without LSTM

model0 = Sequential()

model0.add(layers.Dense(64, activation="relu"))
model0.add(BatchNormalization())
model0.add(layers.Dropout(0.8))
model0.add(layers.Dense(32, activation="relu"))
model0.add(layers.Dropout(0.5))
model0.add(layers.Dense(32, activation="relu"))
model0.add(layers.Dropout(0.3))
model0.add(layers.Dense(16, activation="relu"))
model0.add(layers.Dense(2, activation="sigmoid"))

In [27]:
model0.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

checkpoint0 = ModelCheckpoint("best_model0.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1, save_weights_only=False)

history = model0.fit(X_train, y_train,
                     validation_data=(X_test, y_test),
                     verbose=2,
                     epochs=70,
                     batch_size=32,
                     shuffle=True,
                     callbacks=[checkpoint0])

Epoch 1/70

Epoch 1: val_accuracy improved from -inf to 0.53129, saving model to best_model0.hdf5
90/90 - 1s - loss: 0.9451 - accuracy: 0.4986 - val_loss: 0.6936 - val_accuracy: 0.5313 - 1s/epoch - 15ms/step
Epoch 2/70

Epoch 2: val_accuracy improved from 0.53129 to 0.56050, saving model to best_model0.hdf5
90/90 - 0s - loss: 0.7902 - accuracy: 0.4892 - val_loss: 0.6863 - val_accuracy: 0.5605 - 160ms/epoch - 2ms/step
Epoch 3/70

Epoch 3: val_accuracy did not improve from 0.56050
90/90 - 0s - loss: 0.7446 - accuracy: 0.5066 - val_loss: 0.6894 - val_accuracy: 0.5382 - 149ms/epoch - 2ms/step
Epoch 4/70

Epoch 4: val_accuracy did not improve from 0.56050
90/90 - 0s - loss: 0.7327 - accuracy: 0.4979 - val_loss: 0.6914 - val_accuracy: 0.5063 - 132ms/epoch - 1ms/step
Epoch 5/70

Epoch 5: val_accuracy did not improve from 0.56050
90/90 - 0s - loss: 0.7189 - accuracy: 0.5052 - val_loss: 0.6929 - val_accuracy: 0.5090 - 131ms/epoch - 1ms/step
Epoch 6/70

Epoch 6: val_accuracy did not improve from

In [28]:
# Neural network creation
model1 = Sequential()

model1.add(layers.Embedding(max_words, 20))
model1.add(layers.SpatialDropout1D(0.2))
model1.add(layers.LSTM(30, dropout=0.2, recurrent_dropout=0.2))

model1.add(layers.Dense(2, activation='sigmoid'))

In [29]:
model1.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])

#Implementing model checkpoins to save the best metric and do not lose it on training.
checkpoint1 = ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1, save_weights_only=False)

history = model1.fit(X_train, y_train, epochs=70,
                     batch_size=32,
                     verbose=2,
                     validation_data=(X_test, y_test),callbacks=[checkpoint1])

Epoch 1/70

Epoch 1: val_accuracy improved from -inf to 0.48122, saving model to best_model1.hdf5
90/90 - 9s - loss: 0.6934 - accuracy: 0.5084 - val_loss: 0.6941 - val_accuracy: 0.4812 - 9s/epoch - 102ms/step
Epoch 2/70

Epoch 2: val_accuracy improved from 0.48122 to 0.48818, saving model to best_model1.hdf5
90/90 - 7s - loss: 0.6834 - accuracy: 0.5932 - val_loss: 0.6955 - val_accuracy: 0.4882 - 7s/epoch - 80ms/step
Epoch 3/70

Epoch 3: val_accuracy did not improve from 0.48818
90/90 - 7s - loss: 0.5822 - accuracy: 0.7039 - val_loss: 0.7645 - val_accuracy: 0.4812 - 7s/epoch - 79ms/step
Epoch 4/70

Epoch 4: val_accuracy improved from 0.48818 to 0.48957, saving model to best_model1.hdf5
90/90 - 7s - loss: 0.4192 - accuracy: 0.8065 - val_loss: 0.9007 - val_accuracy: 0.4896 - 7s/epoch - 79ms/step
Epoch 5/70

Epoch 5: val_accuracy improved from 0.48957 to 0.50487, saving model to best_model1.hdf5
90/90 - 7s - loss: 0.3296 - accuracy: 0.8469 - val_loss: 1.0391 - val_accuracy: 0.5049 - 7s/epo

In [30]:
model2 = Sequential()
model2.add(layers.Embedding(max_words, 15, input_length=max_len))
model2.add(layers.Bidirectional(layers.LSTM(10,dropout=0.6)))
model2.add(layers.Dense(2, activation='sigmoid'))
model2.compile(optimizer='rmsprop',loss='categorical_crossentropy', metrics=['accuracy'])

2023-05-08 19:57:38.846031: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-08 19:57:38.847216: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-08 19:57:38.848098: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [31]:
checkpoint2 = ModelCheckpoint("best_model2.hdf5", monitor='accuracy', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model2.fit(X_train, y_train, epochs=70,validation_data=(X_test, y_test),callbacks=[checkpoint2])

Epoch 1/70


2023-05-08 19:57:39.235893: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-08 19:57:39.237253: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-08 19:57:39.238228: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-05-08 19:57:45.162290: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-08 19:57:45.163484: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-08 19:57:45.164392: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus


Epoch 1: accuracy improved from -inf to 0.51705, saving model to best_model2.hdf5
Epoch 2/70
Epoch 2: accuracy improved from 0.51705 to 0.53097, saving model to best_model2.hdf5
Epoch 3/70
Epoch 3: accuracy improved from 0.53097 to 0.55324, saving model to best_model2.hdf5
Epoch 4/70
Epoch 4: accuracy improved from 0.55324 to 0.59290, saving model to best_model2.hdf5
Epoch 5/70
Epoch 5: accuracy improved from 0.59290 to 0.63048, saving model to best_model2.hdf5
Epoch 6/70
Epoch 6: accuracy improved from 0.63048 to 0.66458, saving model to best_model2.hdf5
Epoch 7/70
Epoch 7: accuracy improved from 0.66458 to 0.67711, saving model to best_model2.hdf5
Epoch 8/70
Epoch 8: accuracy improved from 0.67711 to 0.70390, saving model to best_model2.hdf5
Epoch 9/70
Epoch 9: accuracy improved from 0.70390 to 0.71816, saving model to best_model2.hdf5
Epoch 10/70
Epoch 10: accuracy improved from 0.71816 to 0.74008, saving model to best_model2.hdf5
Epoch 11/70
Epoch 11: accuracy improved from 0.7400

In [32]:
best_model = tf.keras.models.load_model("best_model0.hdf5")

In [33]:
test_loss, test_acc = best_model.evaluate(X_test, y_test, verbose=2)
print('Model accuracy: ',test_acc)

23/23 - 0s - loss: 0.6863 - accuracy: 0.5605 - 122ms/epoch - 5ms/step
Model accuracy:  0.5605006814002991
