In [3]:
!pip install pandas scikit-learn tensorflow --upgrade keras imbalanced-learn nltk gensim

You should consider upgrading via the '/home/alexa/codingTime/share_prices_prediction/venv/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
import os
import re

import gensim
import numpy as np
import pandas as pd
import nltk
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
import tensorflow as tf


In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/alexa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
df = pd.read_csv("data/amazon.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1444 entries, 0 to 1443
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          1444 non-null   int64  
 1   Дата                1067 non-null   object 
 2   Время               1067 non-null   object 
 3   Цена до             1067 non-null   float64
 4   Цена после          1067 non-null   float64
 5   Разница в долларах  1067 non-null   float64
 6   Дельта в процентах  1067 non-null   float64
 7   Текст новости       1067 non-null   object 
dtypes: float64(4), int64(1), object(3)
memory usage: 90.4+ KB


In [7]:
for root, _, files in os.walk("data"):
    for filename in files:
        temp_df = pd.read_csv(os.path.join(root, filename))
        df = pd.concat([df, temp_df], axis=0, sort=False)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18447 entries, 0 to 668
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          18447 non-null  int64  
 1   Дата                10873 non-null  object 
 2   Время               10873 non-null  object 
 3   Цена до             10873 non-null  float64
 4   Цена после          10873 non-null  float64
 5   Разница в долларах  10873 non-null  float64
 6   Дельта в процентах  10873 non-null  float64
 7   Текст новости       10873 non-null  object 
dtypes: float64(4), int64(1), object(3)
memory usage: 1.3+ MB


In [8]:
df = df.drop_duplicates(subset=["Текст новости"], keep=False)
df.drop(columns=[df.columns[0], "Разница в долларах", "Дельта в процентах"], axis=1, inplace=True)
df.rename(columns={"Цена до": "price_before", "Цена после": "price_after", "Дата": "date", "Время": "Time", "Текст новости": "news_text"}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3593 entries, 2 to 443
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          3593 non-null   object 
 1   Time          3593 non-null   object 
 2   price_before  3593 non-null   float64
 3   price_after   3593 non-null   float64
 4   news_text     3593 non-null   object 
dtypes: float64(2), object(3)
memory usage: 168.4+ KB


In [9]:
df["absolute_price_difference"] = df["price_after"] - df["price_before"]
df["percentage_price_difference"] = df["absolute_price_difference"] / df["price_before"] * 100
df["price_change_direction"] = np.where(df["absolute_price_difference"] > 0, 1, -1)
df.head()

Unnamed: 0,date,Time,price_before,price_after,news_text,absolute_price_difference,percentage_price_difference,price_change_direction
2,2022-10-21,11:57:55,144.250107,145.074997,When Can Elon Musk Sell Tesla Shares? Could Mo...,0.82489,0.571847,1
3,2022-10-21,10:43:20,144.574997,144.250107,Google's New Tweaks To Messages May Annoy iPho...,-0.32489,-0.224721,-1
5,2022-10-20,12:06:39,145.115005,144.869995,"Microsoft Takes On Apple And Google, Plans Xbo...",-0.24501,-0.168839,-1
7,2022-10-18,14:45:49,145.339996,142.779999,T-Mobile US' Pricing Power Of 5G Is A Pivot Po...,-2.559997,-1.761385,-1
8,2022-10-18,14:24:43,144.960007,145.339996,Company That Assembles The iPhone Hopes To Eve...,0.379989,0.262134,1


In [10]:
df[df["price_change_direction"] == 1].count()

date                           1793
Time                           1793
price_before                   1793
price_after                    1793
news_text                      1793
absolute_price_difference      1793
percentage_price_difference    1793
price_change_direction         1793
dtype: int64

In [11]:
df[df["price_change_direction"] == -1].count()

date                           1800
Time                           1800
price_before                   1800
price_after                    1800
news_text                      1800
absolute_price_difference      1800
percentage_price_difference    1800
price_change_direction         1800
dtype: int64

In [12]:
news_df = pd.concat([df["news_text"], df["price_change_direction"]], axis=1)
news_df.head()

Unnamed: 0,news_text,price_change_direction
2,When Can Elon Musk Sell Tesla Shares? Could Mo...,1
3,Google's New Tweaks To Messages May Annoy iPho...,-1
5,"Microsoft Takes On Apple And Google, Plans Xbo...",-1
7,T-Mobile US' Pricing Power Of 5G Is A Pivot Po...,-1
8,Company That Assembles The iPhone Hopes To Eve...,1


In [13]:
X = news_df["news_text"].values
y = news_df["price_change_direction"].values

In [14]:
processed_features = []

for sentence in range(len(X)):
    processed_feature = re.sub(r'\W', ' ', str(X[sentence]))

    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature)

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to Lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

In [15]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [16]:
data_words = list(sent_to_words(processed_features))

print(data_words[:10])

[['when', 'can', 'elon', 'musk', 'sell', 'tesla', 'shares', 'could', 'more', 'shares', 'be', 'unloaded', 'to', 'finance', 'twitter', 'acquisition'], ['google', 'new', 'tweaks', 'to', 'messages', 'may', 'annoy', 'iphone', 'users', 'вђ', 'and', 'add', 'more', 'fuel', 'to', 'rcs', 'battle', 'with', 'appl'], ['microsoft', 'takes', 'on', 'apple', 'and', 'google', 'plans', 'xbox', 'mobile', 'game', 'store', 'on', 'heels', 'of', 'activision', 'blizzard', 'deal'], ['mobile', 'us', 'pricing', 'power', 'of', 'is', 'pivot', 'point', 'analyst', 'says'], ['company', 'that', 'assembles', 'the', 'iphone', 'hopes', 'to', 'eventually', 'make', 'tesla', 'cars'], ['why', 'these', 'apple', 'analysts', 'have', 'reduced', 'their', 'price', 'targets', 'ahead', 'of', 'results'], ['why', 'apple', 'looks', 'ready', 'to', 'break', 'trend', 'and', 'fly', 'higher'], ['apple', 'supplier', 'tsmc', 'is', 'attractive', 'analyst', 'says', 'what', 'results', 'may', 'mean', 'for', 'iphone', 'maker'], ['facebook', 'parent

In [17]:
def detokenize(text):
    return TreebankWordDetokenizer().detokenize(text)

In [18]:
data = []
for i in range(len(data_words)):
    data.append(detokenize(data_words[i]))
print(data[:5])

['when can elon musk sell tesla shares could more shares be unloaded to finance twitter acquisition', 'google new tweaks to messages may annoy iphone users вђ and add more fuel to rcs battle with appl', 'microsoft takes on apple and google plans xbox mobile game store on heels of activision blizzard deal', 'mobile us pricing power of is pivot point analyst says', 'company that assembles the iphone hopes to eventually make tesla cars']


In [19]:
labels = np.array(news_df['price_change_direction'])
y = []
for i in range(len(labels)):
    if labels[i] == -1:
        y.append(0)
    if labels[i] == 1:
        y.append(1)
y = np.array(y)
labels = tf.keras.utils.to_categorical(y, 2, dtype="float32")
del y

In [20]:
len(labels)

3593

In [21]:
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

max_words = 10000
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
news = pad_sequences(sequences, maxlen=max_len)
print(news)

[[   0    0    0 ...  541   21  210]
 [   0    0    0 ...  842   16 2871]
 [   0    0    0 ... 1424 1892   84]
 ...
 [   0    0    0 ...  206 1163  329]
 [   0    0    0 ...  722   51 6237]
 [   0    0    0 ...  612  165  291]]


In [22]:
print(labels)

[[0. 1.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [1. 0.]
 [1. 0.]]


In [23]:
#  Split data to train and test sets

X_train, X_test, y_train, y_test = train_test_split(news ,labels, test_size=0.2, shuffle = True, random_state=0)
print (len(X_train),len(X_test),len(y_train),len(y_test))


2874 719 2874 719


In [24]:
from keras.layers import BatchNormalization

# Neural network without LSTM

model0 = Sequential()

model0.add(layers.Dense(64, activation="relu"))
model0.add(BatchNormalization())
model0.add(layers.Dropout(0.8))
model0.add(layers.Dense(32, activation="relu"))
model0.add(layers.Dropout(0.5))
model0.add(layers.Dense(32, activation="relu"))
model0.add(layers.Dropout(0.3))
model0.add(layers.Dense(16, activation="relu"))
model0.add(layers.Dense(2, activation="sigmoid"))

2023-04-23 10:52:52.285934: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-23 10:52:52.304983: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-23 10:52:52.305142: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [25]:
model0.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

checkpoint0 = ModelCheckpoint("best_model0.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1, save_weights_only=False)

history = model0.fit(X_train, y_train,
                     validation_data=(X_test, y_test),
                     verbose=2,
                     epochs=70,
                     batch_size=32,
                     shuffle=True,
                     callbacks=[checkpoint0])

Epoch 1/70


2023-04-23 10:52:54.391615: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-04-23 10:52:54.394206: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7fa0cd99a250 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-04-23 10:52:54.394219: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2023-04-23 10:52:54.397428: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-04-23 10:52:54.521511: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8700
2023-04-23 10:52:54.607699: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the p


Epoch 1: val_accuracy improved from -inf to 0.49096, saving model to best_model0.hdf5
90/90 - 3s - loss: 0.9017 - accuracy: 0.5101 - val_loss: 0.7030 - val_accuracy: 0.4910 - 3s/epoch - 34ms/step
Epoch 2/70

Epoch 2: val_accuracy did not improve from 0.49096
90/90 - 0s - loss: 0.7542 - accuracy: 0.5070 - val_loss: 0.6961 - val_accuracy: 0.4910 - 234ms/epoch - 3ms/step
Epoch 3/70

Epoch 3: val_accuracy improved from 0.49096 to 0.50070, saving model to best_model0.hdf5
90/90 - 0s - loss: 0.7261 - accuracy: 0.5045 - val_loss: 0.6943 - val_accuracy: 0.5007 - 250ms/epoch - 3ms/step
Epoch 4/70

Epoch 4: val_accuracy improved from 0.50070 to 0.50626, saving model to best_model0.hdf5
90/90 - 0s - loss: 0.7190 - accuracy: 0.4927 - val_loss: 0.6944 - val_accuracy: 0.5063 - 370ms/epoch - 4ms/step
Epoch 5/70

Epoch 5: val_accuracy improved from 0.50626 to 0.52017, saving model to best_model0.hdf5
90/90 - 0s - loss: 0.7135 - accuracy: 0.4889 - val_loss: 0.6939 - val_accuracy: 0.5202 - 334ms/epoch 

In [26]:
# Neural network creation
model1 = Sequential()

model1.add(layers.Embedding(max_words, 20))
model1.add(layers.SpatialDropout1D(0.2))
model1.add(layers.LSTM(30, dropout=0.2, recurrent_dropout=0.2))

model1.add(layers.Dense(2, activation='sigmoid'))



In [27]:
model1.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])

#Implementing model checkpoins to save the best metric and do not lose it on training.
checkpoint1 = ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1, save_weights_only=False)

history = model1.fit(X_train, y_train, epochs=70,
                     batch_size=32,
                     verbose=2,
                     validation_data=(X_test, y_test),callbacks=[checkpoint1])

Epoch 1/70

Epoch 1: val_accuracy improved from -inf to 0.49930, saving model to best_model1.hdf5
90/90 - 40s - loss: 0.6935 - accuracy: 0.5059 - val_loss: 0.6932 - val_accuracy: 0.4993 - 40s/epoch - 446ms/step
Epoch 2/70

Epoch 2: val_accuracy did not improve from 0.49930
90/90 - 35s - loss: 0.6819 - accuracy: 0.6211 - val_loss: 0.7003 - val_accuracy: 0.4896 - 35s/epoch - 386ms/step
Epoch 3/70

Epoch 3: val_accuracy did not improve from 0.49930
90/90 - 33s - loss: 0.5789 - accuracy: 0.7035 - val_loss: 0.7624 - val_accuracy: 0.4965 - 33s/epoch - 371ms/step
Epoch 4/70

Epoch 4: val_accuracy did not improve from 0.49930
90/90 - 32s - loss: 0.4294 - accuracy: 0.7999 - val_loss: 0.9088 - val_accuracy: 0.4993 - 32s/epoch - 356ms/step
Epoch 5/70

Epoch 5: val_accuracy did not improve from 0.49930
90/90 - 32s - loss: 0.3218 - accuracy: 0.8525 - val_loss: 1.1353 - val_accuracy: 0.4951 - 32s/epoch - 357ms/step
Epoch 6/70

Epoch 6: val_accuracy did not improve from 0.49930
90/90 - 32s - loss: 0.

In [28]:
model2 = Sequential()
model2.add(layers.Embedding(max_words, 15, input_length=max_len))
model2.add(layers.Bidirectional(layers.LSTM(10,dropout=0.6)))
model2.add(layers.Dense(2, activation='sigmoid'))
model2.compile(optimizer='rmsprop',loss='categorical_crossentropy', metrics=['accuracy'])

2023-04-23 11:30:26.986851: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-23 11:30:26.987944: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-23 11:30:26.988707: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [29]:
checkpoint2 = ModelCheckpoint("best_model2.hdf5", monitor='accuracy', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model2.fit(X_train, y_train, epochs=70,validation_data=(X_test, y_test),callbacks=[checkpoint2])

Epoch 1/70


2023-04-23 11:30:27.354396: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-23 11:30:27.355752: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-23 11:30:27.356612: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-04-23 11:30:35.119961: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-23 11:30:35.120842: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-23 11:30:35.121680: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus


Epoch 1: accuracy improved from -inf to 0.49756, saving model to best_model2.hdf5
Epoch 2/70
Epoch 2: accuracy improved from 0.49756 to 0.51566, saving model to best_model2.hdf5
Epoch 3/70
Epoch 3: accuracy improved from 0.51566 to 0.53062, saving model to best_model2.hdf5
Epoch 4/70
Epoch 4: accuracy improved from 0.53062 to 0.56193, saving model to best_model2.hdf5
Epoch 5/70
Epoch 5: accuracy improved from 0.56193 to 0.60856, saving model to best_model2.hdf5
Epoch 6/70
Epoch 6: accuracy improved from 0.60856 to 0.64092, saving model to best_model2.hdf5
Epoch 7/70
Epoch 7: accuracy improved from 0.64092 to 0.66701, saving model to best_model2.hdf5
Epoch 8/70
Epoch 8: accuracy improved from 0.66701 to 0.68580, saving model to best_model2.hdf5
Epoch 9/70
Epoch 9: accuracy improved from 0.68580 to 0.70598, saving model to best_model2.hdf5
Epoch 10/70
Epoch 10: accuracy improved from 0.70598 to 0.71225, saving model to best_model2.hdf5
Epoch 11/70
Epoch 11: accuracy improved from 0.7122

In [30]:
best_model = tf.keras.models.load_model("best_model0.hdf5")



In [31]:
test_loss, test_acc = best_model.evaluate(X_test, y_test, verbose=2)
print('Model accuracy: ',test_acc)

23/23 - 1s - loss: 2.5171 - accuracy: 0.5299 - 1s/epoch - 51ms/step
Model accuracy:  0.5299026370048523
