In [1]:
!pip install pandas scikit-learn tensorflow --upgrade keras nltk gensim transformers evaluate

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os

import numpy as np
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split


In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/alexa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
df = pd.read_csv("data/amazon.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1444 entries, 0 to 1443
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          1444 non-null   int64  
 1   Дата                1067 non-null   object 
 2   Время               1067 non-null   object 
 3   Цена до             1067 non-null   float64
 4   Цена после          1067 non-null   float64
 5   Разница в долларах  1067 non-null   float64
 6   Дельта в процентах  1067 non-null   float64
 7   Текст новости       1067 non-null   object 
dtypes: float64(4), int64(1), object(3)
memory usage: 90.4+ KB


In [5]:
for root, _, files in os.walk("data"):
    for filename in files:
        temp_df = pd.read_csv(os.path.join(root, filename))
        df = pd.concat([df, temp_df], axis=0, sort=False)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18447 entries, 0 to 668
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          18447 non-null  int64  
 1   Дата                10873 non-null  object 
 2   Время               10873 non-null  object 
 3   Цена до             10873 non-null  float64
 4   Цена после          10873 non-null  float64
 5   Разница в долларах  10873 non-null  float64
 6   Дельта в процентах  10873 non-null  float64
 7   Текст новости       10873 non-null  object 
dtypes: float64(4), int64(1), object(3)
memory usage: 1.3+ MB


In [6]:
df = df.drop_duplicates(subset=["Текст новости"], keep=False)
df.drop(columns=[df.columns[0], "Разница в долларах", "Дельта в процентах"], axis=1, inplace=True)
df.rename(columns={"Цена до": "price_before", "Цена после": "price_after", "Дата": "date", "Время": "Time", "Текст новости": "news_text"}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3593 entries, 0 to 667
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          3593 non-null   object 
 1   Time          3593 non-null   object 
 2   price_before  3593 non-null   float64
 3   price_after   3593 non-null   float64
 4   news_text     3593 non-null   object 
dtypes: float64(2), object(3)
memory usage: 168.4+ KB


In [7]:
df["absolute_price_difference"] = df["price_after"] - df["price_before"]
df["percentage_price_difference"] = df["absolute_price_difference"] / df["price_before"] * 100
df["price_change_direction"] = np.where(df["absolute_price_difference"] > 0, 1, 0)
df.head()

Unnamed: 0,date,Time,price_before,price_after,news_text,absolute_price_difference,percentage_price_difference,price_change_direction
0,2022-10-14,14:10:32,471.454987,468.269989,Check Out What Whales Are Doing With NOC\n,-3.184998,-0.675568,0
1,2022-10-14,11:38:17,475.109985,472.109985,What 7 Analyst Ratings Have To Say About North...,-3.0,-0.631433,0
2,2022-10-14,11:25:32,476.359985,475.109985,7 Analysts Have This to Say About Northrop Gru...,-1.25,-0.262407,0
3,2022-10-14,10:37:16,476.359985,475.109985,"Benzinga's Top Ratings Upgrades, Downgrades Fo...",-1.25,-0.262407,0
5,2022-10-10,15:21:32,505.105011,506.994995,Looking Into Northrop Grumman's Recent Short I...,1.889984,0.374176,1


In [8]:
df[df["price_change_direction"] == 1].count()

date                           1793
Time                           1793
price_before                   1793
price_after                    1793
news_text                      1793
absolute_price_difference      1793
percentage_price_difference    1793
price_change_direction         1793
dtype: int64

In [9]:
df[df["price_change_direction"] == -1].count()

date                           0
Time                           0
price_before                   0
price_after                    0
news_text                      0
absolute_price_difference      0
percentage_price_difference    0
price_change_direction         0
dtype: int64

In [10]:
news_df = pd.concat([df["news_text"], df["price_change_direction"]], axis=1)
news_df.head()

Unnamed: 0,news_text,price_change_direction
0,Check Out What Whales Are Doing With NOC\n,0
1,What 7 Analyst Ratings Have To Say About North...,0
2,7 Analysts Have This to Say About Northrop Gru...,0
3,"Benzinga's Top Ratings Upgrades, Downgrades Fo...",0
5,Looking Into Northrop Grumman's Recent Short I...,1


In [11]:
X = news_df["news_text"].values
y = news_df["price_change_direction"].values

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X ,y, test_size=0.2, shuffle = True, random_state=0)
print(len(X_train),len(X_test),len(y_train),len(y_test))

X_train = X_train.tolist()

2874 719 2874 719


In [13]:
from transformers import AutoTokenizer,  DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenized_data = tokenizer(X_train, return_tensors="np", padding=True)
# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
tokenized_data = dict(tokenized_data)

labels = np.array(y_train)  # Label is already an array of 0 and 1

2023-05-08 20:44:21.381508: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-08 20:44:21.706206: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-08 20:44:21.707400: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
labels.shape

(2874,)

In [None]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam

# Load and compile our model
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
# Lower learning rates are often better for fine-tuning transformers
model.compile(optimizer=Adam(3e-5))

model.fit(tokenized_data, labels, epochs=10)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

In [None]:
import evaluate

acc = evaluate.load("accuracy")
# Round the predictions to turn them into "0" or "1" labels
X_test = np.array(X_test)
X_test.shape

In [None]:
test_preds = np.round(model.predict(X_test)["logits"])
test_labels = y_test

print("Test accuracy is : ", acc.compute(predictions = test_preds, references = test_labels))