In [1]:
!pip install datasets evaluate transformers==4.28.0 --upgrade accelerate



In [2]:
import os

# Data processing
import pandas as pd
import numpy as np

# Modeling
import tensorflow as tf
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, BertTokenizer

# Hugging Face Dataset
from datasets import Dataset

# Model performance evaluation
import evaluate

2023-05-13 22:46:28.065514: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
df = pd.read_csv("data/amazon.csv", delimiter=";")
df.drop(columns=[df.columns[0], "Разница в долларах", "Дельта в процентах"], axis=1, inplace=True)
for root, _, files in os.walk("data"):
    for filename in files:
        temp_df = pd.read_csv(os.path.join(root, filename), delimiter=";")
        temp_df.drop(columns=[temp_df.columns[0], "Разница в долларах", "Дельта в процентах"], axis=1, inplace=True, errors="ignore")
        print(filename, temp_df.head())
        df = pd.concat([df, temp_df], axis=0, sort=False)

df.info()
df.drop(columns=[df.columns[5]])

northrop.csv          Дата     Время     Цена до  Цена после   
0  2022-10-14  14:10:32  471.454987  468.269989  \
1  2022-10-14  11:38:17  475.109985  472.109985   
2  2022-10-14  11:25:32  476.359985  475.109985   
3  2022-10-14  10:37:16  476.359985  475.109985   
4  2022-10-12  12:47:18  501.450012  501.660004   

                                       Текст новости  
0         Check Out What Whales Are Doing With NOC\n  
1  What 7 Analyst Ratings Have To Say About North...  
2  7 Analysts Have This to Say About Northrop Gru...  
3  Benzinga's Top Ratings Upgrades, Downgrades Fo...  
4  Credit Suisse Picks 7 Aerospace, Defense Stock...  
boeing.csv          Дата     Время     Цена до  Цена после   
0  2022-09-29  15:00:27  124.675903  124.540001  \
1  2022-09-28  14:30:52  133.009995  133.330002   
2  2022-09-28  11:42:56  130.130005  130.570007   
3  2022-09-26  14:05:53  127.320000  126.980003   
4  2022-09-23  11:38:28  131.759995  130.229996   

                                

Unnamed: 0,Дата,Время,Цена до,Цена после,Текст новости
0,2022-10-21,15:21:15,118.610001,119.364998,S&P 500 Rebounds From 2022 Lows This Week As B...
1,2022-10-21,15:05:00,118.610001,119.364998,What's Going On With Amazon Shares\n
2,2022-10-21,13:55:58,117.800003,118.610001,10 Consumer Discretionary Stocks Whale Activit...
3,2022-10-21,13:53:36,117.800003,118.610001,Amazon To $175? Plus JP Morgan Cuts Price Targ...
4,2022-10-20,14:20:03,116.440002,115.209999,Are All Streaming Services Losing Money? Netfl...
...,...,...,...,...,...
664,2020-12-10,14:47:29,41.845001,41.955002,10 Health Care Stocks With Unusual Options Ale...
665,2020-12-10,11:36:18,41.930000,41.735001,Rehearsal Exposes Gaps In COVID Vaccine Delive...
666,2020-12-10,11:24:03,41.939999,41.930000,"Investors Seem More Focused On Vaccine, Stimul..."
667,2020-12-09,11:29:30,41.520000,41.779999,Allergy Warning Issued For Pfizer-BioNTech COV...


In [4]:
df = df.drop_duplicates(subset=["Текст новости"], keep=False)
df.rename(columns={"Цена до": "price_before", "Цена после": "price_after", "Дата": "date", "Время": "Time", "Текст новости": "news_text"}, inplace=True)

df["absolute_price_difference"] = df["price_after"] - df["price_before"]
df["percentage_price_difference"] = df["absolute_price_difference"] / df["price_before"] * 100
df["label"] = np.where(df["absolute_price_difference"] > 0, 1, 0)
df = df[["news_text", "label"]].copy()
df.head()

Unnamed: 0,news_text,label
0,Check Out What Whales Are Doing With NOC\n,0
1,What 7 Analyst Ratings Have To Say About North...,0
2,7 Analysts Have This to Say About Northrop Gru...,0
3,"Benzinga's Top Ratings Upgrades, Downgrades Fo...",0
5,Looking Into Northrop Grumman's Recent Short I...,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5173 entries, 0 to 667
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   news_text  5173 non-null   object
 1   label      5173 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 121.2+ KB


In [6]:
train_data = df.sample(frac=0.8, random_state=42)

# Testing dataset
test_data = df.drop(train_data.index)

# Check the number of records in training and testing dataset.
print(f'The training dataset has {len(train_data)} records.')
print(f'The testing dataset has {len(test_data)} records.')

The training dataset has 4138 records.
The testing dataset has 69 records.


In [7]:
train_data["news_text"].tolist()

['Quantity Over Quality? Ben Affleck Questions Assembly Line-Style Of Streaming Giant Netflix',
 'Will Amazon, American Airlines And Netflix Lead The Market Higher? How To Play This Key Trend',
 "Why Nvidia Is This Investor's Highest Conviction Play Right Now\n",
 'What Does Apple Have To Do With DexCom Stock Trading Lower?',
 "What's Going On With Moderna Stock?\n",
 "RingCentral 's Most Resilient Enterprise Segment Gives In To Demand Weakness, Analysts Flag Post Q4 Disappointment",
 'These 3 Companies Did Well On Singles Day, Cramer Says\n',
 'Mid-Day Market Update: Crude Oil Rises 2%; CorMedix Shares Drop Following Q1 Results\n',
 'How To Trade Apple Stock Heading Into Q4 Earnings',
 'Elon Musk Has An Idea For A New Cologne, But You May Want To Think Twice Before Using It\n',
 "Terra's Mirror Protocol Suffers Another $2M Exploit",
 "Here Are Warren Buffett's Best Performing Dividend Stocks",
 'Why Amazon Shares Are Volatile Today',
 "Tesla Refuses To Integrate Apple's CarPlay — You 

In [8]:
hg_train_data = Dataset.from_pandas(train_data)
hg_test_data = Dataset.from_pandas(test_data)

In [9]:
print(f'The length of hg_train_data is {len(hg_train_data)}.\n')

# Check one review
hg_train_data[0]

The length of hg_train_data is 4138.



{'news_text': 'Quantity Over Quality? Ben Affleck Questions Assembly Line-Style Of Streaming Giant Netflix',
 'label': 1,
 '__index_level_0__': 311}

In [10]:
# Tokenizer from a pretrained model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Take a look at the tokenizer
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [11]:
# Funtion to tokenize data
def tokenize_dataset(data):
    return tokenizer(data["news_text"], 
                     max_length=32,
                     truncation=True, 
                     padding="max_length")

# Tokenize the dataset
dataset_train = hg_train_data.map(tokenize_dataset)
dataset_test = hg_test_data.map(tokenize_dataset)

Map:   0%|          | 0/4138 [00:00<?, ? examples/s]

Map:   0%|          | 0/69 [00:00<?, ? examples/s]

In [12]:
# Take a look at the data
print(dataset_train)
print(dataset_test)

Dataset({
    features: ['news_text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4138
})
Dataset({
    features: ['news_text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 69
})


In [13]:

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [14]:
training_args = TrainingArguments(
    output_dir="./sentiment_transfer_learning_transformer/",          
    logging_dir='./sentiment_transfer_learning_transformer/logs',            
    logging_strategy='epoch',
    logging_steps=100,    
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-7,
    seed=42,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    load_best_model_at_end=True
)

  return torch._C._cuda_getDeviceCount() > 0


In [15]:
print(f'There are {len(evaluate.list_evaluation_modules())} evaluation models in Hugging Face.\n')

# List all evaluation metrics
evaluate.list_evaluation_modules()

There are 157 evaluation models in Hugging Face.



['lvwerra/test',
 'precision',
 'code_eval',
 'roc_auc',
 'cuad',
 'xnli',
 'rouge',
 'pearsonr',
 'mse',
 'super_glue',
 'comet',
 'cer',
 'sacrebleu',
 'mahalanobis',
 'wer',
 'competition_math',
 'f1',
 'recall',
 'coval',
 'mauve',
 'xtreme_s',
 'bleurt',
 'ter',
 'accuracy',
 'exact_match',
 'indic_glue',
 'spearmanr',
 'mae',
 'squad',
 'chrf',
 'glue',
 'perplexity',
 'mean_iou',
 'squad_v2',
 'meteor',
 'bleu',
 'wiki_split',
 'sari',
 'frugalscore',
 'google_bleu',
 'bertscore',
 'matthews_correlation',
 'seqeval',
 'trec_eval',
 'rl_reliability',
 'jordyvl/ece',
 'angelina-wang/directional_bias_amplification',
 'cpllab/syntaxgym',
 'lvwerra/bary_score',
 'kaggle/amex',
 'kaggle/ai4code',
 'hack/test_metric',
 'yzha/ctc_eval',
 'codeparrot/apps_metric',
 'mfumanelli/geometric_mean',
 'daiyizheng/valid',
 'poseval',
 'erntkn/dice_coefficient',
 'mgfrantz/roc_auc_macro',
 'Vlasta/pr_auc',
 'gorkaartola/metric_for_tp_fp_samples',
 'idsedykh/metric',
 'idsedykh/codebleu2',
 'idsed

In [16]:
# Function to compute the metric
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

trainer.train()



Epoch,Training Loss,Validation Loss


In [None]:
y_test_predict = trainer.predict(dataset_test)

# Take a look at the predictions
y_test_predict