In [3]:
import sys
!{sys.executable} -m pip install datasets evaluate transformers

Defaulting to user installation because normal site-packages is not writeable


In [4]:
import os

# Data processing
import pandas as pd
import numpy as np

# Modeling
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback

# Hugging Face Dataset
from datasets import Dataset

# Model performance evaluation
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
df = pd.DataFrame()
for root, _, files in os.walk("data"):
    for filename in files:
        temp_df = pd.read_excel(os.path.join(root, filename), "Sheet1")
        temp_df.drop(columns=[temp_df.columns[0], "Разница в долларах", "Дельта в процентах"], axis=1, inplace=True, errors="ignore")
        df = pd.concat([df, temp_df], axis=0, sort=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24484 entries, 0 to 1497
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Время          16445 non-null  object 
 1   Цена до        16445 non-null  float64
 2   Цена после     16445 non-null  float64
 3   Текст новости  16445 non-null  object 
 4   Аннотация      4780 non-null   object 
 5   Дата           11488 non-null  object 
 6   Unnamed: 1     842 non-null    object 
 7   Unnamed: 2     842 non-null    float64
 8   Unnamed: 3     842 non-null    float64
 9   Unnamed: 4     842 non-null    object 
 10  Unnamed: 5     834 non-null    object 
dtypes: float64(4), object(7)
memory usage: 2.2+ MB


In [6]:
df = df.drop_duplicates(subset=["Текст новости"], keep=False)
df.rename(columns={"Цена до": "price_before", "Цена после": "price_after", "Дата": "date", "Время": "Time", "Текст новости": "news_text"}, inplace=True)

df["absolute_price_difference"] = df["price_after"] - df["price_before"]
df["percentage_price_difference"] = df["absolute_price_difference"] / df["price_before"] * 100
df["label"] = np.where(df["absolute_price_difference"] > 0, 1, 0)
df = df[["news_text", "label"]].copy()
df.head()

Unnamed: 0,news_text,label
0,Spirit AeroSystems Slides On Q1 Earnings Miss ...,1
1,Boeing Unusual Options Activity For May 01,0
2,If You Invested $1000 In This Stock 20 Years A...,0
4,Crude Oil Down 3%; Boeing Reports Better-Than-...,0
5,"US Stocks Rise, Aided By Tech Earnings, But Th...",0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6112 entries, 0 to 1009
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   news_text  6112 non-null   object
 1   label      6112 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 143.2+ KB


In [8]:
train_data = df.sample(frac=0.8, random_state=42)

# Testing dataset
test_data = df.drop(train_data.index)

# Check the number of records in training and testing dataset.
print(f'The training dataset has {len(train_data)} records.')
print(f'The testing dataset has {len(test_data)} records.')

The training dataset has 4890 records.
The testing dataset has 43 records.


In [9]:
train_data["news_text"].tolist()

["What's Going On With NVIDIA Stock Today?\n",
 'Bitcoin Rally Continues As Google Launch A Blockchain Division\n',
 'If You Invested $1000 In This Stock 20 Years Ago, You Would Have $640 Thousand Today',
 'Walt Disney Whale Trades For December 08\n',
 "All The Stocks That Moved From Thursday's CNBC's 'Fast Money: Halftime Report'",
 'Cher At 75: New Hollywood Biopic Planned For Decades-Spanning Superstar\n',
 '$1000 Invested In Apple 10 Years Ago Would Be Worth This Much Today',
 "10 Stocks Moved By Traders On Thursday's CNBC's 'Fast Money: Halftime Report'",
 '3 Most Popular Startups To Invest in on StartEngine This Week',
 'Alphabet, Facebook Shares Decline On Citi Rating Downgrade Over Ad Revenue Growth Concerns: Bloomberg\n',
 'Chaos Erupts At Apple Plant In China Due To Strict Lockdown',
 "Why Is Dave Chappelle's Stand-Up Not Included In Netflix's Upcoming Comedy Specials?\n",
 'Is Ford Planning To Sell Its Rivian Stake? Jim Cramer Weighs In\n',
 'Pixar Employees Accuse Disney Br

In [10]:
hg_train_data = Dataset.from_pandas(train_data)
hg_test_data = Dataset.from_pandas(test_data)

In [11]:
print(f'The length of hg_train_data is {len(hg_train_data)}.\n')

# Check one review
hg_train_data[0]

The length of hg_train_data is 4890.



{'news_text': "What's Going On With NVIDIA Stock Today?\n",
 'label': 0,
 '__index_level_0__': 711}

In [12]:
# Tokenizer from a pretrained model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")

# Take a look at the tokenizer
tokenizer

Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 59.7kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 451/451 [00:00<00:00, 1.01MB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.11MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.01MB/s]


DistilBertTokenizerFast(name_or_path='distilbert-base-uncased-distilled-squad', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [13]:
# Funtion to tokenize data
def tokenize_dataset(data):
    return tokenizer(data["news_text"],
                     max_length=32,
                     truncation=True,
                     padding="max_length")

# Tokenize the dataset
dataset_train = hg_train_data.map(tokenize_dataset)
dataset_test = hg_test_data.map(tokenize_dataset)

                                                                 

In [14]:
# Take a look at the data
print(dataset_train)
print(dataset_test)

Dataset({
    features: ['news_text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 4890
})
Dataset({
    features: ['news_text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 43
})


In [15]:

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-distilled-squad", num_labels=2)


Downloading pytorch_model.bin: 100%|██████████| 265M/265M [01:07<00:00, 3.92MB/s] 
Some weights of the model checkpoint at distilbert-base-uncased-distilled-squad were not used when initializing DistilBertForSequenceClassification: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-distilled-squad and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classif

In [16]:
training_args = TrainingArguments(
    output_dir="./sentiment_transfer_learning_transformer/",
    logging_dir='./sentiment_transfer_learning_transformer/logs',
    logging_strategy='epoch',
    logging_steps=100,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-7,
    seed=42,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    load_best_model_at_end=True
)

In [17]:
print(f'There are {len(evaluate.list_evaluation_modules())} evaluation models in Hugging Face.\n')

# List all evaluation metrics
evaluate.list_evaluation_modules()

There are 159 evaluation models in Hugging Face.



['lvwerra/test',
 'precision',
 'code_eval',
 'roc_auc',
 'cuad',
 'xnli',
 'rouge',
 'pearsonr',
 'mse',
 'super_glue',
 'comet',
 'cer',
 'sacrebleu',
 'mahalanobis',
 'wer',
 'competition_math',
 'f1',
 'recall',
 'coval',
 'mauve',
 'xtreme_s',
 'bleurt',
 'ter',
 'accuracy',
 'exact_match',
 'indic_glue',
 'spearmanr',
 'mae',
 'squad',
 'chrf',
 'glue',
 'perplexity',
 'mean_iou',
 'squad_v2',
 'meteor',
 'bleu',
 'wiki_split',
 'sari',
 'frugalscore',
 'google_bleu',
 'bertscore',
 'matthews_correlation',
 'seqeval',
 'trec_eval',
 'rl_reliability',
 'jordyvl/ece',
 'angelina-wang/directional_bias_amplification',
 'cpllab/syntaxgym',
 'lvwerra/bary_score',
 'kaggle/amex',
 'kaggle/ai4code',
 'hack/test_metric',
 'yzha/ctc_eval',
 'codeparrot/apps_metric',
 'mfumanelli/geometric_mean',
 'daiyizheng/valid',
 'poseval',
 'erntkn/dice_coefficient',
 'mgfrantz/roc_auc_macro',
 'Vlasta/pr_auc',
 'gorkaartola/metric_for_tp_fp_samples',
 'idsedykh/metric',
 'idsedykh/codebleu2',
 'idsed

In [18]:
# Function to compute the metric
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [19]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

trainer.train()



Epoch,Training Loss,Validation Loss


TrainOutput(global_step=2446, training_loss=0.6944197048832916, metrics={'train_runtime': 79.9353, 'train_samples_per_second': 611.745, 'train_steps_per_second': 152.999, 'total_flos': 80970697428480.0, 'train_loss': 0.6944197048832916, 'epoch': 2.0})

In [20]:
y_test_predict = trainer.predict(dataset_test)

# Take a look at the predictions
y_test_predict

PredictionOutput(predictions=array([[-0.14651422, -0.02656186],
       [-0.01435217, -0.17684273],
       [-0.05827999, -0.14387673],
       [-0.07445429, -0.09145605],
       [-0.08446365, -0.13341103],
       [-0.01883118, -0.11244381],
       [-0.00853827, -0.08725955],
       [-0.121534  ,  0.03929546],
       [-0.14962137, -0.02842832],
       [-0.02357537, -0.05938287],
       [-0.1273139 , -0.03408107],
       [ 0.01421725, -0.09632891],
       [-0.03844813, -0.12107057],
       [-0.09428611, -0.03583232],
       [-0.09340044, -0.07478787],
       [-0.10596628, -0.12071005],
       [-0.11335356, -0.05761638],
       [-0.01505647, -0.14491756],
       [-0.08978271, -0.00795708],
       [-0.14723212, -0.04856132],
       [ 0.01131136, -0.11004572],
       [-0.06977075, -0.0430489 ],
       [-0.01843527, -0.13264164],
       [-0.01622443, -0.18924184],
       [-0.03894333, -0.0459111 ],
       [-0.08025022,  0.05287984],
       [ 0.01153569, -0.10462063],
       [-0.05093184, -0.11