In [3]:
import argparse
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from pytorch_pretrained_bert import BertAdam

from mmbt.data.helpers import get_data_loaders
from mmbt.models import get_model
from mmbt.utils.logger import create_logger
from mmbt.utils.utils import *


In [4]:
# tokenizer インスタンスの生成
# 対象モデルは'bert-base-uncased'
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# tokenizer関数の動作確認
tokens = tokenizer.tokenize("What's going on?")
print(tokens)

# convert_tokens_to_ids関数の動作確認
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

['what', "'", 's', 'going', 'on', '?']
[2054, 1005, 1055, 2183, 2006, 1029]


In [8]:
# BERT固有の特殊トークン達
cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
print(cls_token, sep_token, pad_token, unk_token)

# idによるトークン表記
cls_token_idx = tokenizer.cls_token_id
sep_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id
print(cls_token_idx, sep_token_idx, pad_token_idx, unk_token_idx)

[CLS] [SEP] [PAD] [UNK]
101 102 0 100


In [6]:
# 入力テキストのトークン化関数
def tokenize(sentence):
    tokens = tokenizer.tokenize(sentence) 
    # 252までで切る
    tokens = tokens[:254-2]
    return tokens

In [43]:
text_2023_01_jp = "The Japanese stock market experienced a notable upswing in the first week of 2023, reflecting a positive sentiment among investors and financial analysts. This bullish trend can be attributed to a combination of factors that made Japanese equities particularly appealing. \
During the early part of 2023, the Japanese stock market was dominated by bullish sentiment, partly due to the country's continued negative interest rates. This contrasted with the trend in other G7 countries, where interest rates were raised to combat inflation. The Nikkei-225 index, a key indicator of the Japanese stock market, grew by 30% in the first half of the year. This growth was supported by a balance of supply and demand, as evidenced by the formation of a range framing the index’s fluctuations in the second half of the year. \
However, there was growing speculation that the Bank of Japan might begin raising interest rates after years of being stuck in negative territory. This speculation was fueled by expectations that the US, Europe, and other regions might be nearing the peak of their interest rate hikes. The possibility of a shift in Japan's monetary policy led to a growing bearish sentiment by the end of November, with the Nikkei 225 experiencing a drop of almost 5%. \
Despite these concerns, some of the world's most renowned investors and major Wall Street banks maintained a positive outlook on the Japanese stock market. They saw more upside potential, even as the broad Topix index reached its highest level since 1990. This optimism was partly due to the return of inflation, improving shareholder returns, and an endorsement by prominent investors like Warren Buffett. \
It's important to note that the future direction of the Japanese stock market would likely depend on several factors, including the Bank of Japan's monetary policy decisions, global economic trends, and investor sentiment. The early 2023 bullish trend in the Japanese market demonstrated the market's resilience and its attractiveness to global investors, despite the looming possibility of interest rate changes. \
For more detailed insights and analysis, you can refer to the original sources: The Japan Times and FXOpen. "

text_2023_01_us = """
The US stock market sentiment in the first week of January 2023 showed a cautiously optimistic outlook, influenced by various factors including sector performances, rate hike expectations, and company-specific news.

1. **January Indicator Trifecta and Sector Performance**: The stock market experienced what is known as the "January Indicator Trifecta." This refers to a Santa Claus rally, positive first five days of January, and a positive January Barometer. The occurrence of all three indicators historically suggests a favorable market in the following 11 months. In terms of sector performance, Consumer Discretionary and Communication Services led the gains. The Nasdaq Composite showed a strong performance, especially in technology stocks, while small-cap stocks indicated by the S&P 600 Small Cap index also rose significantly.

2. **Federal Reserve and Rate Hike Odds**: The market was anticipating a 25 basis point rate increase at the February Federal Reserve meeting. This expectation was reflected in the pricing of fed fund futures. Treasury yields saw some weakness, with the 10-year Treasury yield dropping to 3.51%, which was below the October peak of 4.25%.

3. **Corporate Earnings and Stock Performance**: About one-third of S&P 500 companies reported a 5% decline in Q4 profits, compared to an expected 3.2% decline. Despite this, there were sectors like Energy, Industrials, and Consumer Discretionary that saw significant earnings growth. Notably, the worst-performing stocks of 2022 saw an average increase of 20.1% in early 2023, suggesting a short-term reversion of oversold stocks rather than a fundamental shift in market leadership.

4. **Influence of Company-Specific News**: Individual companies also influenced market sentiment. For instance, Tesla's shares went up after announcing price cuts in China, while Bed Bath & Beyond's shares declined significantly due to bankruptcy considerations. Costco's stock gained after reporting positive December sales data.

5. **Overall Market Dynamics**: The first week of January 2023 closed higher for US stocks, spurred by a favorable jobs report and corporate news. The CBOE Volatility Index (VIX), often regarded as a fear gauge, decreased by 11% in January, indicating a decrease in market volatility.

In summary, the first week of January 2023 in the US stock market was marked by a mix of optimism driven by sector performances and cautious sentiment due to economic indicators and corporate earnings. While there was a positive outlook based on the January indicators, the market remained sensitive to rate hikes and individual corporate performances.

References: 
- StockCharts.com【6†source】
- Nasdaq【7†source】
- Yahoo Finance【8†source】【9†source】
"""

tokens = tokenize(text_2023_01_jp)
print(tokens)
tokens = tokenize(text_2023_01_us)
print(tokens)
print(len(tokens))


input_ids = tokenizer.encode(text_2023_01_us)
print(input_ids)
encoding = tokenizer(
    text_2023_01_jp, 
    max_length =500, 
    padding ="max_length", 
    truncation=True,
    return_tensors="pt"
)
print(encoding.input_ids)
print(encoding.input_ids.size())



['the', 'japanese', 'stock', 'market', 'experienced', 'a', 'notable', 'ups', '##wing', 'in', 'the', 'first', 'week', 'of', '202', '##3', ',', 'reflecting', 'a', 'positive', 'sentiment', 'among', 'investors', 'and', 'financial', 'analysts', '.', 'this', 'bull', '##ish', 'trend', 'can', 'be', 'attributed', 'to', 'a', 'combination', 'of', 'factors', 'that', 'made', 'japanese', 'e', '##qui', '##ties', 'particularly', 'appealing', '.', 'during', 'the', 'early', 'part', 'of', '202', '##3', ',', 'the', 'japanese', 'stock', 'market', 'was', 'dominated', 'by', 'bull', '##ish', 'sentiment', ',', 'partly', 'due', 'to', 'the', 'country', "'", 's', 'continued', 'negative', 'interest', 'rates', '.', 'this', 'contrasted', 'with', 'the', 'trend', 'in', 'other', 'g', '##7', 'countries', ',', 'where', 'interest', 'rates', 'were', 'raised', 'to', 'combat', 'inflation', '.', 'the', 'nik', '##kei', '-', '225', 'index', ',', 'a', 'key', 'indicator', 'of', 'the', 'japanese', 'stock', 'market', ',', 'grew', '

In [39]:
# GPU利用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# 事前学習済みモデルのロード
from transformers import BertModel
bert = BertModel.from_pretrained('bert-base-uncased')

# モデルの定義
# 事前学習済みモデルの後段に線形関数を追加し、この出力で感情分析をする
import torch.nn as nn

class BERTSentiment(nn.Module):
    def __init__(self,
                 bert,
                 output_dim):
        
        super().__init__()
        
        self.bert = bert
        embedding_dim = bert.config.to_dict()['hidden_size']
        self.out = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, text):
        #text = [batch size, sent len]

        #embedded = [batch size, emb dim]
        embedded = self.bert(text)[1]
        print("embedded" , embedded.size() )

        #output = [batch size, out dim]
        output = self.out(embedded)
        
        return output
# モデルインスタンスの生成
# 出力は感情分析なので2
OUTPUT_DIM = 2

model = BERTSentiment(bert, OUTPUT_DIM).to(device)
model.eval()

input = encoding.input_ids.to(device)
predictions = model(input)
print(predictions)


cuda
embedded torch.Size([1, 768])
tensor([[0.1933, 0.2893]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [35]:
# 学習データのデータ構造定義

# torchtextのバージョンアップに伴い、legacyを付ける必要あり
from torchtext.legacy import data

# 入力データ
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  # 上で定義したトークン化関数
                  tokenize = tokenize,
                  # 前処理として各トークンをIDに変換
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = cls_token_idx,
                  eos_token = sep_token_idx,
                  pad_token = pad_token_idx)
 
# 正解ラベル
LABEL = data.LabelField()

ModuleNotFoundError: No module named 'torchtext.legacy'