In [2]:
def _ask_choice(prompt, choices, invalid_message):
    """Prompt until the user types one of the keys of `choices`.

    Args:
        prompt: Text shown by `input`.
        choices: Mapping from an accepted (lower-case) answer to the value to return.
        invalid_message: Text printed after an answer that is not in `choices`.

    Returns:
        The value mapped to the accepted answer.
    """
    while True:
        # Surrounding whitespace and letter case are ignored so "P " counts as "p".
        answer = input(prompt).strip().lower()
        if answer in choices:
            return choices[answer]
        print(invalid_message)


# True: load the saved checkpoint; False: train from scratch and overwrite it.
# The prompt and the error text name exactly the tokens that are accepted.
PRETRAINED = _ask_choice(
    '''Would you like to use the last saved trained model (p) or overwrite it with a new training session (new)?
p: pretrained
new: overwrite with new session
''',
    {'p': True, 'new': False},
    "Invalid response. Type p: pretrained, or new: overwrite with new session")

# True: running on Google Colab; False: running on the local machine.
COLAB = _ask_choice(
    '''Type (g) if you are using Google Colab with GPU or (l) if you are running on your local machine's CPU.
g: Google Colab
l: local
''',
    {'g': True, 'l': False},
    "Invalid response. Type g: Google Colab, or l: local machine")

if COLAB:
    from google.colab import drive
    # NOTE(review): the imports cell mounts Drive again at '/content/gdrive/' and
    # builds `path` from that mount point; confirm whether this mount is still needed.
    drive.mount('/content/drive')

RANDOM_SEED = 42
RISK_LEVEL = 0.2  # (conservative) 0.0 to 1.0 (risky)
#%pip install -r requirements.txt

Would you like to use the last saved trained model (p) or overwrite it with a new training session (train_new_model)?
p: pretrained
new: overwrite with new session
p
Type (g) if you are using Google Colab with GPU or (l) if you are running on your local machine's CPU.
g: Google Colab
l: local 
l


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
%matplotlib inline

# Seed both random number generators from the shared constant.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Use the first CUDA device when one is visible, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# `path` is the prefix later cells prepend to the files they read and write.
if COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive/')
    path = '/content/gdrive/My Drive/Transformer/'
else:
    path = './'

  torch.utils._pytree._register_pytree_node(


In [4]:
# Read the tweet CSV from the working directory (comma is pandas' default separator);
# the trailing expression shows (rows, columns) as the cell output.
df = pd.read_csv("stock_tweet_sentiment.csv")
df.shape

(28440, 7)

In [5]:
# Keep only the tweet text and its label. The explicit copy makes this an
# independent frame, so the later edit of the "Sentiment" column is not a write
# into a slice of the originally loaded DataFrame.
df = df[["text", "Sentiment"]].copy()

In [6]:
# Show the two-column frame as the cell output (the rendered table elides the middle rows).
df

Unnamed: 0,text,Sentiment
0,VIDEO: “I was in my office. I was minding my o...,0
1,The price of lumber $LB_F is down 22% since hi...,0
2,Who says the American Dream is dead? https://t...,-1
3,Barry Silbert is extremely optimistic on bitco...,1
4,How satellites avoid attacks and space junk wh...,-1
...,...,...
28435,$FB : 29234a9c-7f08-4d5a-985f-cb1a5554ecf9,0
28436,【仮想通貨】ビットコインの価格上昇、８０万円台回復　約１カ月半ぶり $BTC ht...,0
28437,RT @invest_in_hd: 'Nuff said! $TEL #telcoin #...,0
28438,【仮想通貨】ビットコインの価格上昇、８０万円台回復　約１カ月半ぶり $BTC ht...,0


In [7]:
# Collapse label -1 into 0 so only the two classes 0 and 1 remain.
# The column is rebuilt and `df` rebound instead of calling replace(inplace=True)
# on `df["Sentiment"]`: that chained in-place form mutates a temporary Series and
# does not update `df` under pandas Copy-on-Write. Re-running this cell is a no-op.
df = df.assign(Sentiment=df["Sentiment"].replace(-1, 0))
df.head()

Unnamed: 0,text,Sentiment
0,VIDEO: “I was in my office. I was minding my o...,0
1,The price of lumber $LB_F is down 22% since hi...,0
2,Who says the American Dream is dead? https://t...,0
3,Barry Silbert is extremely optimistic on bitco...,1
4,How satellites avoid attacks and space junk wh...,0


In [8]:
class StockTweetsDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        tweets: Indexable collection of tweets; each entry is converted with `str`.
        targets: Indexable collection of integer labels aligned with `tweets`.
        tokenizer: Object exposing `encode_plus` (a Hugging Face tokenizer in this notebook).
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, tweets, targets, tokenizer, max_len):
        self.tweets = tweets
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of tweets."""
        return len(self.tweets)

    def __getitem__(self, item):
        """Return the raw text, token ids, attention mask and label for index `item`."""
        tweet = str(self.tweets[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            # Without truncation a tweet longer than max_len yields a longer tensor,
            # and the DataLoader cannot stack tensors of different lengths into a batch.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt')

        return {
            'tweet_text': tweet,
            # The tokenizer returns a leading batch axis of size 1; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)}

In [9]:
# 80/10/10 split: hold out 20% of the rows, then cut that hold-out in half
# to get the validation and test sets. Both cuts use the same seed.
df_train, df_holdout = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)
df_train.shape, df_val.shape, df_test.shape

((22752, 2), (2844, 2), (2844, 2))

In [10]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """Wrap the "text" and "Sentiment" columns of `df` in a batched DataLoader.

    Args:
        df: DataFrame with a "text" column and a "Sentiment" column.
        tokenizer: Tokenizer handed to `StockTweetsDataset`.
        max_len: Sequence length handed to `StockTweetsDataset`.
        batch_size: Number of items per batch.
        shuffle: Forwarded to `DataLoader`; the default False keeps the previous
            behaviour (rows are served in frame order on every pass).
        num_workers: Forwarded to `DataLoader`; the default 4 keeps the previous
            behaviour. Pass 0 to load in the main process.

    Returns:
        A `DataLoader` over a `StockTweetsDataset`.
    """
    ds = StockTweetsDataset(
        tweets=df["text"].to_numpy(),
        targets=df["Sentiment"].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len)
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

In [11]:
BATCH_SIZE = 16
MAX_LEN = 80  # All tweets in the data set contain fewer than 80 tokens
# NOTE(review): return_dict is passed to the tokenizer here as in the model cell;
# confirm whether the tokenizer actually uses it.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', return_dict=False)

# One loader per split, built in train / validation / test order with shared settings.
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(split_frame, tokenizer, MAX_LEN, BATCH_SIZE)
    for split_frame in (df_train, df_val, df_test)
)

In [12]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear layer producing `n_classes` scores."""

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased', return_dict=False)
        self.dropout = nn.Dropout(p=0.3)
        # The linear head maps the encoder's hidden size to one score per class.
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's output for a batch of token ids and masks."""
        # The encoder call returns a 2-tuple; only its second element feeds the head.
        _first_output, second_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        regularised = self.dropout(second_output)
        return self.out(regularised)

In [13]:
# Class labels in index order: index 0 -> 'bearish', index 1 -> 'bullish'.
class_names = ['bearish', 'bullish']

# Everything below is training-only setup; it is skipped when a saved model is used,
# so `model`, `EPOCHS`, `optimizer`, `scheduler` and `loss_fn` are only defined here
# when PRETRAINED is False.
if not PRETRAINED:
    # Pull a single batch to inspect the tensor shapes the loader produces.
    data = next(iter(train_data_loader))
    # Result is discarded (not the cell's final top-level expression), so nothing is shown.
    data.keys()
    print(data['input_ids'].shape)
    print(data['attention_mask'].shape)
    print(data['targets'].shape)
    model = SentimentClassifier(len(class_names))
    model = model.to(device)
    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)
    # One forward pass on the sample batch; the softmax result is not stored or displayed.
    F.softmax(model(input_ids, attention_mask), dim=1)
    EPOCHS = 10
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    # train_epoch steps the scheduler once per batch, so the schedule spans
    # (batches per epoch) * EPOCHS steps.
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    loss_fn = nn.CrossEntropyLoss().to(device)

In [14]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over `data_loader`.

    Args:
        model: Module called with `input_ids` and `attention_mask` keyword arguments.
        data_loader: Iterable of dicts with "input_ids", "attention_mask" and "targets".
        loss_fn: Callable taking (outputs, targets) and returning a scalar loss tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        (accuracy, mean_loss): the correct-prediction count divided by `n_examples`
        as a double tensor, and `np.mean` of the per-batch loss values.
    """
    model = model.train()
    batch_losses = []
    n_correct = 0

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(logits, targets)
        batch_losses.append(loss.item())

        # Predicted class = index of the largest score in each row.
        predicted = torch.max(logits, dim=1).indices
        n_correct += (predicted == targets).sum()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    mean_loss = np.mean(batch_losses)
    return accuracy, mean_loss

In [15]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Score `model` on `data_loader` without tracking gradients.

    Args:
        model: Module called with `input_ids` and `attention_mask` keyword arguments.
        data_loader: Iterable of dicts with "input_ids", "attention_mask" and "targets".
        loss_fn: Callable taking (outputs, targets) and returning a scalar loss tensor.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        (accuracy, mean_loss): the correct-prediction count divided by `n_examples`
        as a double tensor, and `np.mean` of the per-batch loss values.
    """
    model = model.eval()
    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["targets"].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            batch_losses.append(loss_fn(logits, targets).item())

            # Predicted class = index of the largest score in each row.
            predicted = torch.max(logits, dim=1).indices
            n_correct += (predicted == targets).sum()

    accuracy = n_correct.double() / n_examples
    mean_loss = np.mean(batch_losses)
    return accuracy, mean_loss

In [None]:
# Full checkpoint location. `path` is './' locally and the Drive project folder on
# Colab, so a checkpoint written by a training run is found again by a later
# pretrained run in the same environment (the load branch already looked under
# `path` on Colab, while training used to write to the working directory).
PATH = path + 'twitter_sentiment_model.pth'

if not PRETRAINED:
    history = defaultdict(list)
    best_accuracy = 0

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)
        train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train))
        print(f'Train loss {train_loss} accuracy {train_acc}')
        val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(df_val))
        print(f'Val   loss {val_loss} accuracy {val_acc}')
        print()
        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc)
        history['val_loss'].append(val_loss)

        # Keep only the checkpoint with the best validation accuracy so far.
        # The model stays on the device it is training on: forcing "cuda" here
        # crashed CPU-only runs and silently rebound the global `device`.
        if val_acc > best_accuracy:
            torch.save({
                'history': history,
                # Number of epochs completed when this checkpoint was written
                # (previously the constant EPOCHS regardless of the actual epoch).
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()}, PATH)
            best_accuracy = val_acc
else:
    # Local runs stay on the CPU as before; Colab uses the GPU when one is actually
    # available instead of assuming it.
    device = torch.device('cuda' if COLAB and torch.cuda.is_available() else 'cpu')
    model = SentimentClassifier(len(class_names)).to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

    # map_location remaps the stored tensors onto `device`, so a checkpoint saved
    # on a GPU also loads on a CPU-only machine.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(PATH, map_location=device)

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    # history = checkpoint['history']

In [None]:
# Evaluate on the held-out test split (only possible right after training, because
# `loss_fn` is defined in the training-only setup).
if not PRETRAINED:
    test_acc, _ = eval_model(model, test_data_loader, loss_fn, device, len(df_test))
    # A bare expression inside an `if` body is never rendered as cell output,
    # so the value is printed explicitly.
    print(f'Test accuracy {test_acc.item()}')

In [2]:
def get_predictions(model, data_loader, device=None):
    """Run `model` over `data_loader` and collect texts, predictions and probabilities.

    Args:
        model: Module called with `input_ids` and `attention_mask` keyword arguments.
        data_loader: Iterable of dicts with "tweet_text", "input_ids",
            "attention_mask" and "targets". Must yield at least one batch.
        device: Device the inputs are moved to. When omitted, the device of the
            model's first parameter is used (CPU if the model has no parameters),
            so the inputs always land where the model lives.

    Returns:
        (tweets, predictions, prediction_probs, real_values): the list of texts,
        a 1-D tensor of predicted class indices, a 2-D tensor of softmax
        probabilities (one row per example) and a 1-D tensor of the true labels.
        All tensors are on the CPU.
    """
    model = model.eval()
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    tweets = []
    pred_batches = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            probs = F.softmax(outputs, dim=1)

            tweets.extend(d["tweet_text"])
            # Keep whole batches (moved to the CPU right away) and concatenate once
            # at the end; this yields the same tensors as stacking every row.
            pred_batches.append(preds.cpu())
            prob_batches.append(probs.cpu())
            target_batches.append(d["targets"].cpu())

    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_values = torch.cat(target_batches)
    return tweets, predictions, prediction_probs, real_values

In [None]:
# This section is used to calculate predictions and save them to a .csv file
# or load them from the saved .csv file from the pretrained model.
# In both branches `y_pred_probs` ends up with one row per test tweet and one
# column per class, in class_names order (column 0 bearish, column 1 bullish).
if not PRETRAINED:
    y_tweets, y_pred, y_pred_probs, y_test = get_predictions(
        model, test_data_loader)
    # Take the per-class columns directly. The earlier code flattened the matrix
    # row by row and cut it in half, which stored interleaved bearish/bullish
    # values of the first half of the tweets under "bearish" (and the second half
    # under "bullish") and left `y_pred_probs` one-dimensional.
    y_pred_probs_bearish = y_pred_probs[:, 0]
    y_pred_probs_bullish = y_pred_probs[:, 1]
    outcome_df = pd.DataFrame({
        "y_tweets": y_tweets,
        "y_pred": y_pred.numpy(),
        "y_pred_probs_bearish": y_pred_probs_bearish.numpy(),
        "y_pred_probs_bullish": y_pred_probs_bullish.numpy(),
        "y_test": y_test.numpy()})
    outcome_df.to_csv(f"{path}twitter_sentiment_outcomes.csv")
else:
    outcome_df = pd.read_csv(f"{path}twitter_sentiment_outcomes.csv")
    y_tweets = outcome_df.y_tweets.tolist()
    y_pred = torch.tensor(outcome_df.y_pred.tolist())
    bearish_column = torch.tensor(outcome_df.y_pred_probs_bearish.tolist())
    bullish_column = torch.tensor(outcome_df.y_pred_probs_bullish.tolist())
    if torch.allclose(bearish_column + bullish_column,
                      torch.ones_like(bearish_column), atol=1e-3):
        # Current layout: each row holds the two class probabilities of one tweet,
        # so they sum to 1.
        y_pred_probs = torch.stack((bearish_column, bullish_column), dim=1)
    else:
        # Legacy layout written by earlier runs: each column is half of the
        # row-major flattened matrix. Rebuild the matrix the way the old loader did.
        y_pred_probs = torch.cat(
            (bearish_column.reshape(-1, 2), bullish_column.reshape(-1, 2)), 0)
    y_pred_probs_bearish = y_pred_probs[:, 0]
    y_pred_probs_bullish = y_pred_probs[:, 1]
    y_test = torch.tensor(outcome_df.y_test.tolist())

In [None]:
def show_confusion_matrix(confusion_matrix):
    """Draw `confusion_matrix` as an annotated blue heatmap with labelled axes.

    Args:
        confusion_matrix: Table of integer counts accepted by `sns.heatmap`
            (the cells are formatted with "d").
    """
    heatmap_axes = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")

    # Re-apply the existing tick labels: horizontal on the y axis, tilted 30 degrees
    # on the x axis, right-aligned on both.
    for axis, angle in ((heatmap_axes.yaxis, 0), (heatmap_axes.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')

    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')


# Build the confusion matrix of true test labels against predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Use class_names for both the index and the columns so the heatmap ticks show
# sentiment names instead of integer class ids.
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

In [None]:
# Print the text classification report for the test predictions, with the rows
# named after class_names.
print(classification_report(y_test, y_pred, target_names=class_names))