<a href="https://colab.research.google.com/github/AlexH20/Supervised-ML-sentiment-measures/blob/main/BERT_sentimentAR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install torch
!pip3 install transformers -i https://pypi.python.org/simple


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.python.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 15.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 81.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.5 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hu

In [None]:
import numpy as np
import torch
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
from google.colab import drive 
import pandas as pd
from sklearn.model_selection import train_test_split
import csv

drive.mount("/content/gdrive")

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = df['AR_dummy'].tolist()
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['Text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.1):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.bert.trainable = False
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.sigmoid(linear_output)

        return final_layer

def train(model, train_data, learning_rate, epochs):

    train = Dataset(train_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=8, shuffle=True)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f}')
            

def evaluate(model, test_data):

    test = Dataset(test_data)

    prediction = []

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=8)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
              prediction.append(output.argmax(dim=1).tolist())
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    
    return sum(prediction, [])

def split_months(dt):
    return [dt[dt["ordered_month"] == y] for y in dt["ordered_month"].unique()]

data = pd.read_csv("gdrive/My Drive/Thesis/processed data/CAR_regression/datasets_final/data_whole_woScAR.csv", index_col = False)
print(data)

data_onlytext = data[data["word_count"] != 0]
data_onlytext["Date"] = pd.to_datetime(data_onlytext["Date"])
data_onlytext["Year"] = [x.year for x in data_onlytext["Date"]]
data_onlytext["Month"] = [x.month for x in data_onlytext["Date"]]
data_onlytext["ordered_month"] = [((x[1]["Year"]-2015)*12 + x[1]["Month"]) for x in data_onlytext.iterrows()]

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
EPOCHS = 5
LR = 1e-5
             
data_splt_months = split_months(data_onlytext)

i = -1

np.random.seed(9000)
for _, month in enumerate(data_splt_months):

        i += 1

        data_train = pd.concat([data_splt_months[i], data_splt_months[i+1], data_splt_months[i+2]])
        print(data_train)
        data_test = data_splt_months[i+3]
        print(data_test)

        print(len(data_train))
        print(len(data_test))

        data_train["AR_dummy"] = np.where(data_train["AR"] > 0, 1, 0)
        data_test["AR_dummy"] = np.where(data_test["AR"] > 0, 1, 0)

        model = BertClassifier()
        train(model, data_train, LR, EPOCHS)
        pred = evaluate(model, data_test)

        if i+1 < len(data_splt_months):

            data_splt_months[i+3]["AR_BERT"] = pred

            with open("gdrive/My Drive/Thesis/processed data/CAR_regression/BERT_sentiment_dummy_R/" + str(i+1) + ".csv", "w") as csv_file:
                  
                  writer = csv.writer(csv_file)
                  writer.writerow(
                      ["Date", "Ticker", "Nasdaq", "Turnover", "Size", "BTM", "pref_alpha", "CAR", "Text", "AR", "AR_dummy", "AR_BERT"])
                  for index, row in data_splt_months[i+3].iterrows():
                      writer.writerow([row["Date"], row["Ticker"], row["Nasdaq"], row["Turnover"], row["Size"], row["BTM"], row["pref_alpha"], row["CAR"], row["Text"], row["AR"], row["AR_dummy"], row["AR_BERT"]])










Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
                       Date Ticker  Nasdaq  Turnover          Size       BTM  \
0       2015-01-02 00:00:00   AAPL       1  1.336802  6.370024e+08  0.187370   
1       2015-01-05 00:00:00   AAPL       1  1.344416  6.190077e+08  0.192817   
2       2015-01-06 00:00:00   AAPL       1  1.347419  6.190077e+08  0.192817   
3       2015-01-07 00:00:00   AAPL       1  1.345351  6.190660e+08  0.192799   
4       2015-01-08 00:00:00   AAPL       1  1.351682  6.277467e+08  0.190132   
...                     ...    ...     ...       ...           ...       ...   
127053  2019-12-24 00:00:00    PEP       1  0.496903  1.910934e+08  0.077376   
127054  2019-12-26 00:00:00    PEP       1  0.498080  1.906053e+08  0.077574   
127055  2019-12-27 00:00:00    PEP       1  0.494766  1.906053e+08  0.077574   
127056  2019-12-30 00:00:00    PEP       1  0.497787  1.903027e+08  0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

             Date Ticker  Nasdaq  Turnover          Size       BTM  \
0      2015-01-02   AAPL       1  1.336802  6.370024e+08  0.187370   
1      2015-01-05   AAPL       1  1.344416  6.190077e+08  0.192817   
2      2015-01-06   AAPL       1  1.347419  6.190077e+08  0.192817   
7      2015-01-13   AAPL       1  1.376746  6.363537e+08  0.187561   
8      2015-01-14   AAPL       1  1.384635  6.420037e+08  0.185910   
...           ...    ...     ...       ...           ...       ...   
119559 2015-03-16    GLW       0  1.066449  3.014349e+07  0.623285   
120809 2015-03-04    ADP       1  0.631143  4.214598e+07  0.114092   
120824 2015-03-25    ADP       1  0.662422  4.132159e+07  0.116368   
124599 2015-03-26    ACN       0  0.724509  5.540726e+07  0.110703   
124601 2015-03-30    ACN       0  0.724411  5.930794e+07  0.103422   

        pref_alpha       CAR  \
0         0.001312 -0.018882   
1         0.001142 -0.000292   
2         0.000912  0.012774   
7         0.000836  0.012517   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly i

Epochs: 1 | Train Loss:  0.087                 | Train Accuracy:  0.488


 92%|█████████▏| 107/116 [00:26<00:02,  4.08it/s]