<a href="https://colab.research.google.com/github/MichaelKazerooni/MarketSentimentAnalysis/blob/master/Market_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import pandas as pd
from tqdm.notebook import tqdm
df = pd.read_csv('market_sentiment.csv')
df.set_index('id', inplace = True)
# df = df[1:]
# df =df[~df.category.str.contains('\|')]
# df = df[(df['category'] != 'nocode')]
label_dict = {}
for idx, label in enumerate(df.sentiment.unique()):
    label_dict[label] = idx
df['sentiment'] = df.sentiment.replace(label_dict)

In [21]:
df.sentiment.value_counts()
# print(df.index.values)
df.head(5)

Unnamed: 0_level_0,sentiment,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,According to Gran the company has no plans to...
1,0,Technopolis plans to develop in stages an area...
2,1,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company s updated strategy fo...


In [22]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(df.index.values, df.sentiment.values, test_size = 0.15, random_state = 17, stratify = df.sentiment.values)
df['data_type'] = ['not_set']*df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'
df.groupby(['sentiment','data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,title
sentiment,data_type,Unnamed: 2_level_1
0,train,2689
0,val,475
1,train,786
1,val,139
2,train,1497
2,val,264


In [0]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
)
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type =='train'].title.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type == 'val'].title.values,
    add_special_tokens = True,
    return_attention_mask = True,
    pad_to_max_length = True,
    max_length = 256,
    return_tensors = 'pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].sentiment.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].sentiment.values)


dataset_train = TensorDataset(input_ids_train,
                             attention_masks_train,labels_train)
dataset_val = TensorDataset(input_ids_val,
                           attention_masks_val,
                            labels_val)

In [96]:
pip install transformers




In [24]:
print(df.data_type.value_counts())
print(attention_masks_val)

train    4972
val       878
Name: data_type, dtype: int64
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


In [0]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = len(label_dict),
    output_attentions = False,
    output_hidden_states = False
)

In [0]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
batch_size = 32
dataloader_train = DataLoader(
    dataset_train,
    sampler = RandomSampler(dataset_train),
    batch_size = batch_size
)

dataloader_val = DataLoader(
    dataset_val,
    sampler = RandomSampler(dataset_val),
    batch_size = 32
)

In [0]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8
)

epochs = 10
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = len(dataloader_train)*epochs
)

In [0]:
import numpy as np
from sklearn.metrics import f1_score


def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v:k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis = 1).flatten()
    # print(preds_flat)
    # print(labels)
    labels_flat = labels.flatten()
    print(preds_flat)
    print(labels_flat)
    for label in np.unique(labels_flat):
        y_pred = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_pred[y_pred==label])}/{len(y_true)}\n')

In [30]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)



def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(dataloader_val):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

cuda


In [31]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    progress_bar = tqdm(dataloader_train,
                       desc ="Epoch {:1d}".format(epoch),
                       leave = False,
                       disable = False)
    for batch in progress_bar:
        # print(len(batch))
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids'      : batch[0],
            'attention_mask' : batch[1],
            'labels'         : batch[2]
        }
        
        output = model(**inputs)
        loss = output[0]
        loss_train_total += loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
    torch.save(model.state_dict(), f'market_sentiment_models/BERT_ft_epoch{epoch}.model')
    
    tqdm.write('\nEpoch {epoch}')
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'validation loss: {val_loss}')
    tqdm.write(f'F1 score (weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=156.0, style=ProgressStyle(description_widt…


Epoch {epoch}
Training loss: 0.7654145915920918


HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))


validation loss: 0.5149470376116889
F1 score (weighted): 0.8023036371255607


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=156.0, style=ProgressStyle(description_widt…


Epoch {epoch}
Training loss: 0.4354605359526781


HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))


validation loss: 0.4233129045792988
F1 score (weighted): 0.8264670351321023


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=156.0, style=ProgressStyle(description_widt…


Epoch {epoch}
Training loss: 0.3195293999444216


HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))


validation loss: 0.4554873670318297
F1 score (weighted): 0.8182932333990506


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=156.0, style=ProgressStyle(description_widt…


Epoch {epoch}
Training loss: 0.2365781141874882


HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))


validation loss: 0.4616622509700911
F1 score (weighted): 0.8312343955842441


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=156.0, style=ProgressStyle(description_widt…


Epoch {epoch}
Training loss: 0.17966692419483876


HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))


validation loss: 0.5056747285915273
F1 score (weighted): 0.8332054683469992


HBox(children=(FloatProgress(value=0.0, description='Epoch 6', max=156.0, style=ProgressStyle(description_widt…


Epoch {epoch}
Training loss: 0.1298908106266306


HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))


validation loss: 0.5490140489169529
F1 score (weighted): 0.8280969981950327


HBox(children=(FloatProgress(value=0.0, description='Epoch 7', max=156.0, style=ProgressStyle(description_widt…


Epoch {epoch}
Training loss: 0.1036614890759572


HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))


validation loss: 0.6003806420734951
F1 score (weighted): 0.8258330985865742


HBox(children=(FloatProgress(value=0.0, description='Epoch 8', max=156.0, style=ProgressStyle(description_widt…


Epoch {epoch}
Training loss: 0.08748362147106001


HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))


validation loss: 0.6223430250372205
F1 score (weighted): 0.8160399149173825


HBox(children=(FloatProgress(value=0.0, description='Epoch 9', max=156.0, style=ProgressStyle(description_widt…


Epoch {epoch}
Training loss: 0.06647376978817658


HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))


validation loss: 0.6591586014255881
F1 score (weighted): 0.8293609736629572


HBox(children=(FloatProgress(value=0.0, description='Epoch 10', max=156.0, style=ProgressStyle(description_wid…


Epoch {epoch}
Training loss: 0.06425875321460459


HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))


validation loss: 0.6647249524082456
F1 score (weighted): 0.8269291053923817



In [0]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels = len(label_dict),
                                                      output_attentions = False,
                                                      output_hidden_states = False
)

In [0]:
model.to(device)
pass

In [36]:
model.load_state_dict(torch.load('market_sentiment_models/BERT_ft_epoch8.model'))
_ , predictions , true_vals = evaluate(dataloader_val)
accuracy_per_class(predictions, true_vals)

HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))


[1 1 0 0 0 2 0 1 1 0 0 0 0 2 2 2 0 0 0 2 0 0 0 1 0 1 0 2 0 2 2 0 0 0 0 2 0
 0 2 1 2 1 0 0 2 1 0 0 0 0 0 0 2 0 0 2 0 2 0 1 0 2 2 0 0 0 0 0 0 0 0 2 0 2
 2 1 0 1 0 2 0 2 0 0 0 1 2 2 0 0 1 2 2 0 0 2 1 0 0 0 0 2 2 0 2 0 2 2 0 2 2
 0 0 1 0 1 0 0 1 2 0 0 2 0 0 2 2 2 0 1 0 0 0 0 0 1 1 2 0 0 0 2 2 2 1 0 2 1
 0 1 2 1 1 0 0 2 0 2 2 0 1 1 0 0 1 0 0 2 0 2 0 0 0 0 0 0 2 0 2 1 2 1 2 0 0
 0 0 0 0 0 1 2 0 0 0 1 0 0 2 0 0 0 2 2 0 2 0 2 0 1 0 0 0 1 2 0 2 0 0 1 2 0
 1 2 0 0 2 0 0 1 0 2 1 0 0 2 1 2 2 2 1 2 2 0 2 2 2 2 2 2 2 0 1 2 2 2 0 2 1
 2 0 0 0 2 2 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 2 0 0
 2 2 0 2 0 2 1 1 0 2 2 0 2 2 2 2 2 2 0 2 0 0 0 0 2 1 0 0 0 2 1 0 2 2 0 0 0
 0 0 1 2 0 0 0 0 1 2 0 0 0 2 1 2 0 2 1 0 2 0 2 0 2 1 2 0 0 0 2 1 0 0 0 2 0
 0 0 0 0 0 2 0 0 0 2 0 0 2 2 2 0 1 2 0 2 0 0 0 1 0 1 2 0 2 2 1 0 0 1 0 0 0
 0 2 0 2 1 0 2 0 2 2 2 2 0 0 0 1 1 0 0 0 0 2 2 1 0 0 2 2 2 2 0 1 0 1 0 0 0
 0 1 0 0 2 1 2 1 0 2 1 0 0 1 1 0 0 0 2 0 1 1 0 2 2 1 1 2 0 1 2 0 2 0 0 2 1
 2 2 2 0 0 2 2 2 1 0 0 2

[[ 2.6347425e+00 -2.3178182e+00  5.0645918e-01]
 [-1.3902928e+00  8.1331462e-01  5.2205586e-01]
 [ 2.4809678e+00 -2.4942782e+00  5.3023237e-01]
 ...
 [ 2.4051940e+00 -2.3796372e+00  4.4714141e-01]
 [ 2.3079386e+00 -2.4101739e+00  5.1296854e-01]
 [-4.4864681e-01  6.7036778e-01 -6.5131101e-04]]
