In [None]:
# BERT model (work in progress)

In [None]:
# Install Hugging Face `transformers`; the import cell below pulls BertModel,
# BertTokenizer, AdamW and get_linear_schedule_with_warmup from it.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from textwrap import wrap

In [None]:
# Set the global Keras (TensorFlow) mixed-precision policy to float16.
# NOTE(review): the classifier in this notebook is a torch nn.Module, so this
# TensorFlow policy presumably has no effect on its training — confirm whether
# this cell is still needed.
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')

In [None]:
# Initialization: experiment constants, RNG seeding and device selection.
RANDOM_SEED = 42   # shared by NumPy, PyTorch and train_test_split
NCLASSES = 2       # size of the classifier's output layer
MAX_LEN = 200      # token length every encoded text is truncated/padded to
BATCH_SIZE = 16    # examples per DataLoader batch
DATASET_PATH = "/content/drive/My Drive/TesisProject/apis/tweet_data.csv"
# Alternative path kept from the original notebook (tagged "Raúl"):
#DATASET_PATH = "/content/drive/My Drive/02_Thesis/TesisProject/apis/tweet_data.csv"

# Seed both random number generators used by the notebook.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# Load dataset
# Mount Google Drive at /content/drive (Colab-only API) so that DATASET_PATH,
# which lives under that mount point, can be read.
from google.colab import drive
drive.mount('/content/drive')

# Read the CSV at DATASET_PATH into a DataFrame.
df = pd.read_csv(DATASET_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Recode the label -1 as 0; every other value of the Sentiment column is left as is.
df['Sentiment'] = df['Sentiment'].replace({-1: 0})
# Show the frame and its (rows, columns) shape as a quick sanity check.
print(df)
print(df.shape)

                                                   Text  Sentiment
0     Kickers on my watchlist XIDE TIT SOQ PNK CPW B...          1
1     user: AAP MOVIE. 55% return for the FEA/GEED i...          1
2     user I'd be afraid to short AMZN - they are lo...          1
3                                     MNTA Over 12.00            1
4                                      OI  Over 21.37            1
...                                                 ...        ...
5786  Industry body CII said #discoms are likely to ...          0
5787  #Gold prices slip below Rs 46,000 as #investor...          0
5788  Workers at Bajaj Auto have agreed to a 10% wag...          1
5789  #Sharemarket LIVE: Sensex off day’s high, up 6...          1
5790  #Sensex, #Nifty climb off day's highs, still u...          1

[5791 rows x 2 columns]
(5791, 2)


In [None]:
# TOKENIZATION
# Name of the pretrained checkpoint; BERTSentimentClassifier loads its encoder
# from this same name.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
# Tokenizer belonging to that checkpoint.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# DATASET CREATION

class IMDBDataset(Dataset):
  """Map-style dataset that tokenizes one text per item.

  Args:
    reviews: Indexable sequence of texts (each item is passed through ``str``).
    labels: Indexable sequence of integer class labels aligned with ``reviews``.
    tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
    max_len: Length every encoded sequence is truncated/padded to.
  """

  def __init__(self,reviews,labels,tokenizer,max_len):
    self.reviews = reviews
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of texts in the dataset."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Encode the text at position ``item``.

    Returns:
      dict with keys ``'review'`` (the raw text), ``'input_ids'`` and
      ``'attention_mask'`` (flattened tensors from the tokenizer) and
      ``'label'`` (a ``torch.long`` scalar tensor).
    """
    review = str(self.reviews[item])
    label = self.labels[item]
    # Use the tokenizer handed to this instance rather than a module-level
    # global, so the dataset works with whichever tokenizer it was built with.
    encoding = self.tokenizer.encode_plus(
        review,
        max_length = self.max_len,
        truncation = True,
        add_special_tokens = True,
        return_token_type_ids = False,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt'
        )

    return {
          'review': review,
          # flatten() turns the tokenizer's per-text tensors into 1-D tensors
          # so the DataLoader can stack them into a batch.
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'label': torch.tensor(label, dtype=torch.long)
      }

In [None]:
# Data loader:

def data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a DataFrame in an ``IMDBDataset`` and return a ``DataLoader`` over it.

  Args:
    df: DataFrame with a ``Text`` column (inputs) and a ``Sentiment`` column (labels).
    tokenizer: Tokenizer forwarded to ``IMDBDataset``.
    max_len: Sequence length forwarded to ``IMDBDataset``.
    batch_size: Number of examples per batch.
    shuffle: Whether the loader reshuffles the data every epoch. Defaults to
      ``False``, which keeps the previous (unshuffled) behaviour.
    num_workers: Number of loader worker processes. Defaults to 4, the value
      that used to be hard-coded.

  Returns:
    A ``torch.utils.data.DataLoader`` over the wrapped dataset.
  """
  dataset = IMDBDataset(
      reviews = df.Text.to_numpy(),
      labels = df.Sentiment.to_numpy(),
      tokenizer = tokenizer,
      # Honour the arguments instead of the MAX_LEN / BATCH_SIZE globals.
      max_len = max_len
  )

  return DataLoader(
      dataset,
      batch_size = batch_size,
      shuffle = shuffle,
      num_workers = num_workers
  )

In [None]:
# Hold out 20% of the rows as the test split; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size = 0.2,
    random_state = RANDOM_SEED
)

# One DataLoader per split, built with the same tokenizer and batching settings.
train_data_loader = data_loader(
    df = df_train, tokenizer = tokenizer, max_len = MAX_LEN, batch_size = BATCH_SIZE
)
test_data_loader = data_loader(
    df = df_test, tokenizer = tokenizer, max_len = MAX_LEN, batch_size = BATCH_SIZE
)

  cpuset_checked))


In [None]:
# THE MODEL

class BERTSentimentClassifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear classification layer.

  Args:
    n_classes: Number of output logits produced by the linear layer.
  """

  def __init__(self, n_classes):
    super().__init__()
    # return_dict=False makes the encoder return a plain tuple (unpacked in forward).
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)
    self.drop = nn.Dropout(p=0.3)
    self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return one row of ``n_classes`` logits per input sequence."""
    # Only the second element of the encoder's output pair is used
    # (presumably BERT's pooled [CLS] representation — confirm against the
    # transformers documentation).
    _sequence_output, pooled_output = self.bert(
        input_ids = input_ids,
        attention_mask = attention_mask
    )
    return self.linear(self.drop(pooled_output))

In [None]:
# Build the classifier with NCLASSES outputs and move it to the selected device.
model = BERTSentimentClassifier(NCLASSES).to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Print the module tree of the classifier to inspect its architecture.
print(model)

BERTSentimentClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [None]:
# TRAINING
# Number of full passes over the training loader.
EPOCHS = 5
# NOTE(review): AdamW is imported from `transformers` in this notebook; newer
# transformers releases reportedly deprecate it in favour of torch.optim.AdamW
# — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# train_model calls scheduler.step() once per batch, so the schedule length is
# (batches per epoch) * (number of epochs).
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)
# Cross-entropy over the classifier's logits and the integer class labels.
loss_fn = nn.CrossEntropyLoss().to(device)



In [None]:
# Training iteration
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Run one training epoch over ``data_loader``.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)`` and
      returning per-class scores with classes along dim 1.
    data_loader: Iterable of batches; each batch is a mapping with the keys
      ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
    loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
    optimizer: Optimizer stepped once per batch.
    device: Device the batch tensors are moved to.
    scheduler: Learning-rate scheduler stepped once per batch.
    n_examples: Denominator used for the accuracy.

  Returns:
    Tuple ``(accuracy, mean_loss)``: ``accuracy`` is a double tensor equal to
    correct predictions / ``n_examples``; ``mean_loss`` is the mean of the
    per-batch losses, or NaN when the loader yields no batches.
  """
  model = model.train()
  losses = []
  # Tensor accumulator: keeps the return type a tensor even when the loader is
  # empty (a plain int has no .double()).
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  for batch in data_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    # Clear gradients before the forward/backward pass so that gradients left
    # over from earlier code cannot leak into this step.
    optimizer.zero_grad()
    outputs = model(input_ids = input_ids, attention_mask = attention_mask)
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, labels)
    correct_predictions += torch.sum(preds == labels)
    losses.append(loss.item())
    loss.backward()
    # Cap the global gradient norm at 1.0 before the parameter update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
  mean_loss = np.mean(losses) if losses else float('nan')
  return correct_predictions.double()/n_examples, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate ``model`` over ``data_loader`` without computing gradients.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)`` and
      returning per-class scores with classes along dim 1.
    data_loader: Iterable of batches; each batch is a mapping with the keys
      ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
    loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
    device: Device the batch tensors are moved to.
    n_examples: Denominator used for the accuracy.

  Returns:
    Tuple ``(predsList, labelList, accuracy, mean_loss)``: the first two are
    flat Python lists of predicted and true labels in loader order;
    ``accuracy`` is a double tensor equal to correct predictions /
    ``n_examples``; ``mean_loss`` is the mean of the per-batch losses, or NaN
    when the loader yields no batches.
  """
  model = model.eval()
  losses = []
  predsList = []
  labelList = []
  # Tensor accumulator: keeps the return type a tensor even when the loader is
  # empty (a plain int has no .double()).
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)
      outputs = model(input_ids = input_ids, attention_mask = attention_mask)
      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, labels)
      # One bulk conversion per batch instead of an .item() call per element.
      predsList.extend(preds.tolist())
      labelList.extend(labels.tolist())
      correct_predictions += torch.sum(preds == labels)
      losses.append(loss.item())
  mean_loss = np.mean(losses) if losses else float('nan')
  return predsList, labelList, correct_predictions.double()/n_examples, mean_loss


In [None]:
# Display the held-out test split for inspection.
df_test

Unnamed: 0,Text,Sentiment
1891,MCP take-over chatter... (I know don't laugh...),0
1550,"AMZN 1,200 lot bid in the Feb weekly 255P. 27 ...",0
1049,KO made with sugar is sold at local COST. Cons...,1
2523,HNZ Another American Institution sold to forei...,0
156,Will watch the close carefully then decide whe...,0
...,...,...
5684,"Sensex jumps over 750 points to cross 29,000 m...",1
3090,VBD may be about move back up,1
203,"CSX enko view, PF Box size = 1, VEY bullish a...",1
339,AYI Q1 Operational Cash Flow turns Negative,0


In [None]:
# Training!!!

for epoch in range(EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  # One optimisation pass over the training split.
  train_acc, train_loss = train_model(
      model = model,
      data_loader = train_data_loader,
      loss_fn = loss_fn,
      optimizer = optimizer,
      device = device,
      scheduler = scheduler,
      n_examples = len(df_train)
  )
  # Evaluate on the test split after every epoch; the values from the last
  # epoch stay bound to these names after the loop ends.
  predsValues, labelValues, test_acc, test_loss = eval_model(
      model = model,
      data_loader = test_data_loader,
      loss_fn = loss_fn,
      device = device,
      n_examples = len(df_test)
  )
  print('Entrenamiento: Loss: {}, accuracy: {}'.format(train_loss, train_acc))
  print('Validación: Loss: {}, accuracy: {}'.format(test_loss, test_acc))
  print('')

Epoch 1 de 5
------------------


  cpuset_checked))


Entrenamiento: Loss: 0.6701094516392412, accuracy: 0.6135578583765112
Validación: Loss: 0.5475429357731179, accuracy: 0.727351164797239

Epoch 2 de 5
------------------




Entrenamiento: Loss: 0.5149397865451616, accuracy: 0.7411485319516408
Validación: Loss: 0.4693682508109367, accuracy: 0.8041415012942192

Epoch 3 de 5
------------------




Entrenamiento: Loss: 0.3216607441547616, accuracy: 0.8685233160621761
Validación: Loss: 0.5316443167473763, accuracy: 0.8153580672993961

Epoch 4 de 5
------------------




Entrenamiento: Loss: 0.1950465888724309, accuracy: 0.9337219343696027
Validación: Loss: 0.7140983776380755, accuracy: 0.817083692838654

Epoch 5 de 5
------------------




Entrenamiento: Loss: 0.12077110255345831, accuracy: 0.9654576856649395
Validación: Loss: 0.7413933568072748, accuracy: 0.825711820534944



In [None]:
import math

def p0_calc(tn, tp, fn, fp):
  """Observed agreement: share of all samples counted as true negatives or true positives."""
  agreements = tn + tp
  total = tn + fp + fn + tp
  return agreements / total

def pe_calc(tn, tp, fn, fp):
  """Chance agreement used as the baseline in ``kappa_metric``.

  Sum of two products of confusion-matrix marginals, each expressed as a
  fraction of the total sample count: one product for the negative class and
  one for the positive class.
  """
  total = tn + fp + fn + tp
  # Negative class: (tn + fp) / total times (tn + fn) / total.
  negative_term = ((tn + fp) / total) * ((tn + fn) / total)
  # Positive class: (fn + tp) / total times (fp + tp) / total.
  positive_term = ((fn + tp) / total) * ((fp + tp) / total)
  return negative_term + positive_term

def kappa_metric(tn, tp, fn, fp):
  """Kappa statistic: observed agreement corrected by the chance agreement.

  Computed as ``(p0 - pe) / (1 - pe)``, with ``p0`` from ``p0_calc`` and
  ``pe`` from ``pe_calc``.
  """
  observed = p0_calc(tn, tp, fn, fp)
  expected = pe_calc(tn, tp, fn, fp)
  gain_over_chance = observed - expected
  return gain_over_chance / (1 - expected)

def _safe_div(numerator, denominator):
  """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
  return numerator / denominator if denominator else 0.0

# Confusion-matrix counts over the predictions of the last evaluation
# (label 1 is treated as the positive class, label 0 as the negative class).
truePositive = 0
trueNegative = 0
falsePositive = 0
falseNegative = 0

for predicted, actual in zip(predsValues, labelValues):
  if predicted == actual and actual == 1:
    truePositive += 1
  elif predicted == actual and actual == 0:
    trueNegative += 1
  elif predicted != actual and actual == 1:
    falseNegative += 1
  elif predicted != actual and actual == 0:
    falsePositive += 1

# The ratios fall back to 0.0 instead of raising ZeroDivisionError when a
# denominator is empty (e.g. the model never predicts the positive class).
precision = _safe_div(truePositive, truePositive + falsePositive)
recall = _safe_div(truePositive, truePositive + falseNegative)
f1 = _safe_div(2 * (recall * precision), recall + precision)

# Matthews correlation coefficient:
#   (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))
# The numerator subtracts the PRODUCT of the two error counts, not their sum.
mcc_denominator = math.sqrt(
    (truePositive + falsePositive)
    * (truePositive + falseNegative)
    * (trueNegative + falsePositive)
    * (trueNegative + falseNegative)
)
mcc = _safe_div(
    truePositive * trueNegative - falsePositive * falseNegative,
    mcc_denominator
)


print("Accuracy: ", test_acc.item())
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", f1)
print("MCC: ", mcc)
print("Kappa: ", kappa_metric(trueNegative, truePositive, falseNegative, falsePositive))

Accuracy:  0.825711820534944
Precision:  0.8523936170212766
Recall:  0.8756830601092896
F1:  0.8638814016172507
MCC:  0.6542386404334036
Kappa:  0.621797398502168


In [None]:
def classifySentiment(review_text):
  """Classify one text with the notebook's ``model`` and print the result.

  The text is encoded with the module-level ``tokenizer`` (truncated/padded to
  ``MAX_LEN``) and run through ``model`` on ``device``. The wrapped text is
  printed, followed by a five-star line when the predicted class index is
  non-zero and a one-star line otherwise.

  Args:
    review_text: The text to classify.

  Returns:
    None. The result is only printed.
  """
  encoding_review = tokenizer.encode_plus(
      review_text,
      max_length = MAX_LEN,
      truncation = True,
      add_special_tokens = True,
      return_token_type_ids = False,
      # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
      padding = 'max_length',
      return_attention_mask = True,
      return_tensors = 'pt'
      )

  input_ids = encoding_review['input_ids'].to(device)
  attention_mask = encoding_review['attention_mask'].to(device)

  # Inference only: switch to eval mode (disables dropout) and skip autograd
  # bookkeeping, then restore whatever mode the model was in before the call.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      output = model(input_ids, attention_mask)
  finally:
    model.train(was_training)

  _, prediction = torch.max(output, dim=1)
  print("\n".join(wrap(review_text)))
  if prediction.item():
    print('Sentimiento predicho: * * * * *')
  else:
    print('Sentimiento predicho: *')

In [None]:
# Try the classifier on a single hand-written sentence.
review_text = "Avengers: Infinity War at least had the good taste to abstain from Jeremy Renner. No such luck in Endgame."

classifySentiment(review_text)

Avengers: Infinity War at least had the good taste to abstain from
Jeremy Renner. No such luck in Endgame.
Sentimiento predicho: *




In [None]:
# Display the evaluation loss left over from the last epoch of the training loop.
test_loss

0.7413933568072748