In [None]:
# Mount Google Drive at /content/drive so the project files are reachable (needs the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#cd drive/MyDrive/

In [None]:
cd drive/MyDrive/univer/Expociencia/Code/Beto

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
from transformers import BertTokenizerFast, BertModel
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import torch
from sklearn.metrics import f1_score
import time
from sklearn.metrics import accuracy_score, mean_absolute_error
from pathlib import Path
from torch.nn import functional as F
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Dataset tag. The CSV names below contain the same string but are hard-coded.
name_dataset = "yelp_clean"
# Train and validation splits are disabled here; only the test split is loaded.
#train_data = pd.read_csv('train_yelp_clean.csv')
#val_data = pd.read_csv('val_yelp_clean.csv')
test_data = pd.read_csv('test_yelp_clean.csv')

In [None]:
# Shift every score down by one.
# NOTE: the column is overwritten in place, so running this cell twice shifts the labels twice.
#train_data['Score'] = train_data['Score'].apply(lambda x: x-1) 
#val_data['Score'] = val_data['Score'].apply(lambda x: x-1) 
test_data['Score'] = test_data['Score'].sub(1)

In [None]:
# --- Run configuration -------------------------------------------------------
MAX_LEN = 128        # number of tokens every review is padded / truncated to
BATCH_SIZE = 32
NCLASSES= 5          # size of the classifier output layer
DropOut = 0.1        # dropout probability applied before the output layer
EPOCHS= 1
LEARNING_RATE=2e-5
is_with_pretrain = False
NAME_BERT_MODEL="dccuchile/bert-base-spanish-wwm-cased"
# Run tag: train_model and the epoch loop build their metrics log file name from it,
# so it has to exist even when the log file is not pre-created here.
NAME="bert-base-spanish-wwm-cased"+"_"+name_dataset
if is_with_pretrain:
    NAME+="_with_pretrain"+"_"+name_dataset
#myfile = Path(f'{NAME} LR {LEARNING_RATE}.txt')
#myfile.touch(exist_ok=True)
# Shared encoder instance; BERTModel wraps this very object.
BERT = BertModel.from_pretrained(NAME_BERT_MODEL)
RANDOM_SEED =  42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first GPU when one is available, otherwise fall back to the CPU.
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Load the tokenizer for the same checkpoint name as the encoder, so the two cannot drift apart.
tokenizer = BertTokenizerFast.from_pretrained(NAME_BERT_MODEL)

In [None]:
#if is_with_pretrain:
#    loaded_checkpoint = torch.load('Pretrined_NewBERT_CLEAN_weightModelBERT.pth', map_location=device)
#    BERT.load_state_dict(loaded_checkpoint['weight_model'])
#    print('ok')

In [None]:
class LoadDataset(Dataset):
    """Map-style dataset that turns raw reviews into fixed-length BERT inputs.

    Every review is tokenized, wrapped as ``[CLS] ... [SEP]`` and then either
    padded with ``[PAD]`` or truncated so that it holds exactly ``max_len``
    tokens. All items therefore share one shape and can be batched.

    Args:
        reviews: Indexable collection of review texts.
        labels: Indexable collection of class labels aligned with ``reviews``.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Number of tokens in every returned sequence.
        mode_truncation: How over-long reviews are shortened: ``"head"``,
            ``"head+tail"`` or ``"tail"``.
    """

    def __init__(self, reviews, labels, tokenizer, max_len, mode_truncation):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.mode_truncation = mode_truncation

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def _truncate(self, tokens):
        """Shorten ``tokens`` (longer than ``max_len``) to exactly ``max_len`` tokens.

        Raises:
            ValueError: If ``mode_truncation`` is not a supported mode.
        """
        if self.mode_truncation == "head":
            # Keep the beginning and re-append the separator that was cut off.
            return tokens[:self.max_len - 1] + ['[SEP]']
        if self.mode_truncation == "head+tail":
            # Split the budget so both halves always add up to max_len,
            # including when max_len is odd.
            n_head = self.max_len // 2 + 1
            n_tail = self.max_len - n_head
            return tokens[:n_head] + tokens[len(tokens) - n_tail:]
        if self.mode_truncation == "tail":
            # Keep the end (which already finishes with [SEP]) behind a fresh [CLS].
            return ['[CLS]'] + tokens[len(tokens) - (self.max_len - 1):]
        raise ValueError(
            "mode_truncation must be 'head', 'head+tail' or 'tail', got {!r}".format(
                self.mode_truncation))

    def __getitem__(self, item):
        """Return the encoded review at position ``item``.

        Returns:
            dict with the raw ``review`` text, ``input_ids`` and
            ``attention_mask`` (1-D long tensors of length ``max_len``) and the
            ``label`` as a long tensor.
        """
        review = str(self.reviews[item])
        label = self.labels[item]
        tokens = ['[CLS]'] + self.tokenizer.tokenize(review) + ['[SEP]']
        if len(tokens) > self.max_len:
            tokens = self._truncate(tokens)
        n_real = len(tokens)
        tokens = tokens + ['[PAD]'] * (self.max_len - n_real)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(token_ids)
        # Mark real tokens by position, so the mask does not depend on which
        # vocabulary id the tokenizer assigns to [PAD].
        attn_mask = torch.zeros(len(tokens), dtype=torch.long)
        attn_mask[:n_real] = 1
        return{
            'review':review,
            'input_ids':tokens_ids_tensor.flatten(),
            'attention_mask':attn_mask.flatten(),
            'label':torch.tensor(label, dtype=torch.long)}

def data_loader(df, tokenizer, max_len, batch_size, modo):
  """Wrap a review DataFrame in a ``DataLoader`` of fixed-length BERT inputs.

  Args:
      df: DataFrame with a ``Text`` column (reviews) and a ``Score`` column (labels).
      tokenizer: Tokenizer handed to ``LoadDataset``.
      max_len: Token length of every encoded review.
      batch_size: Number of reviews per batch.
      modo: Truncation mode forwarded to ``LoadDataset`` as ``mode_truncation``.

  Returns:
      A ``DataLoader`` over the encoded reviews, using two worker processes.
  """
  dataset = LoadDataset(
    reviews=df.Text.to_numpy(),
    labels = df.Score.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len,
    mode_truncation = modo
  )
  # Honour the caller's batch_size instead of the module-level BATCH_SIZE.
  return DataLoader(dataset, batch_size=batch_size, num_workers=2)

In [None]:
# Only the test loader is built; the train/validation loaders are disabled together with their data.
# Reviews longer than MAX_LEN keep their beginning and their end ("head+tail").
#train_data_loader = data_loader(train_data, tokenizer, MAX_LEN, BATCH_SIZE, "head+tail")
#validation_data_loader = data_loader(val_data, tokenizer, MAX_LEN, BATCH_SIZE, "head+tail")
test_data_loader = data_loader(test_data, tokenizer, MAX_LEN, BATCH_SIZE, "head+tail")

In [None]:
class BERTModel(nn.Module):
    """Classifier head on top of the shared encoder: BERT -> dropout -> linear logits.

    Args:
        n_class: Number of output classes (size of the linear layer's output).
    """

    def __init__(self, n_class):
        super().__init__()
        # The encoder is the module-level BERT object itself, not a copy.
        self.bert = BERT
        self.do = nn.Dropout(DropOut)
        hidden_size = self.bert.config.hidden_size
        self.linear = nn.Linear(hidden_size, n_class)
        #self.softmax = torch.nn.Softmax(dim=1)
        nn.init.xavier_uniform_(self.linear.weight)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores for a batch of encoded reviews."""
        encoder_outputs = self.bert(
            input_ids = input_ids,
            attention_mask = attention_mask,
            return_dict=False
        )
        # Only the second element of the encoder's output tuple is used
        # (presumably the pooled [CLS] representation -- confirm against transformers docs).
        _, cls_output = encoder_outputs
        regularised = self.do(cls_output)
        return self.linear(regularised)

In [None]:
# Build the classifier with NCLASSES outputs and move it to the selected device.
model = BERTModel(NCLASSES)
model.to(device)
print('ok')

In [None]:
# Cross-entropy over the raw class scores returned by the model.
loss_fn = nn.CrossEntropyLoss().to(device)
#optimizer = torch.optim.AdamW(model.parameters(),  eps=1e-6, betas=(0.9, 0.99), weight_decay=0.01, lr=LEARNING_RATE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Disabled linear warm-up schedule. Kept as comments rather than a bare string
# literal, which the notebook would echo as this cell's output.
#total_steps = len(train_data_loader)*EPOCHS
#
#scheduler = get_linear_schedule_with_warmup(
#  optimizer,
#  num_warmup_steps=10000,
#  num_training_steps = total_steps
#)

In [None]:
#int(len(train_data)/BATCH_SIZE)

In [None]:
def train_model (model, data_loader, loss_fn, optimizer, device, epoch, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Running per-batch metrics are printed every 5 batches. Every 1000 batches
    from batch 10000 on, and at batch ``int(n_examples/BATCH_SIZE)-1``, the
    model is validated, its weights are saved and a report is appended to the
    metrics log file.

    Besides its arguments the function reads these notebook-level names:
    ``BATCH_SIZE``, ``validation_data_loader``, ``val_data``, ``eval_model``,
    ``NAME``, ``LEARNING_RATE`` and ``NAME_BERT_MODEL``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches with ``input_ids``, ``attention_mask`` and ``label``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning the loss tensor.
        optimizer: Optimizer updating the parameters of ``model``.
        device: Device the batch tensors are moved to.
        epoch: Epoch index, used only in the saved weight file name.
        n_examples: Number of training examples, used for progress display
            and to locate the final checkpoint batch.

    Returns:
        Tuple ``(mean accuracy, mean loss, mean macro F1, mean weighted F1, history)``
        where ``history`` maps ``loss``, ``accuracy``, ``f1_score`` and
        ``f1_weight`` to their per-batch lists.
    """
    model = model.train()
    losses = []
    accuracy_global = []
    f1_score_global = []
    f1_weight_global = []

    for i, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        # Convert once per batch; scikit-learn metrics take (y_true, y_pred) in
        # that order, which matters for the support weights of the weighted F1.
        y_true = labels.detach().cpu().numpy()
        y_pred = preds.detach().cpu().numpy()
        f1_score_global.append(np.mean(f1_score(y_true, y_pred, average=None)))
        f1_weight_global.append(np.mean(f1_score(y_true, y_pred, average='weighted')))
        accuracy_global.append(accuracy_score(y_true, y_pred))
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        #scheduler.step()
        optimizer.zero_grad()

        if i%5==0:
            print('Ejemplo {}/{} , Entrenamiento: Loss: {}, accuracy: {} -- f1_score: {} -- f1 weight: {} '.format(
                i, int(n_examples/BATCH_SIZE),
                np.mean(losses), np.mean(accuracy_global), np.mean(f1_score_global),
                np.mean(f1_weight_global)))

        if (i%1000==0 and i>9999) or (i == int(n_examples/BATCH_SIZE)-1):
            validation_acc, validation_loss, validation_f1, val_weight, history_val = eval_model(
                model, validation_data_loader, loss_fn, device, len(val_data), 'Validación'
            )
            # eval_model leaves the model in eval mode; switch back so dropout
            # stays active for the remaining training batches.
            model.train()
            print('Entrenamiento: Loss: {}, accuracy: {} -- f1_score: {} -- f1 weight: {} '.format(
                np.mean(losses), np.mean(accuracy_global), np.mean(f1_score_global),
                np.mean(f1_weight_global)))
            print('Validación: Loss: {}, accuracy: {} -- f1_score: {} -- f1 weight: {}'.format(
                validation_loss, validation_acc, validation_f1, val_weight))
            weight = {'weight_model':model.state_dict()}
            torch.save(weight, f'epoch_{epoch}_bert_{i}_weightModelBERT.pth')

            # Append the running train metrics and the validation metrics to the run's log file.
            with open (f'{NAME} LR {LEARNING_RATE}.txt','a') as f:
                f.write(f'''    name: {NAME_BERT_MODEL}
                Iteracion : {i}
                Train:
                   -loss: {np.mean(losses)}, 
                   Accuracy: mean: {np.mean(accuracy_global)}, std: {np.std(accuracy_global)}, var: {np.std(accuracy_global)**2}  
                   f1_score: {np.mean(f1_score_global)}, std: {np.std(f1_score_global)}, var: {np.std(f1_score_global)**2}
                   f1_weight: {np.mean(f1_weight_global)}, std: {np.std(f1_weight_global)}, var: {np.std(f1_weight_global)**2}

                Validation
                   -loss: {validation_loss}, 
                   Accuracy: mean: {validation_acc}, std: {np.std(history_val['accuracy'])}, var: {np.std(history_val['accuracy'])**2}  
                   f1_score: {validation_f1}, std: {np.std(history_val['f1_score'])}, var: {np.std(history_val['f1_score'])**2}
                   f1_weight: {val_weight}, std: {np.std(history_val['f1_weight'])}, var: {np.std(history_val['f1_weight'])**2}

                ---------------------------------    
    ''')

    history = {'loss':losses,
               'accuracy': accuracy_global,
               'f1_score': f1_score_global,
               'f1_weight':f1_weight_global}
    return np.mean(accuracy_global), np.mean(losses), np.mean(f1_score_global), np.mean(f1_weight_global), history

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples, modo):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    The model is switched to eval mode and left in it. Running per-batch
    metrics are printed every 5 batches. Reads the notebook-level
    ``BATCH_SIZE`` for the progress display.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches with ``input_ids``, ``attention_mask`` and ``label``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning the loss tensor.
        device: Device the batch tensors are moved to.
        n_examples: Number of examples, used only for the progress display.
        modo: Label printed in the progress lines (e.g. the split name).

    Returns:
        Tuple ``(mean accuracy, mean loss, mean macro F1, mean weighted F1, history)``
        where ``history`` maps ``loss``, ``accuracy``, ``f1_score`` and
        ``f1_weight`` to their per-batch lists.
    """
    model = model.eval()
    losses = []
    accuracy_global = []
    f1_score_global = []
    f1_weigh_global = []
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            # Convert once per batch; scikit-learn metrics take (y_true, y_pred) in
            # that order, which matters for the support weights of the weighted F1.
            y_true = labels.detach().cpu().numpy()
            y_pred = preds.detach().cpu().numpy()
            f1_score_global.append(np.mean(f1_score(y_true, y_pred, average=None)))
            f1_weigh_global.append(np.mean(f1_score(y_true, y_pred, average='weighted')))
            accuracy_global.append(accuracy_score(y_true, y_pred))
            losses.append(loss.item())

            if i%5==0:
                print('Ejemplo {}/{} , {}: Loss: {}, accuracy: {} -- f1_score: {} -- f1 weight: {}'.format(
                    i, int(n_examples/BATCH_SIZE),
                    modo, np.mean(losses), np.mean(accuracy_global), np.mean(f1_score_global),
                    np.mean(f1_weigh_global)))

    history = {'loss':losses,
               'accuracy': accuracy_global,
               'f1_score': f1_score_global,
               'f1_weight': f1_weigh_global}
    return np.mean(accuracy_global), np.mean(losses), np.mean(f1_score_global), np.mean(f1_weigh_global), history

In [None]:
def count_parameters(model):
    """Return how many scalar parameters of ``model`` have ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Report how many parameters of the classifier require gradients.
print(f'El modelo tiene {count_parameters(model)} de parámetros')

In [None]:
# Training needs objects that earlier cells only create when their currently
# commented-out lines are enabled. Skip with a message instead of a NameError
# so the rest of the notebook can still run on a fresh kernel.
_needed = ('train_data_loader', 'train_data', 'validation_data_loader', 'val_data', 'NAME')
_missing = [name for name in _needed if name not in globals()]
if _missing:
  print('Training skipped, undefined: ' + ', '.join(_missing))
else:
  for epoch in range(EPOCHS):
    print('Epoch {} de {}'.format(epoch+1, EPOCHS))
    print('------------------')
    start_time = time.time()

    train_acc, train_loss, train_f1, train_weight , history_train = train_model(
      model,train_data_loader, loss_fn, optimizer, device, epoch, len(train_data)
    )
    # Only the training pass is timed; the validation below is not included.
    finish_time = time.time()

    validation_acc, validation_loss, validation_f1, val_weight, history_val = eval_model(
      model, validation_data_loader, loss_fn, device, len(val_data), 'Validación'
    )
    # Full checkpoint: weights, optimizer state and the per-batch metric histories.
    checkpoint = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
                  'optimizer': optimizer.state_dict(),
                  'history_by_epoch': {
                    'train':history_train,
                    'val':history_val
                  }
                 }
    torch.save(checkpoint, f'yelp_CLEAN_checkpoint_{epoch+1}.pth')
    print('Entrenamiento: Loss: {}, accuracy: {}, f1_score: {}, f1_weight: {}'.format(train_loss, train_acc, train_f1, train_weight))
    print('Validación: Loss: {}, accuracy: {},  f1: {}, f1_weight: {}'.format(validation_loss, validation_acc, validation_f1, val_weight))
    print('')
    elapsed_time = finish_time - start_time
    # Append the epoch summary to the run's log file.
    with open (f'{NAME} LR {LEARNING_RATE}.txt','a') as f:
      f.write(f'''    name: {NAME_BERT_MODEL}
    epoch: {epoch+1}
    learning rate: {LEARNING_RATE}
    bach size: {BATCH_SIZE}
    max len: {MAX_LEN}
    n clases: {NCLASSES}
    dropout: {DropOut}
    Train:
        -loss: {train_loss}, Accuracy: {train_acc}, f1_score: {train_f1}, f1_weight: {train_weight}
    Validation
        -loss: {validation_loss}, Accuracy: {validation_acc}, f1_score: {validation_f1}, f1_weight: {val_weight}
    time: {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))}

    ---------------------------------   
    
    ''')

In [None]:
ls

In [None]:
# Load weights written by train_model. map_location lets a file saved on a GPU
# be opened on whatever device this session selected, including CPU-only runtimes.
checkpoint = torch.load('epoch_0_bert_18000_weightModelBERT.pth', map_location=device)
checkpoint.keys()

In [None]:
# Copy the saved weights (stored under 'weight_model') into the classifier.
model.load_state_dict(checkpoint['weight_model'])

In [None]:
# Evaluate on the test split. NOTE: the results land in the validation_* / val_* names
# even though they are test metrics.
validation_acc, validation_loss, validation_f1, val_weight, history_val = eval_model(
    model, test_data_loader, loss_fn, device, len(test_data), 'test'
  )