In [None]:
# Mount Google Drive inside the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd drive/MyDrive/univer/Expociencia/Code/distillmbert

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
from transformers import  DistilBertModel, DistilBertTokenizerFast , DistilBertForMaskedLM
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import torch
from sklearn.metrics import f1_score
import time
from sklearn.metrics import accuracy_score
from pathlib import Path
from torch.nn import functional as F

In [None]:
# Dataset identifier; it is interpolated into the CSV file name that is read
# in the next cell ('test_<name>_clean.csv').
name = "yelp"

In [None]:
# Only the test split is loaded here; the train/validation loads are disabled.
#train_data = pd.read_csv('train_'+name+'_clean.csv')
#val_data = pd.read_csv('val_'+name+'_clean.csv')
test_data = pd.read_csv('test_'+name+'_clean.csv')

In [None]:
# Disabled: subtract 1 from the 'Score' column of the train/validation splits
# (the same transform that is applied to the test split in the next cell).
#train_data['Score'] = train_data['Score'].apply(lambda x: x-1)
#val_data['Score'] = val_data['Score'].apply(lambda x: x-1)
print('ok')

In [None]:
# Shift the 'Score' labels down by one so they can be used as class indices
# (presumably 1..5 ratings -> 0..4; verify against the CSV).
# The shift is computed into a temporary and validated before it is assigned:
# re-running this cell by accident would otherwise silently shift the labels a
# second time. With the check, a second run raises and leaves test_data intact
# (as long as the data contains at least one lowest-score row).
shifted_scores = test_data['Score'] - 1
assert (shifted_scores >= 0).all(), (
    "Negative label after shifting 'Score'; was this cell already executed?"
)
test_data['Score'] = shifted_scores


In [None]:
# Run configuration shared by the cells below.
MAX_LEN = 128 # Fixed. Token sequence length handed to the data loaders.
BATCH_SIZE = 32 # Batch size of the DataLoader; also used in the progress counters.
NCLASSES= 5 # Fixed. Number of output classes of the classifier head.
DropOut = 0.1 # Dropout probability applied before the classifier head.
RANDOM_SEED = 42 # Seed for the NumPy and PyTorch random generators.
EPOCHS= 1 # Number of passes of the training loop.
LEARNING_RATE=2e-5 # Learning rate given to the Adam optimizer.

In [None]:

# Disabled run-naming settings.
# NOTE(review): NAME and NAME_BERT_MODEL are still referenced by train_model and
# by the epoch loop when they write their log file, so these lines have to be
# re-enabled before training is run.
#NAME_BERT_MODEL="distilbert-base-es-cased"
#NAME="distilbert-base-es-cased"
#LEARNING_RATE=2e-5 ## <- most sensitive hyperparameter
#myfile = Path(f'{NAME} LR {LEARNING_RATE}.txt')
#myfile.touch(exist_ok=True)
# Seed the NumPy and PyTorch random generators.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device= torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Load the encoder and its matching fast tokenizer from the same Hugging Face
# checkpoint; the identifier is named once so both loads cannot drift apart.
PRETRAINED_CHECKPOINT = "Geotrend/distilbert-base-es-cased"
tokenizer = DistilBertTokenizerFast.from_pretrained(PRETRAINED_CHECKPOINT)
distill_bert = DistilBertModel.from_pretrained(PRETRAINED_CHECKPOINT)

In [None]:
# Disabled: optional warm start. When enabled, these lines load the local
# checkpoint 'Pretrined_distill_CLEAN_1.pth' into a DistilBertForMaskedLM model
# and copy the weights of its `distilbert` sub-module into `distill_bert`.
#distil_mlm = DistilBertForMaskedLM.from_pretrained('Geotrend/distilbert-base-es-cased')
#ckp = torch.load('Pretrined_distill_CLEAN_1.pth', map_location=device)
#distil_mlm.load_state_dict(ckp['state_dict'])
#distill_bert.load_state_dict(distil_mlm.distilbert.state_dict())

In [None]:
class DataModel(Dataset):
    """Dataset that converts review texts into fixed-length token-id tensors.

    Every sample is tokenized, wrapped in '[CLS]' ... '[SEP]', and then either
    padded with '[PAD]' or truncated so that it is exactly ``max_len`` tokens
    long.

    Args:
        reviews: Indexable collection of review texts (each is passed to ``str``).
        labels: Indexable collection of integer class labels, aligned with
            ``reviews``.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every returned sequence.
        mode_truncation: How over-long sequences are shortened:
            ``"head"`` keeps the first ``max_len - 1`` tokens and appends
            '[SEP]'; ``"head+tail"`` keeps the first ``max_len // 2 + 1`` tokens
            and the last remaining ones (65 + 63 for ``max_len == 128``);
            ``"tail"`` keeps '[CLS]' plus the last ``max_len - 1`` tokens.
    """

    def __init__(self, reviews, labels, tokenizer, max_len, mode_truncation):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.mode_truncation = mode_truncation

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.reviews)

    def _truncate(self, tokens):
        """Shorten an over-long token list to exactly ``max_len`` tokens.

        Raises:
            ValueError: If ``mode_truncation`` is not one of the supported modes.
        """
        mode = self.mode_truncation
        if mode == "head":
            return tokens[:self.max_len - 1] + ['[SEP]']
        if mode == "head+tail":
            n_head = self.max_len // 2 + 1
            n_tail = self.max_len - n_head
            # Slice with a positive start index: tokens[-0:] would return the
            # whole list when n_tail is 0.
            return tokens[:n_head] + tokens[len(tokens) - n_tail:]
        if mode == "tail":
            # '[CLS]' plus the last max_len - 1 tokens (the final one is '[SEP]'),
            # so the result has the same length as every other sample.
            return ['[CLS]'] + tokens[len(tokens) - (self.max_len - 1):]
        raise ValueError(
            "Unknown mode_truncation {!r}; expected 'head', 'head+tail' or 'tail'".format(mode)
        )

    def __getitem__(self, item):
        """Return the encoded sample at position ``item``.

        Returns:
            dict with keys 'review' (the text), 'input_ids' and
            'attention_mask' (1-D long tensors of length ``max_len``) and
            'label' (scalar long tensor).

        Raises:
            ValueError: If the sample needs truncation and ``mode_truncation``
                is not a supported mode.
        """
        review = str(self.reviews[item])
        label = self.labels[item]

        tokens = ['[CLS]'] + self.tokenizer.tokenize(review) + ['[SEP]']
        if len(tokens) > self.max_len:
            tokens = self._truncate(tokens)

        # Remember how many positions hold real tokens so the attention mask
        # does not depend on the numeric id the tokenizer assigns to '[PAD]'.
        n_real = len(tokens)
        tokens = tokens + ['[PAD]'] * (self.max_len - n_real)

        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(token_ids)
        attn_mask = torch.zeros(len(tokens), dtype=torch.long)
        attn_mask[:n_real] = 1

        return {
            'review': review,
            'input_ids': tokens_ids_tensor,
            'attention_mask': attn_mask,
            'label': torch.tensor(label, dtype=torch.long),
        }

def data_loader(df, tokenizer, max_len, batch_size, modo, shuffle=False, num_workers=2):
  """Build a DataLoader over the 'Text' / 'Score' columns of a DataFrame.

  Args:
      df: DataFrame with a ``Text`` column (review text) and a ``Score`` column
          (integer label).
      tokenizer: Tokenizer forwarded to ``DataModel``.
      max_len: Fixed sequence length forwarded to ``DataModel``.
      batch_size: Number of samples per batch.
      modo: Truncation mode forwarded to ``DataModel`` as ``mode_truncation``.
      shuffle: Whether the loader reshuffles the data on every pass
          (default ``False``, the previous behaviour).
      num_workers: Number of loader worker processes (default 2, the previous
          hard-coded value).

  Returns:
      A ``torch.utils.data.DataLoader`` over the encoded samples.
  """
  dataset = DataModel(
    reviews=df.Text.to_numpy(),
    labels=df.Score.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len,
    mode_truncation=modo
  )
  # Use the batch_size argument; the global BATCH_SIZE was used here before,
  # which silently ignored whatever the caller passed in.
  return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

In [None]:
# Only the test loader is built; the train/validation loaders are disabled.
#train_data_loader = data_loader(train_data, tokenizer, MAX_LEN, BATCH_SIZE, "head+tail")
#validation_data_loader = data_loader(val_data, tokenizer, MAX_LEN, BATCH_SIZE, "head+tail")
test_data_loader = data_loader(
    df=test_data,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    batch_size=BATCH_SIZE,
    modo="head+tail",
)

In [None]:
class DistillBERTModel(nn.Module):
    """DistilBERT encoder followed by dropout and a linear classification head.

    Args:
        n_class: Number of output classes (size of the logits vector).
        backbone: Encoder module to wrap. It must expose ``config.hidden_size``
            and return the token hidden states as the first element of its
            output. Defaults to the notebook-level ``distill_bert``.
        dropout: Dropout probability applied before the classifier. Defaults to
            the notebook-level ``DropOut``.

    The attribute names ``distilbert`` and ``classifier`` are part of the
    state-dict keys of saved checkpoints and must not be renamed.
    """

    def __init__(self, n_class, backbone=None, dropout=None):
        super().__init__()
        self.num_labels = n_class
        # Fall back to the notebook globals only when nothing is injected, so the
        # class can also be built (and tested) with any compatible encoder.
        self.distilbert = distill_bert if backbone is None else backbone
        self.classifier = nn.Linear(self.distilbert.config.hidden_size, self.num_labels)
        self.dropout = nn.Dropout(DropOut if dropout is None else dropout)
        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids=None, attention_mask=None):
        """Return the classification logits, one row per input sequence.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
        """
        encoder_output = self.distilbert(input_ids=input_ids,
                                         attention_mask=attention_mask)
        hidden_state = encoder_output[0]
        # The hidden state at sequence position 0 is used as the
        # representation of the whole sequence.
        pooled_output = hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

In [None]:
# Instantiate the classifier and move its parameters to the selected device
# (nn.Module.to moves the module in place and returns the same object).
model = DistillBERTModel(NCLASSES).to(device)
print('ok')

In [None]:
# Cross-entropy loss over the class logits, placed on the same device as the model.
loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)
# Adam over all model parameters (encoder and classifier head).
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
def train_model(model, data_loader, loss_fn, optimizer, device, epoch, n_examples):
  """Train ``model`` for one pass over ``data_loader``.

  Per batch: forward pass, loss, backward pass, gradient-norm clipping to 1.0
  and an optimizer step. Running means of the metrics are printed every 5
  batches. Every 1000 batches from batch 10000 on, and at batch index
  ``int(n_examples/BATCH_SIZE)-1``, the model is validated, its weights are
  saved to ``epoch_{epoch}_distilbert_{i}.pth`` and a report is appended to the
  log file.

  Relies on notebook-level names: ``BATCH_SIZE``, ``eval_model``,
  ``validation_data_loader``, ``val_data``, ``NAME``, ``NAME_BERT_MODEL`` and
  ``LEARNING_RATE``.

  Args:
      model: Module called as ``model(input_ids=..., attention_mask=...)`` that
          returns class logits.
      data_loader: Iterable of batches with keys 'input_ids',
          'attention_mask' and 'label'.
      loss_fn: Loss called as ``loss_fn(logits, labels)``.
      optimizer: Optimizer over the model parameters.
      device: Device the batch tensors are moved to.
      epoch: Epoch index, used in the checkpoint file name.
      n_examples: Number of training examples, used for the progress counter.

  Returns:
      Tuple ``(mean accuracy, mean loss, mean macro F1, mean weighted F1,
      history)`` where ``history`` maps 'loss', 'accuracy', 'f1_score' and
      'f1_weight' to the per-batch values.
  """
  model=model.train()
  losses = []
  accuracy_global = []
  f1_score_global = []
  f1_weight_global = []

  for i, batch in enumerate(data_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    outputs = model(input_ids = input_ids, attention_mask = attention_mask)
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, labels)

    # Convert once per batch instead of once per metric.
    preds_np = preds.cpu().detach().numpy()
    labels_np = labels.cpu().detach().numpy()
    # scikit-learn expects (y_true, y_pred). The weighted average uses the
    # support of y_true, so passing the predictions first skewed that metric.
    f1_score_global.append(np.mean(f1_score(labels_np, preds_np, average=None)))
    f1_weight_global.append(np.mean(f1_score(labels_np, preds_np, average='weighted')))
    accuracy_global.append(accuracy_score(labels_np, preds_np))
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    optimizer.zero_grad()

    if i%5==0:
        print('Ejemplo {}/{} , Entrenamiento: Loss: {}, accuracy: {} -- f1_score: {} -- f1 weighted: {}'.format(i, int(n_examples/BATCH_SIZE),
                                                                                               np.mean(losses), np.mean(accuracy_global), np.mean(f1_score_global), 
                                                                                               np.mean(f1_weight_global)))
    # NOTE(review): when n_examples is not a multiple of BATCH_SIZE the second
    # condition fires one batch before the real last batch.
    if (i%1000==0 and i>9999) or (i == int(n_examples/BATCH_SIZE)-1):
        validation_acc, validation_loss, validation_f1, val_weight, history = eval_model(
            model, validation_data_loader, loss_fn, device, len(val_data), 'Validación'
        )
        # eval_model switches the model to eval mode; switch back so dropout
        # stays active for the remaining training batches.
        model.train()
        print('Entrenamiento: Loss: {}, accuracy: {} -- f1_score: {} -- f1 weighted: {}'.format(np.mean(losses), np.mean(accuracy_global), np.mean(f1_score_global), 
                                                                                                              np.mean(f1_weight_global)))
        print('Validación: Loss: {}, accuracy: {} -- f1_score: {} -- f1 weighted: {}'.format(validation_loss, validation_acc, validation_f1,
                                                                                                          val_weight
                                                                                                          ))
        weight = {f'weight_model':model.state_dict()}
        torch.save(weight, f'epoch_{epoch}_distilbert_{i}.pth')
        with open (f'{NAME} LR {LEARNING_RATE}.txt','a') as f:
                f.write(f'''    name: {NAME_BERT_MODEL}
                Iteracion : {i}
                Train:
                   -loss: {np.mean(losses)} , 
                    Mean :Accuracy: {np.mean(accuracy_global)}, f1_score: {np.mean(f1_score_global)}, f1_micro: {np.mean(f1_weight_global)}
                    std : Accuracy: {np.std(accuracy_global)}, f1_score: {np.std(f1_score_global)}, f1_micro: {np.std(f1_weight_global)}
                    var : Accuracy: {np.std(accuracy_global)**2}, f1_score: {np.std(f1_score_global)**2}, f1_micro: {np.std(f1_weight_global)**2}
                Validation
                   -loss: {np.mean(history['loss'])},
                   -Mean  Accuracy: {np.mean(history['accuracy'])}, f1_score: {np.mean(history['f1_score'])}, f1_weighted: {np.mean(history['f1_weight'])}
                   -std:  Accuracy: {np.std(history['accuracy'])}, f1_score: {np.std(history['f1_score'])}, f1_weighted: {np.std(history['f1_weight'])}
                   -var:  Accuracy: {np.std(history['accuracy'])**2}, f1_score: {np.std(history['f1_score'])**2}, f1_weighted: {np.std(history['f1_weight'])**2}
                ---------------------------------    
    ''')
  return np.mean(accuracy_global), np.mean(losses), np.mean(f1_score_global), np.mean(f1_weight_global), {'loss':losses, 
                                                                                                        'accuracy': accuracy_global,
                                                                                                        'f1_score': f1_score_global,
                                                                                                        'f1_weight':f1_weight_global}

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples, modo):
  """Evaluate ``model`` on ``data_loader`` without computing gradients.

  The model is switched to eval mode and is left in eval mode on return.
  Running means of the metrics are printed every 5 batches. Relies on the
  notebook-level ``BATCH_SIZE`` for the progress counter.

  Args:
      model: Module called as ``model(input_ids=..., attention_mask=...)`` that
          returns class logits.
      data_loader: Iterable of batches with keys 'input_ids',
          'attention_mask' and 'label'.
      loss_fn: Loss called as ``loss_fn(logits, labels)``.
      device: Device the batch tensors are moved to.
      n_examples: Number of examples, used for the progress counter.
      modo: Label printed in the progress lines (e.g. 'Validación', 'test').

  Returns:
      Tuple ``(mean accuracy, mean loss, mean macro F1, mean weighted F1,
      history)`` where ``history`` maps 'loss', 'accuracy', 'f1_score' and
      'f1_weight' to the per-batch values.
  """
  model = model.eval()
  losses = []
  accuracy_global = []
  f1_score_global = []
  f1_weighted_global = []
  with torch.no_grad():
    for i, batch in enumerate(data_loader):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)
      outputs = model(input_ids = input_ids, attention_mask = attention_mask)
      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, labels)

      # Convert once per batch instead of once per metric.
      preds_np = preds.cpu().detach().numpy()
      labels_np = labels.cpu().detach().numpy()
      # scikit-learn expects (y_true, y_pred). The weighted average uses the
      # support of y_true, so passing the predictions first skewed that metric.
      f1_score_global.append(np.mean(f1_score(labels_np, preds_np, average=None)))
      f1_weighted_global.append(np.mean(f1_score(labels_np, preds_np, average='weighted')))
      accuracy_global.append(accuracy_score(labels_np, preds_np))
      losses.append(loss.item())
      if i%5==0:
        print('Ejemplo {}/{} , {}: Loss: {}, accuracy: {} -- f1_score: {} -- f1 weighted: {}'.format(i, int(n_examples/BATCH_SIZE),
                                                                                               modo, np.mean(losses), np.mean(accuracy_global), np.mean(f1_score_global), 
                                                                                               np.mean(f1_weighted_global)))
  return np.mean(accuracy_global), np.mean(losses), np.mean(f1_score_global), np.mean(f1_weighted_global),{'loss':losses, 
                                                                                                            'accuracy': accuracy_global,
                                                                                                            'f1_score': f1_score_global,
                                                                                                            'f1_weight':f1_weighted_global}

In [None]:
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of ``model``.

    Parameters whose ``requires_grad`` flag is off are not counted.
    """
    total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        total += param.numel()
    return total
# Report how many trainable parameters the classifier has.
print(f'El modelo tiene {count_parameters(model)} de parámetros')

In [None]:
# Full training run: one train pass and one validation pass per epoch, followed
# by a checkpoint and a report appended to the log file.
# NOTE(review): train_data_loader, validation_data_loader, train_data, val_data,
# NAME and NAME_BERT_MODEL are only defined in commented-out lines of this
# notebook, so this cell needs those lines re-enabled before it can run.
for epoch in range(EPOCHS):#5126
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  # The timer wraps the call to train_model only; the validation pass below
  # is not included in the reported time.
  start_time = time.time() 

  train_acc, train_loss, train_f1, train_weight , history_train = train_model(
    model, train_data_loader, loss_fn, optimizer, device, epoch, len(train_data)
  )
  finish_time = time.time()
  validation_acc, validation_loss, validation_f1, val_weight, history_val = eval_model(
    model, validation_data_loader, loss_fn, device, len(val_data), 'Validación'
  )
  # End-of-epoch checkpoint: model and optimizer state plus the per-batch
  # metric histories of both passes.
  checkpoint = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
             'optimizer': optimizer.state_dict(),
             'history_by_epoch': {
                 'train':history_train,
                 'val':history_val
                                  }
                }  

  torch.save(checkpoint, f'DistilBERT_checkpoint_{epoch+1}.pth')
  print('Entrenamiento: Loss: {}, accuracy: {}, f1_score: {}, f1_weight: {}'.format(train_loss, train_acc, train_f1, train_weight))
  print('Validación: Loss: {}, accuracy: {}, f1_score: {}, f1_weight: {}'.format(validation_loss, validation_acc, validation_f1, val_weight))
  print('')
  elapsed_time = finish_time - start_time
  # Append the hyperparameters and the mean / std / variance of the per-batch
  # metrics to the run's log file.
  with open (f'{NAME} LR {LEARNING_RATE}.txt','a') as f:
    f.write(f'''    name: {NAME_BERT_MODEL}
    epoch: {epoch+1}
    learning rate: {LEARNING_RATE}
    bach size: {BATCH_SIZE}
    max len: {MAX_LEN}
    n clases: {NCLASSES}
    dropout: {DropOut}
    Train:
      -loss: {np.mean(history_train['loss'])},
      -Mean  Accuracy: {np.mean(history_train['accuracy'])}, f1_score: {np.mean(history_train['f1_score'])}, f1_weighted: {np.mean(history_train['f1_weight'])}
      -std:  Accuracy: {np.std(history_train['accuracy'])}, f1_score: {np.std(history_train['f1_score'])}, f1_weighted: {np.std(history_train['f1_weight'])}
      -var:  Accuracy: {np.std(history_train['accuracy'])**2}, f1_score: {np.std(history_train['f1_score'])**2}, f1_weighted: {np.std(history_train['f1_weight'])**2}
    Validation
      -loss: {np.mean(history_val['loss'])},
      -Mean  Accuracy: {np.mean(history_val['accuracy'])}, f1_score: {np.mean(history_val['f1_score'])}, f1_weighted: {np.mean(history_val['f1_weight'])}
      -std:  Accuracy: {np.std(history_val['accuracy'])}, f1_score: {np.std(history_val['f1_score'])}, f1_weighted: {np.std(history_val['f1_weight'])}
      -var:  Accuracy: {np.std(history_val['accuracy'])**2}, f1_score: {np.std(history_val['f1_score'])**2}, f1_weighted: {np.std(history_val['f1_weight'])**2}

    time: {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))}

    ---------------------------------   
    
    ''')

In [None]:
# Load a weights file written by train_model. map_location remaps the stored
# tensors onto the current device, so a checkpoint saved on a GPU runtime can
# still be opened on a CPU-only runtime.
checkpoint = torch.load('epoch_0_distilbert_18000.pth', map_location=device)
checkpoint.keys()

In [None]:
# Copy the saved weights (stored under the 'weight_model' key) into the model.
model.load_state_dict(checkpoint['weight_model'])

In [None]:
# Summarise the checkpoint as parameter name -> tensor shape instead of dumping
# every weight tensor into the cell output.
{param_name: tuple(tensor.shape) for param_name, tensor in checkpoint['weight_model'].items()}

In [None]:
# Evaluate the restored model on the test split. The results are stored under
# the validation_* / history_val names although they are test-set metrics.
validation_acc, validation_loss, validation_f1, val_weight, history_val = eval_model(
    model=model,
    data_loader=test_data_loader,
    loss_fn=loss_fn,
    device=device,
    n_examples=len(test_data),
    modo='test',
)