# Instalacion de dependencias

In [1]:
#pip install happytransformer

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn, optim #Pytorch
from datasets import load_dataset

In [2]:
RANDOM_SEED = 42
BATCH_SIZE = 16
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


## Cargar datos

In [3]:
dataset = load_dataset('csv', data_files=['C:/Users/anabe/Documents/NLP/nueva/APPSTORE/corpus/book4.csv'], split='train')
datasets = dataset.train_test_split(test_size=0.20) # Test del 10%

Using custom data configuration default-c2dc502b58cbcb80
Reusing dataset csv (C:\Users\anabe\.cache\huggingface\datasets\csv\default-c2dc502b58cbcb80\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)
Loading cached split indices for dataset at C:\Users\anabe\.cache\huggingface\datasets\csv\default-c2dc502b58cbcb80\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-86eef27c27466863.arrow and C:\Users\anabe\.cache\huggingface\datasets\csv\default-c2dc502b58cbcb80\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-dd84bf939a90360a.arrow


In [4]:
datasets

DatasetDict({
    train: Dataset({
        features: ['name', 'general', 'descrip'],
        num_rows: 2496
    })
    test: Dataset({
        features: ['name', 'general', 'descrip'],
        num_rows: 625
    })
})

In [5]:
def etiquetas(examples):
    if examples["general"] == 'Linguistics':
        label = 0
    else: 
        label = 1
    return {"labels": label}

In [6]:
def etiquetas(examples):
    if examples ["general"] == 'Applied Science':
        label = 0
    elif examples ["general"] == 'Natural Sciences':
        label = 1
    elif examples ["general"] == 'Math':
        label = 2
    elif examples ["general"] == 'Social Sciences ':
        label = 3
    elif examples ["general"] == 'Art':
        label = 4
    elif examples ["general"] == 'Linguistics':
        label = 5
    else:
        label = 6
    return {"labels": label}

In [7]:
dataset = datasets.map(etiquetas)

  0%|          | 0/2496 [00:00<?, ?ex/s]

  0%|          | 0/625 [00:00<?, ?ex/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['name', 'general', 'descrip', 'labels'],
        num_rows: 2496
    })
    test: Dataset({
        features: ['name', 'general', 'descrip', 'labels'],
        num_rows: 625
    })
})

In [9]:
dataset['train'][5]

{'name': 'Ananas Language Exchange',
 'general': 'linguistics',
 'descrip': 'The language app for today. Learn and exchange languages. Think global, be local.',
 'labels': 6}

## Tokenizacion DataSet

In [10]:
model_checkpoint = "roberta-base"

In [11]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
#, use_fast=False

In [12]:
#Vamos a pasar el tokenizador en cada fila del dataset
def tokenize_reviews(examples):
    return tokenizer(examples["descrip"], padding=True, truncation=True)

In [13]:
type(dataset['train']['descrip'][0])

str

In [14]:
encoded_dataset = dataset.map(tokenize_reviews, batched=True )

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
tokenizer.vocab_size

50265

## Cargar el modelo preentrenado

In [16]:
from transformers import AutoModelForSequenceClassification

num_labels = 7
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels).to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [17]:
encoded_dataset['train'].features
#encoded_dataset

{'name': Value(dtype='string', id=None),
 'general': Value(dtype='string', id=None),
 'descrip': Value(dtype='string', id=None),
 'labels': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [18]:
encoded_dataset.set_format("torch", columns=['input_ids','attention_mask','labels'])
encoded_dataset['train'].features

{'name': Value(dtype='string', id=None),
 'general': Value(dtype='string', id=None),
 'descrip': Value(dtype='string', id=None),
 'labels': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [19]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['name', 'general', 'descrip', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 2496
    })
    test: Dataset({
        features: ['name', 'general', 'descrip', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 625
    })
})

In [20]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [21]:
from transformers import TrainingArguments

model_name = model_checkpoint.split("/")[-1]

batch_size = 64
num_train_epochs=4
num_train_samples = 2_000
train_dataset = encoded_dataset["train"].shuffle(seed=42).select(range(num_train_samples))
#train_dataset = encoded_dataset["train"]

#logging_steps = len(encoded_dataset['train'])//batch_size
logging_steps = len(train_dataset) // (2 * batch_size * num_train_epochs)

training_args = TrainingArguments(
    output_dir="12jul-un",
    num_train_epochs=num_train_epochs,     
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch", 
    logging_steps=logging_steps,
    #push_to_hub=True,
    #push_to_hub_model_id=f"{model_name}-finetuned-wiki_multi"
)

In [22]:
from transformers import Trainer

trainer = Trainer(
    model=model, 
    args=training_args, 
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer
)

In [23]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: descrip, name, general. If descrip, name, general are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2000
  Num Epochs = 4
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 128


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0181,0.007474,1.0,1.0
2,0.0034,0.002099,1.0,1.0
3,0.0025,0.001572,1.0,1.0
4,0.0023,0.001441,1.0,1.0


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: descrip, name, general. If descrip, name, general are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 625
  Batch size = 64
Saving model checkpoint to 12jul-un\checkpoint-32
Configuration saved in 12jul-un\checkpoint-32\config.json
Model weights saved in 12jul-un\checkpoint-32\pytorch_model.bin
tokenizer config file saved in 12jul-un\checkpoint-32\tokenizer_config.json
Special tokens file saved in 12jul-un\checkpoint-32\special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: descrip, name, general. If descrip, name, general are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
****

TrainOutput(global_step=128, training_loss=0.15266482347578858, metrics={'train_runtime': 9293.3385, 'train_samples_per_second': 0.861, 'train_steps_per_second': 0.014, 'total_flos': 1200498081600000.0, 'train_loss': 0.15266482347578858, 'epoch': 4.0})