In [1]:
# Data processing
import pandas as pd
import numpy as np

# Modeling
import tensorflow as tf
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

# Hugging Face Dataset
from datasets import Dataset

# Model performance evaluation
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Read the data
df = pd.read_csv('data/train.csv')
df_standalone = df[['text', 'label']]

# Split the data into train and validation
train_df, val_df = np.split(df_standalone.sample(frac=1, random_state=42), [int(.8*len(df))])

In [6]:
train_df

Unnamed: 0,text,label
63,where close to 12 hours but at least it is rep...,0
2808,they last about 1 1 2 years until the problems...,0
102,"even better , with the optional headset you ca...",1
2692,i began taking pics as soon as i got this came...,1
416,"second , the menu system and controls are poor...",0
...,...,...
1172,it is extremely light and disappears in your p...,1
814,wish i could give it 4 1 2 stars,1
1852,"overall the nikon 4300 is a very dependable , ...",1
2220,this did not feel like quality construction an...,0


In [9]:
val_df

Unnamed: 0,text,label
2105,i ' ve had no problems at all so far and i ' m...,1
133,4 megapixels is enough for anybody and the pho...,1
1140,i have shopped with amazon before and have bee...,1
933,"the player usually plays dvd ' s , but has occ...",0
928,i found it very useful for transferring large ...,1
...,...,...
1638,on several different occasions it has displaye...,0
1095,less than a month later the screen freezes and...,0
1130,"anything this phone does , it does perfectly",1
1294,1 it uses regular garbage bags instead of bran...,1


In [7]:
# Convert pyhton dataframe to Hugging Face arrow dataset
hg_train_df = Dataset.from_pandas(train_df)
hg_val_df = Dataset.from_pandas(val_df)

In [8]:
# Tokenizer from a pretrained model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Take a look at the tokenizer
tokenizer

Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 3.40kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 47.3kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 2.07MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 6.38MB/s]


BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [None]:
# Funtion to tokenize data
def tokenize_dataset(data):
    return tokenizer(data["review"], 
                     max_length=32, 
                     truncation=True, 
                     padding="max_length")

# Tokenize the dataset
dataset_train = hg_train_data.map(tokenize_dataset)
dataset_test = hg_test_data.map(tokenize_dataset)

In [11]:
# Funtion to tokenize data
def tokenize_dataset(data):
    return tokenizer(data["text"], 
                     max_length=32, 
                     truncation=True, 
                     padding="max_length")

# Tokenize the dataset
dataset_train = hg_train_df.map(tokenize_dataset)
dataset_val = hg_val_df.map(tokenize_dataset)

                                                                

In [12]:
# Load model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Downloading pytorch_model.bin: 100%|██████████| 436M/436M [02:42<00:00, 2.68MB/s] 
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

In [13]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./sentiment_transfer_learning_transformer/",          
    logging_dir='./sentiment_transfer_learning_transformer/logs',            
    logging_strategy='epoch',
    logging_steps=100,    
    num_train_epochs=6,              
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=4,  
    learning_rate=5e-6,
    seed=42,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    load_best_model_at_end=True
)

In [14]:
# Function to compute the metric
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # probabilities = tf.nn.softmax(logits)
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [15]:
# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2412
  Num Epochs = 6
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 3618
  Number of trainable parameters = 108311810
  3%|▎         | 126/3618 [04:07<2:20:16,  2.41s/it]

KeyboardInterrupt: 

# FALTA POR PONER LA PARTE DE LAS PREDICCIONES SISISISISISISISISI