In [None]:
!pip install sentencepiece
!pip install transformers

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 3.4MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.91
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████▍                   | 296kB 3.4MB/s eta 0:00:01

Importamos las librerías necesarias

In [None]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import util
import gc
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Embedding, GlobalMaxPooling1D, Dropout, Input
from sklearn.model_selection import StratifiedKFold
import numpy as np
from transformers import RobertaTokenizer, RobertaConfig, TFRobertaPreTrainedModel
from transformers.modeling_tf_roberta import TFRobertaMainLayer
from transformers.modeling_tf_utils import get_initializer

Cargamos los datasets

In [None]:
# Read both CSV files with the same dtype mapping so that the id and label
# columns are stored as small integers instead of the default int64.
_csv_dtypes = {'id': np.int16, 'target': np.int8}
train_df, test_df = (
    pd.read_csv(csv_path, dtype=_csv_dtypes)
    for csv_path in ('train.csv', 'test.csv')
)

Activamos el uso de TPU en tensorflow

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

Definimos RoBERTa usando la librería transformers de Hugging Face

In [None]:
class roBERTaModel(TFRobertaPreTrainedModel):
    """RoBERTa main layer followed by dropout and a dense classification head.

    The head has `config.num_labels` output units and no activation, so the
    model emits raw logits.
    """

    def __init__(self, config, *inputs, **kwargs):
        """Create the encoder, the dropout layer and the classifier.

        Args:
            config: model configuration; `num_labels` sets the size of the
                classifier output and `initializer_range` its kernel
                initializer.
            *inputs, **kwargs: forwarded unchanged to the parent constructor.
        """
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        # Layer names are kept as-is ("roberta", "classifier").
        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.dropout_1 = tf.keras.layers.Dropout(0.3)
        self.classifier = tf.keras.layers.Dense(
            units=config.num_labels,
            name='classifier',
            kernel_initializer=get_initializer(config.initializer_range),
        )

    def call(self, inputs, **kwargs):
        """Run the encoder and classify the element at index 1 of its outputs.

        Args:
            inputs: model inputs, passed straight to the RoBERTa main layer.
            **kwargs: forwarded to the main layer; `training` (default False)
                also controls whether dropout is active.

        Returns:
            A tuple whose first element is the logits tensor, followed by any
            encoder outputs from index 2 onwards.
        """
        is_training = kwargs.get('training', False)
        encoder_outputs = self.roberta(inputs, **kwargs)
        # Index 1 is used as the pooled sequence representation.
        pooled = self.dropout_1(encoder_outputs[1], training=is_training)
        return (self.classifier(pooled),) + encoder_outputs[2:]

class roBERTaClassifier():
    """Stratified k-fold fine-tuning of `roBERTaModel` for two-class text data.

    One model is trained per fold inside the module-level `tpu_strategy` scope
    and the per-fold test predictions (logits) are averaged.
    """

    # Number of target classes; used for the model config, the one-hot
    # encoding of the labels and the shape of the prediction accumulator.
    _NUM_LABELS = 2

    def __init__(self, max_seq_length, lr, epochs, batch_size, splits):
        """Store the hyper-parameters and load the `roberta-base` tokenizer.

        Args:
            max_seq_length: length every tokenized text is padded/truncated to.
            lr: learning rate for the Adam optimizer.
            epochs: maximum number of epochs per fold.
            batch_size: batch size passed to `fit`.
            splits: number of stratified folds (one model is trained per fold).
        """
        self.model_name = 'roberta-base'
        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_name)
        self.max_seq_length = max_seq_length
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.splits = splits

    def encode(self, text_column):
        """Tokenize every text of `text_column`.

        Args:
            text_column: iterable of strings (a pandas Series in this notebook).

        Returns:
            Tuple `(input_ids, masks)` of numpy arrays with one row per text.
            Each row is expected to have `max_seq_length` entries, assuming the
            tokenizer honours the padding/truncation request.
        """
        features = [
            self.tokenizer.encode_plus(
                text,
                max_length=self.max_seq_length,
                # `padding='max_length'` replaces the deprecated
                # `pad_to_max_length=True` with the same effect.
                padding='max_length',
                truncation=True,
            )
            for text in text_column
        ]
        input_ids = np.array([feature['input_ids'] for feature in features])
        masks = np.array([feature['attention_mask'] for feature in features])

        return (input_ids, masks)

    def _build_model(self):
        """Create and compile a fresh pretrained model inside the TPU strategy scope."""
        with tpu_strategy.scope():
            config = RobertaConfig.from_pretrained(self.model_name, num_labels=self._NUM_LABELS)
            # Pass the config explicitly; previously it was built and then ignored.
            model = roBERTaModel.from_pretrained(self.model_name, config=config)
            optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)
            loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
            # The model outputs one logit per class and the targets are one-hot,
            # so accuracy must compare argmaxes. BinaryAccuracy would threshold
            # the raw logits at 0.5, which is meaningless here. The metric keeps
            # the name 'accuracy' so that 'val_accuracy' is still reported.
            metric = tf.keras.metrics.CategoricalAccuracy('accuracy')
            model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
        return model

    @staticmethod
    def _take_fold(df, positions):
        """Return `(texts, labels)` for the rows of `df` at integer `positions`.

        The fold indices produced by `StratifiedKFold.split` are positions, not
        index labels, so `.iloc` is used; this also works when `df` does not
        have a default 0..n-1 index. `labels` is a column vector of shape
        `(len(positions), 1)`.
        """
        texts = df['text'].iloc[positions]
        labels = df['target'].iloc[positions].values.reshape(-1, 1)
        return texts, labels

    def train_and_predict(self, df, test):
        """Train one model per stratified fold and average their test predictions.

        Args:
            df: training DataFrame with a 'text' and a 'target' column.
            test: DataFrame with a 'text' column to predict on.

        Returns:
            numpy array of shape `(1, len(test), 2)` holding the model outputs
            (logits) averaged over all folds.
        """
        model_count = self.splits
        X_test_encoded = self.encode(test['text'])
        # Leading axis of size 1: presumably mirrors the single-element output
        # tuple of the model as returned by `predict` (confirm if that changes).
        y_pred = np.zeros((1, X_test_encoded[0].shape[0], self._NUM_LABELS))

        skf = StratifiedKFold(n_splits=model_count, random_state=None, shuffle=False)
        for fold, (trn_idx, val_idx) in enumerate(skf.split(df['text'], df['target'])):

            print('\nFold {}\n'.format(fold))

            model = self._build_model()
            X_trn, y_trn = self._take_fold(df, trn_idx)
            X_val, y_val = self._take_fold(df, val_idx)

            X_trn_encoded = self.encode(X_trn)
            X_val_encoded = self.encode(X_val)

            # Fix the number of classes so the one-hot width never depends on
            # which labels happen to be present in a fold.
            y_trn_encoded = tf.keras.utils.to_categorical(y_trn, num_classes=self._NUM_LABELS)
            y_val_encoded = tf.keras.utils.to_categorical(y_val, num_classes=self._NUM_LABELS)

            callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)]

            history = model.fit(
                [X_trn_encoded[0], X_trn_encoded[1]],
                y_trn_encoded,
                validation_data=([X_val_encoded[0], X_val_encoded[1]], y_val_encoded),
                epochs=self.epochs,
                batch_size=self.batch_size,
                callbacks=callbacks,
            )
            util.plot_history(history)

            y_pred += model.predict(X_test_encoded)
            # Drop the fold's model before building the next one to free memory.
            del model
            gc.collect()

        return y_pred / model_count
        

In [None]:
%%time
roBERTa = roBERTaClassifier(max_seq_length=128, lr=3e-5, epochs=10, batch_size=128, splits=2)
y_pred = roBERTa.train_and_predict(train_df, test_df)

In [None]:
# Sanity check: shape of the fold-averaged predictions once the leading
# axis of `y_pred` is indexed away.
y_pred[0].shape

In [None]:
# Load the submission template and set its 'target' column to the index of the
# highest averaged score of each test row, then show the predicted class balance.
final_df = pd.read_csv('sample_submission.csv').assign(
    target=y_pred[0].argmax(axis=1)
)
final_df['target'].value_counts()

In [None]:
# Write the submission file without the DataFrame index column.
final_df.to_csv('roBERTa.csv', index=False)