# Data processing

This notebook preprocesses our dataset for compatibility with BERT. You should feel free to investigate other solutions (both models and tokenizers)!

In [None]:
import os, time
import random
import pandas as pd
import numpy as np
import gc
import tensorflow as tf
import tensorflow_hub as hub
from kaggle_datasets import KaggleDatasets

# We'll use a tokenizer for the BERT model from the modelling demo notebook.
!pip install bert-tensorflow
import bert.tokenization

print(tf.version.VERSION)

# Set global variables

Set maximum sequence length and path variables.

In [None]:
# Number of token ids produced per example; used as the default
# max_seq_length of process_sentence, which counts [CLS] and [SEP] in it.
SEQUENCE_LENGTH = 512

# Directory the raw train/validation/test CSV files are read from.
DATA_PATH =  "../input/data-jigsaw1/"
# Root directory of the BERT assets, and the SavedModel directory inside it
# that get_tokenizer loads by default.
BERT_PATH = "../input/bertmulti"
BERT_PATH_SAVEDMODEL = os.path.join(BERT_PATH, "bert_multi_cased_L-12_H-768_A-12_2")

# NOTE(review): not referenced by any cell shown in this notebook; the
# processed CSVs are written with a relative filename instead — confirm
# whether this constant is still needed.
OUTPUT_PATH = "/kaggle/working"

In [None]:
# Load the raw train, validation and test splits from DATA_PATH.
# NOTE(review): these DataFrames are not used by the cells shown below
# (preprocess_and_save_dataset reads each CSV again itself) — confirm they
# are needed, since they stay in memory for the rest of the run.
df_train = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
df_validation = pd.read_csv(os.path.join(DATA_PATH, "validation.csv"))
df_test = pd.read_csv(os.path.join(DATA_PATH, "test.csv"))

In [None]:
def get_tokenizer(bert_path=BERT_PATH_SAVEDMODEL):
    """Return a ``bert.tokenization.FullTokenizer`` for a BERT SavedModel.

    The vocabulary file path and the ``do_lower_case`` flag are read from the
    loaded model itself, so the tokenizer is configured from the same assets
    as the model.

    Args:
        bert_path: Directory of the BERT SavedModel to load.

    Returns:
        The configured ``FullTokenizer`` instance.
    """
    saved_model = tf.saved_model.load(bert_path)
    keras_layer = hub.KerasLayer(saved_model, trainable=False)
    resolved = keras_layer.resolved_object

    vocab_path = resolved.vocab_file.asset_path.numpy()
    # This is the model's do_lower_case flag, passed straight through as the
    # tokenizer's second argument.
    lower_case_flag = resolved.do_lower_case.numpy()

    # bert.tokenization.load_vocab (used inside the tokenizer) looks up
    # `tf.gfile`, so alias it to `tf.io.gfile` before building the tokenizer.
    tf.gfile = tf.io.gfile
    return bert.tokenization.FullTokenizer(vocab_path, lower_case_flag)

# Build the tokenizer once at module level; process_sentence binds this
# object as the default value of its `tokenizer` parameter.
tokenizer = get_tokenizer()

# Preprocessing

Process individual sentences for input to BERT using the tokenizer, and then prepare the entire dataset. The same code will process the other training data files, as well as the validation and test data.

In [None]:
def process_sentence(sentence, max_seq_length=SEQUENCE_LENGTH, tokenizer=tokenizer):
    """Encode one sentence as ``(input_word_ids, input_mask, segment_ids)``.

    Args:
        sentence: Text to encode. It is passed through ``str()`` first, so a
            non-string value is tokenized by its string form.
        max_seq_length: Target length of the encoded sequence, counting the
            ``[CLS]`` and ``[SEP]`` markers. For values of at least 2 every
            returned list has exactly this length.
        tokenizer: Object providing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Returns:
        A tuple of three lists: the token ids padded with 0, the mask
        (1 for real tokens, 0 for padding) and the segment ids (all 0).
    """
    # Two positions are reserved for [CLS] and [SEP]; truncate the rest.
    token_budget = max_seq_length - 2
    pieces = tokenizer.tokenize(str(sentence))[:token_budget]

    # Map the wrapped token sequence to vocabulary ids.
    ids = tokenizer.convert_tokens_to_ids(["[CLS]", *pieces, "[SEP]"])
    real_count = len(ids)

    # Zero-fill up to max_seq_length when the sequence is shorter than that.
    padding = [0] * (max_seq_length - real_count)
    input_ids = ids + padding
    # 1 marks a real token, 0 marks a padding position.
    input_mask = [1] * real_count + padding

    # The input is a single segment, so every position gets segment id 0.
    segment_ids = [0] * max_seq_length

    return (input_ids, input_mask, segment_ids)

def preprocess_and_save_dataset(unprocessed_filename, text_label='comment_text',
                                seq_length=SEQUENCE_LENGTH, verbose=True,
                                chunk_size=100000):
    """Preprocess a CSV into BERT input columns and save the result as CSV.

    The file is read from ``DATA_PATH`` with ``id`` as its index, encoded in
    chunks with ``process_sentence``, and written to
    ``<name>-processed-seqlen<seq_length>.csv`` (relative to the current
    working directory). The text column is replaced by the columns
    ``input_word_ids``, ``input_mask`` and ``all_segment_id``; every other
    column is kept.

    Args:
        unprocessed_filename: Name of the CSV file inside ``DATA_PATH``.
        text_label: Name of the column holding the text to encode.
        seq_length: Sequence length passed to ``process_sentence`` and
            embedded in the output filename.
        verbose: When true, print the cumulative row count and elapsed
            seconds after each chunk.
        chunk_size: Number of rows encoded and written per chunk.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.

    Note:
        No output file is written when the input CSV has no data rows.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    dataframe = pd.read_csv(os.path.join(DATA_PATH, unprocessed_filename), index_col='id')

    # Remove only a literal ".csv" suffix. str.rstrip('.csv') strips any
    # trailing '.', 'c', 's' or 'v' characters and would mangle names such
    # as "docs.csv".
    base_name = unprocessed_filename
    if base_name.endswith('.csv'):
        base_name = base_name[:-len('.csv')]
    processed_filename = "{}-processed-seqlen{}.csv".format(base_name, seq_length)

    total_rows = len(dataframe)
    start = time.time()

    for pos in range(0, total_rows, chunk_size):
        processed_df = dataframe.iloc[pos:pos + chunk_size].copy()

        # Encode with the requested seq_length rather than relying on the
        # default of process_sentence, so the data matches the filename.
        encoded = [process_sentence(text, max_seq_length=seq_length)
                   for text in processed_df[text_label]]
        (processed_df['input_word_ids'],
         processed_df['input_mask'],
         processed_df['all_segment_id']) = zip(*encoded)

        # Drop the column that was actually encoded, whatever its name.
        processed_df.drop([text_label], axis=1, inplace=True)

        # The first chunk creates the file with a header; later chunks append.
        is_first_chunk = pos == 0
        processed_df.to_csv(processed_filename, index_label='id',
                            mode='w' if is_first_chunk else 'a',
                            header=is_first_chunk)

        if verbose:
            # Cap at total_rows so the final, partial chunk is not over-counted.
            rows_done = min(pos + chunk_size, total_rows)
            print('Processed {} examples in {}'.format(rows_done, time.time() - start))

In [None]:
# Process the validation dataset; with the default sequence length this
# writes validation-processed-seqlen512.csv to the working directory.
preprocess_and_save_dataset("validation.csv")

In [None]:
# Process the test dataset; with the default sequence length this writes
# test-processed-seqlen512.csv to the working directory.
preprocess_and_save_dataset("test.csv")

In [None]:
# Process the training dataset; with the default sequence length this writes
# train-processed-seqlen512.csv to the working directory.
preprocess_and_save_dataset("train.csv")