In [1]:
import pandas as pd
import transformers
from datasets import load_dataset
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

### Importing Data

In [2]:
sample_submission = pd.read_csv('data/sample_submission.csv')
test_labels = pd.read_csv('data/test.csv')
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [4]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
dataset = load_dataset('csv', data_files={'train': 'data/train.csv', 'test': 'data/train.csv'})


Using custom data configuration default-6d7c7cc95b48419d
Reusing dataset csv (/Users/aziz.mosbah/.cache/huggingface/datasets/csv/default-6d7c7cc95b48419d/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 159571
    })
    test: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 159571
    })
})

In [7]:
model_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
batch_size = 16

In [8]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

#### PreProcessing Pipelines

In [18]:
def preprocess_function(examples):
    return tokenizer(examples['comment_text'], truncation=True)

In [19]:
pre_tokenizer_columns = set(dataset["train"].features)
encoded_dataset = dataset.map(preprocess_function, batched=True)
tokenizer_columns = list(set(encoded_dataset["train"].features) - pre_tokenizer_columns)
print("Columns added by tokenizer:", tokenizer_columns)

Loading cached processed dataset at /Users/aziz.mosbah/.cache/huggingface/datasets/csv/default-6d7c7cc95b48419d/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-11f04c72b2289992.arrow


  0%|          | 0/160 [00:00<?, ?ba/s]

Columns added by tokenizer: ['attention_mask', 'input_ids']


In [22]:
dataset['train']

Dataset({
    features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    num_rows: 159571
})

In [21]:
encoded_dataset["train"]

Dataset({
    features: ['attention_mask', 'comment_text', 'id', 'identity_hate', 'input_ids', 'insult', 'obscene', 'severe_toxic', 'threat', 'toxic'],
    num_rows: 159571
})

In [24]:
label = ['Design', 'Pricing', 'Bugs']

['attention_mask', 'input_ids']

In [25]:
tf_train_dataset = encoded_dataset["train"].to_tf_dataset(
    columns=tokenizer_columns,
    label_cols=["toxic"],
    shuffle=True,
    batch_size=16,
    collate_fn=tokenizer.pad,
)
tf_validation_dataset = encoded_dataset['test'].to_tf_dataset(
    columns=tokenizer_columns,
    label_cols=["toxic"],
    shuffle=False,
    batch_size=16,
    collate_fn=tokenizer.pad,
)

In [32]:
loss = tf.keras.losses.MeanSquaredError()
num_labels = 2

In [33]:
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=num_labels
)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
from transformers import create_optimizer

num_epochs = 5
batches_per_epoch = len(encoded_dataset["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(
    init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps
)
model.compile(optimizer=optimizer, loss=loss)

In [35]:
metric_name = ('accuracy')

In [36]:
def compute_metrics(predictions, labels):
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# from transformers.keras_callbacks import PushToHubCallback

# model_name = model_checkpoint.split("/")[-1]
# model.push_to_hub(f"{model_name}-finetuned-{task}")
# username = "Rocketknight1"

# callback = PushToHubCallback(
#     output_dir="./tc_model_save",
#     tokenizer=tokenizer,
#     hub_model_id=f"{username}/{push_to_hub_model_id}",
# )


model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=3,
#     callbacks=[callback],
)

Epoch 1/3


In [7]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=3
)

Epoch 1/3
