In [1]:

import os
import pandas as pd
import tensorflow as tf
import numpy as np
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from sklearn.metrics import classification_report

In [5]:
# Locations of the pre-split CSV files on the Colab filesystem.
TRAIN_CSV = '/content/train.csv'
TEST_CSV = '/content/test.csv'

# Load both splits in order (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Display (rows, columns) of the training split as a quick sanity check.
train.shape

(1280000, 7)

In [6]:
# Display (rows, columns) of the test split as a quick sanity check after loading.
test.shape

(320000, 7)

In [7]:
# Discard every row that has a missing value in any column, for both splits.
# Rebinding the names keeps this cell safe to re-run and avoids `inplace=True`.
train = train.dropna()
test = test.dropna()

In [10]:
# Number of leading rows taken from each split; kept small so fine-tuning
# finishes in a reasonable time.
N_TRAIN_SAMPLES = 50000
N_TEST_SAMPLES = 8000

# Slice the frames *before* converting to Python lists so only the rows that
# are actually used get materialised (the full columns hold >1M entries).
train_subset = train.iloc[:N_TRAIN_SAMPLES]
test_subset = test.iloc[:N_TEST_SAMPLES]

# Texts (X_*) feed the tokenizer; targets (y_*) are the class labels.
X_train, y_train = train_subset['stemmed_content'].tolist(), train_subset['target'].tolist()
X_test, y_test = test_subset['stemmed_content'].tolist(), test_subset['target'].tolist()


In [11]:
# Checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

# Identical tokenization settings for both splits: truncate to at most 128
# tokens and pad each split to a common length.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(X_train, **tokenize_kwargs)
test_encodings = tokenizer(X_test, **tokenize_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [12]:
# Examples per batch for both training and evaluation.
BATCH_SIZE = 8

# Training pipeline: the shuffle buffer spans the whole training set so every
# epoch sees a uniformly random order (a smaller buffer only shuffles within a
# sliding window). `max(..., 1)` keeps the buffer size valid for empty input.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
    .shuffle(buffer_size=max(len(y_train), 1))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)  # overlap input preparation with training steps
)

# Evaluation pipeline: no shuffling, so predictions stay aligned with y_test.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [13]:
# Training objective: the model emits raw logits, so the loss applies the
# softmax itself (from_logits=True) against integer class labels.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Two-class sequence classifier initialised from the pretrained checkpoint,
# converting the PyTorch weights (from_pt=True).
model = TFDistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    from_pt=True,
)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy'],
)

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSeq

In [15]:
# Fine-tune for two epochs. The returned History object is kept so per-epoch
# loss/accuracy can be inspected later via `history.history`, rather than being
# discarded and leaving only an object repr as the cell output.
# NOTE(review): validation_data is the same test_dataset that is used for the
# final classification report; hold out a separate validation split if these
# metrics are ever used for tuning or early stopping.
history = model.fit(train_dataset, validation_data=test_dataset, epochs=2)

Epoch 1/2
Epoch 2/2


<tf_keras.src.callbacks.History at 0x7896a3f931d0>

In [17]:
# Run inference on the test batches and pull out the raw class scores.
predictions = model.predict(test_dataset)
y_pred_logits = predictions.logits

# The predicted class is the index of the largest logit in each row.
y_pred = np.argmax(y_pred_logits, axis=-1)

# Per-class precision / recall / F1 against the true test labels.
report = classification_report(y_test, y_pred, target_names=["Negative", "Positive"])
print(report)

              precision    recall  f1-score   support

    Negative       0.78      0.71      0.75      4007
    Positive       0.74      0.80      0.77      3993

    accuracy                           0.76      8000
   macro avg       0.76      0.76      0.76      8000
weighted avg       0.76      0.76      0.76      8000



In [18]:
# Directory that receives both the fine-tuned weights and the tokenizer files.
TRANSFORMER_MODEL_DIR = "/content/transformer_model"

# Create the target directory up front; a no-op if it already exists.
os.makedirs(TRANSFORMER_MODEL_DIR, exist_ok=True)

# Save model and tokenizer side by side so the directory can be reloaded
# with `from_pretrained`. The tokenizer call is left as the last expression
# so the cell displays the written file paths.
model.save_pretrained(save_directory=TRANSFORMER_MODEL_DIR)
tokenizer.save_pretrained(save_directory=TRANSFORMER_MODEL_DIR)


('/content/transformer_model/tokenizer_config.json',
 '/content/transformer_model/special_tokens_map.json',
 '/content/transformer_model/vocab.txt',
 '/content/transformer_model/added_tokens.json',
 '/content/transformer_model/tokenizer.json')

In [19]:
# Recursively pack the saved model/tokenizer directory into transformer_model.zip
# (written to the current working directory) so it can be downloaded as one file.
!zip -r transformer_model.zip /content/transformer_model


  adding: content/transformer_model/ (stored 0%)
  adding: content/transformer_model/config.json (deflated 42%)
  adding: content/transformer_model/tf_model.h5 (deflated 8%)
  adding: content/transformer_model/vocab.txt (deflated 53%)
  adding: content/transformer_model/tokenizer_config.json (deflated 75%)
  adding: content/transformer_model/tokenizer.json (deflated 71%)
  adding: content/transformer_model/special_tokens_map.json (deflated 42%)


In [20]:
# Colab-only helper that triggers a browser download of a file from the runtime.
from google.colab import files

# Archive produced by the preceding zip step (relative to the working directory).
ARCHIVE_NAME = "transformer_model.zip"
files.download(ARCHIVE_NAME)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>