In [13]:
# Install dependencies
!pip install numpy
!pip install pandas
!pip install tensorflow
!pip install scikit-learn
!pip install transformers



In [17]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizerFast, TFBertForSequenceClassification, TFTrainer, TFTrainingArguments

# Load the dataset
df = pd.read_csv('../data/training.csv')

# Encode labels
le = LabelEncoder()
df['output_class'] = le.fit_transform(df['output_class'])

# Split into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['input_text'].tolist(), df['output_class'].tolist(), test_size=.2)

# Initialize the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenize the texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Prepare TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

# Load pre-trained BERT model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Define training arguments
training_args = TFTrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
)

# Initialize the Trainer
trainer = TFTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
)

# Train the model
trainer.train()


KeyError: 'output_class'