In [1]:
# Mount Google Drive
from google.colab import drive
import os

# Disable Weights & Biases tracking
os.environ["WANDB_DISABLED"] = "true"

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Define directories and file paths
main_dir = "/content/drive/MyDrive/ITCS6156_BERT"
result_dir = os.path.join(main_dir, "Results")
data_dir = os.path.join(main_dir, "Data")
train_path = os.path.join(data_dir, "train_data.txt")
test_path = os.path.join(data_dir, "test_data.txt")
test_solution_path = os.path.join(data_dir, "test_data_solution.txt")


Mounted at /content/drive


In [2]:
import pandas as pd

# Load training and test data
train_data = pd.read_csv(train_path, sep=":::", engine="python", header=None, names=["ID", "TITLE", "GENRE", "DESCRIPTION"])
test_data = pd.read_csv(test_path, sep=":::", engine="python", header=None, names=["ID", "TITLE", "DESCRIPTION"])
test_solution = pd.read_csv(test_solution_path, sep=":::", engine="python", header=None, names=["ID", "TITLE", "GENRE", "DESCRIPTION"])

# Combine title and description for text input
train_data["TEXT"] = train_data["TITLE"] + " " + train_data["DESCRIPTION"]
test_data["TEXT"] = test_data["TITLE"] + " " + test_data["DESCRIPTION"]

# Encode genres to numeric labels
train_data["GENRE_LABEL"] = train_data["GENRE"].factorize()[0]
genre_mapping = dict(enumerate(train_data["GENRE"].factorize()[1]))

print("Genre Mapping:", genre_mapping)


Genre Mapping: {0: ' drama ', 1: ' thriller ', 2: ' adult ', 3: ' documentary ', 4: ' comedy ', 5: ' crime ', 6: ' reality-tv ', 7: ' horror ', 8: ' sport ', 9: ' animation ', 10: ' action ', 11: ' fantasy ', 12: ' short ', 13: ' sci-fi ', 14: ' music ', 15: ' adventure ', 16: ' talk-show ', 17: ' western ', 18: ' family ', 19: ' mystery ', 20: ' history ', 21: ' news ', 22: ' biography ', 23: ' romance ', 24: ' game-show ', 25: ' musical ', 26: ' war '}


In [3]:
from sklearn.model_selection import train_test_split

# Split training data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data["TEXT"], train_data["GENRE_LABEL"], test_size=0.2, random_state=42
)


In [4]:
from transformers import DistilBertTokenizer

# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize text data
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(test_data["TEXT"]), truncation=True, padding=True, max_length=512)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [5]:
import tensorflow as tf

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).batch(16)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
)).batch(16)


In [6]:
from transformers import TFDistilBertForSequenceClassification

# Load pre-trained DistilBERT model for sequence classification
num_labels = len(genre_mapping)
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [7]:
# Train the model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3  # Adjust based on dataset size
)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [8]:
# Evaluate the model
results = model.evaluate(val_dataset)
print("Validation Loss and Accuracy:", results)


Validation Loss and Accuracy: [1.4756394624710083, 0.6360785961151123]


In [None]:
# Convert test_encodings to TensorFlow-compatible format
test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_encodings)).batch(16)

# Predict on test data
test_predictions = model.predict(test_dataset)
test_predicted_labels = tf.argmax(test_predictions.logits, axis=1).numpy()

# Map predicted labels to genre names
test_data["PREDICTED_GENRE_LABEL"] = test_predicted_labels
test_data["PREDICTED_GENRE"] = test_data["PREDICTED_GENRE_LABEL"].map({v: k for k, v in genre_mapping.items()})

# Save predictions for review
predictions_file = os.path.join(result_dir, "predictions.csv")
test_data[["ID", "TITLE", "PREDICTED_GENRE"]].to_csv(predictions_file, index=False)
print(f"Predictions saved to {predictions_file}")


 673/3388 [====>.........................] - ETA: 13:52