In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1) Read the balanced news-sentiment dataset from disk
NEWS_CSV_PATH = "../Data/NLP/news_sentiment_balanced.csv"
df = pd.read_csv(NEWS_CSV_PATH)

In [3]:
# 2) Merge the cleaned text fields into one space-separated 'text' column
#    (edit the tuple below if your column names differ)
text_columns = ("title_clean", "description_clean", "content_clean")

# Missing values become empty strings so the concatenation never yields NaN.
filled_parts = [df[name].fillna("") for name in text_columns]

combined_text = filled_parts[0]
for part in filled_parts[1:]:
    combined_text = combined_text + " " + part

df["text"] = combined_text

In [4]:
# 3) Map each sentiment value to an integer class id
#    `le` keeps the fitted mapping (le.classes_) for later use.
le = LabelEncoder().fit(df["sentiment"])
df["label"] = le.transform(df["sentiment"])

In [5]:
# 4) Hold out 20% of the rows for validation, stratified on the label
all_texts = df["text"].tolist()
all_labels = df["label"].tolist()

split_parts = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,        # fixed seed so the split is repeatable
    stratify=df["label"],
)
train_texts, val_texts, train_labels, val_labels = split_parts

In [6]:
# 5) Compute class weights on the training labels
# The class ids are taken from the training labels themselves, and the same
# array is reused below so that each weight stays paired with its own id.
train_classes = np.unique(train_labels)
weights = compute_class_weight(
    class_weight="balanced",
    classes=train_classes,
    y=train_labels
)
# Key the weights by the actual class id instead of by position: if an id were
# ever absent from the training labels, positional keys (enumerate) would
# silently shift the remaining weights onto the wrong classes.
class_weight_dict = dict(zip(train_classes.tolist(), weights))

In [7]:
# 6) Tokenize both splits with the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Identical settings for both splits: truncate to at most 64 tokens and pad.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=64)
train_enc = tokenizer(train_texts, **tokenize_kwargs)
val_enc = tokenizer(val_texts, **tokenize_kwargs)

def to_tf_dataset(encodings, labels):
    """Build a tf.data.Dataset of (features, label) pairs.

    Args:
        encodings: Mapping-like tokenizer output; it is converted to a plain
            dict before slicing.
        labels: Sequence of labels, converted to a tensor.

    Returns:
        The dataset produced by ``tf.data.Dataset.from_tensor_slices`` over
        the ``(features, labels)`` tuple.
    """
    features = dict(encodings)
    targets = tf.convert_to_tensor(labels)
    return tf.data.Dataset.from_tensor_slices((features, targets))

BATCH_SIZE = 8
SHUFFLE_BUFFER = 1000

# Only the training stream gets a shuffle step; validation is just batched.
train_ds = to_tf_dataset(train_enc, train_labels)
train_ds = train_ds.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE)

val_ds = to_tf_dataset(val_enc, val_labels)
val_ds = val_ds.batch(BATCH_SIZE)



In [8]:
# 7) Load and compile DistilBERT for classification
# Record the LabelEncoder's id <-> name mapping in the model config. Without it
# the config only carries generic LABEL_<i> names, so the sentiment names would
# not travel with the saved model.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(le.classes_),
    id2label=id2label,
    label2id=label2id
)

# The loss is configured with from_logits=True, i.e. it is given the model's
# raw outputs rather than probabilities.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [9]:
# 8) Fine-tune with early stopping on validation loss and per-class weights
MAX_EPOCHS = 10

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
)

history = model.fit(
    train_ds,
    epochs=MAX_EPOCHS,
    validation_data=val_ds,
    class_weight=class_weight_dict,
    callbacks=[early_stop],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


In [10]:
# 9) Persist the fine-tuned model and its tokenizer side by side
SAVE_DIR = "./Models/saved_model_distilbert_balanced"

model.save_pretrained(SAVE_DIR)
# Kept as the last expression so the cell displays the saved file paths.
tokenizer.save_pretrained(SAVE_DIR)

('./Models/saved_model_distilbert_balanced\\tokenizer_config.json',
 './Models/saved_model_distilbert_balanced\\special_tokens_map.json',
 './Models/saved_model_distilbert_balanced\\vocab.txt',
 './Models/saved_model_distilbert_balanced\\added_tokens.json')