In [None]:
!pip install datasets transformers tensorflow pandas




In [None]:
# Step 1: Imports
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import pandas as pd


In [None]:
# Step 2: Load CSVs
# Read the train / test / validation splits (paths are relative to the
# working directory) in that order.
train_df, test_df, val_df = (
    pd.read_csv(csv_path) for csv_path in ("Train.csv", "Test.csv", "Val.csv")
)


In [None]:
# Quick sanity check: show the first five rows of the training split.
print(train_df.head(n=5))


                                                Data  Label
0  মুগ্ধ হয়ে গেলাম মামু. আর তোমায় কি কমু. বলো তোম...      1
1  এই কুত্তার বাচ্চাদের জন্য দেশটা আজ এমন অবস্তায়...      2
2                          ভাই আপনার কথাই যাদু রয়েছে      1
3                        উওরটা আমার অনেক ভাল লেগেছে       1
4  আমার নিজের গাড়ী নিয়ে কি সাজেক যেতে পারবো না ?...      0


In [None]:
# Step 3: Convert to Hugging Face Datasets
# Wrap every pandas split in a Hugging Face Dataset (train, test, validation order).
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df, val_df)
)

# Bundle the three splits under the split names used further down.
dataset = DatasetDict(
    {"train": train_dataset, "test": test_dataset, "validation": val_dataset}
)


In [None]:
# Step 4: Load Tokenizer and Model
# Checkpoint to fine-tune; BanglaBERT is pretrained on a Bengali corpus.
model_name = "csebuetnlp/banglabert"
# model_name = "xlm-roberta-base" # Multilingual including Bengali

# Tokenizer and classifier are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# from_pt=True loads the TF model from the checkpoint's PyTorch weights;
# num_labels=3 sizes the classification head for the three label ids (0, 1, 2).
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    from_pt=True,
    num_labels=3,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'electra.embeddings.position_ids']
- This IS expected if you are initializing TFElectraForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
So

In [None]:
# Step 5: Preprocessing Function
def preprocess_function(examples):
    """Tokenize the "Data" field of a batch of examples.

    Every text is truncated and padded to exactly 128 tokens, so all encoded
    rows share the same length.

    Args:
        examples: Mapping with a "Data" entry holding the raw text(s).

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for those texts.
    """
    texts = examples["Data"]
    encoded = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Step 6: Tokenize the Dataset
# batched=True makes map() pass groups of rows to preprocess_function per call;
# the tokenizer output is added to every split of the DatasetDict.
encoded_dataset = dataset.map(function=preprocess_function, batched=True)


Map:   0%|          | 0/12575 [00:00<?, ? examples/s]

Map:   0%|          | 0/1586 [00:00<?, ? examples/s]

Map:   0%|          | 0/1567 [00:00<?, ? examples/s]

In [None]:
# Step 7: Convert to TensorFlow Datasets (Manual tf.data.Dataset conversion)
def convert_to_tf_dataset(encoded_split, shuffle=True, batch_size=64, shuffle_buffer=None):
    """Build a batched ``tf.data.Dataset`` of (features, label) pairs from one split.

    Args:
        encoded_split: Tokenized split exposing "input_ids", "attention_mask"
            and "Label" columns.
        shuffle: Shuffle the examples before batching. Defaults to True; pass
            False for evaluation splits, where a random order brings nothing.
        batch_size: Number of examples per batch.
        shuffle_buffer: Size of the shuffle buffer. Defaults to the number of
            examples in the split, so the whole split is mixed instead of only
            a small sliding window of it.

    Returns:
        A ``tf.data.Dataset`` yielding
        ``({"input_ids": ..., "attention_mask": ...}, label)`` batches with
        int32 features and int64 labels.
    """
    # Column access reads each field once instead of iterating over (and
    # materialising) every full row three times.
    input_ids = list(encoded_split["input_ids"])
    attention_mask = list(encoded_split["attention_mask"])
    labels = list(encoded_split["Label"])

    # Named tf_dataset so it does not shadow the notebook-level `dataset`.
    tf_dataset = tf.data.Dataset.from_tensor_slices((
        {
            "input_ids": tf.convert_to_tensor(input_ids, dtype=tf.int32),
            "attention_mask": tf.convert_to_tensor(attention_mask, dtype=tf.int32),
        },
        tf.convert_to_tensor(labels, dtype=tf.int64),
    ))

    if shuffle:
        # shuffle() needs a buffer of at least 1, even for an empty split.
        buffer_size = shuffle_buffer if shuffle_buffer is not None else max(len(labels), 1)
        tf_dataset = tf_dataset.shuffle(buffer_size)

    return tf_dataset.batch(batch_size)

train_tf_dataset = convert_to_tf_dataset(encoded_dataset["train"])
# Evaluation splits keep their original row order.
val_tf_dataset = convert_to_tf_dataset(encoded_dataset["validation"], shuffle=False)
test_tf_dataset = convert_to_tf_dataset(encoded_dataset["test"], shuffle=False)


In [None]:
# Step 8: Compile the Model
# Labels are integer class ids, hence the sparse cross-entropy; from_logits=True
# tells the loss to expect unnormalised scores rather than probabilities.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    metrics=["accuracy"],
)


In [None]:
# Step 9: Train the Model
# Both callbacks watch the same quantity (validation loss). If they monitor
# different metrics, the weights EarlyStopping restores in memory and the
# checkpoint written to 'best_model' can come from different epochs.

# Stop after 2 epochs without a val_loss improvement and roll the in-memory
# model back to the best epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
    verbose=1
)

# Persist the model to 'best_model' each time val_loss improves.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_model',
    monitor='val_loss',
    save_best_only=True,
    save_format='tf',
    verbose=1
)

# Train for at most 10 epochs; early stopping usually ends the run sooner.
history = model.fit(
    train_tf_dataset,
    validation_data=val_tf_dataset,
    epochs=10,
    callbacks=[early_stop, model_checkpoint],
    verbose=1
)


Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.72431, saving model to best_model
Epoch 2/10
Epoch 2: val_accuracy improved from 0.72431 to 0.74984, saving model to best_model
Epoch 3/10
Epoch 3: val_accuracy improved from 0.74984 to 0.75622, saving model to best_model
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.75622
Epoch 4: early stopping
Restoring model weights from the end of the best epoch: 2.


In [None]:
# Print every tracked metric with its per-epoch values, one metric per line.
for metric_name, per_epoch_values in history.history.items():
    print(metric_name, per_epoch_values, sep=": ")


loss: [0.815580427646637, 0.601538360118866, 0.4529055058956146, 0.33370450139045715]
accuracy: [0.6450099349021912, 0.7575348019599915, 0.8305367827415466, 0.885407567024231]
val_loss: [0.6569945812225342, 0.6059777736663818, 0.6536471843719482, 0.7329315543174744]
val_accuracy: [0.7243139743804932, 0.7498404383659363, 0.7562220692634583, 0.7466496229171753]


In [None]:
# Step 10: Evaluate on test set
# evaluate() returns the loss first, followed by the compiled metrics
# (here: accuracy).
results = model.evaluate(test_tf_dataset)
test_loss, test_accuracy = results[0], results[1]
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)


Test Loss: 0.6536477208137512
Test Accuracy: 0.7395964860916138


In [None]:
def predict(text: str):
    """Return the predicted class id for a single piece of text.

    The text is tokenized the same way as the training data (truncated and
    padded to 128 tokens), passed through the notebook-level ``model``, and
    the index of the largest logit is returned.

    Args:
        text: Raw input text to classify.

    Returns:
        The class id with the highest logit for the first (only) row.
    """
    # Encode as TensorFlow tensors with a fixed length of 128 tokens.
    encoded = tokenizer(
        text,
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="tf",
    )

    # Forward pass; only the raw class scores are needed.
    scores = model(encoded).logits

    # argmax over the class axis, then take the single row of the batch.
    best_per_row = tf.argmax(scores, axis=1).numpy()
    return best_per_row[0]

# Human-readable names for the class ids, in id order (0, 1, 2).
id2label = dict(enumerate(("Neutral", "Positive", "Negative")))

# Example inputs; swap the active line to try another sentence.
# text = "আমি আজ সিনেমা দেখেছি।"
# text = "অভিনয়টা খুবই ভালো লেগেছে।"
text = "বইটা পড়তে পেরে আমি আশাহত ।"

# Classify the example and report both the raw id and its label.
pred = predict(text)
print(f"Predicted class ID: {pred} -> Label: {id2label[pred]}")


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


Predicted class ID: 2 -> Label: Negative
