In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import BertTokenizer, TFBertForSequenceClassification

In [2]:
# Read the labelled training tweets from the Colab working directory.
train_csv_path = '/content/train_E6oV3lV.csv'
train_df = pd.read_csv(train_csv_path)

In [3]:
# Hold out 20% of the labelled tweets for validation; the fixed random_state
# keeps the split the same from run to run.
all_tweets = train_df['tweet']
all_labels = train_df['label']
X_train, X_val, y_train, y_val = train_test_split(
    all_tweets,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [5]:
def tokenize_tweets(tweets):
    """Encode a collection of tweets with the notebook-level `tokenizer`.

    Args:
        tweets: Object exposing ``.tolist()`` (the notebook passes pandas
            Series of tweet text).

    Returns:
        Whatever `tokenizer` returns for the list of texts, requested as
        TensorFlow tensors with padding and truncation enabled and
        ``max_length=200``.
    """
    texts = tweets.tolist()
    encoded = tokenizer(
        texts,
        max_length=200,
        truncation=True,
        padding=True,
        return_tensors='tf',
    )
    return encoded


In [6]:
# Tokenize each split separately with the same helper.
train_encodings, val_encodings = tokenize_tweets(X_train), tokenize_tweets(X_val)

In [7]:
# Pair the encoded training tweets with their labels, shuffle with a
# 1000-element buffer, and group into batches of 32.
train_features = dict(train_encodings)
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_features, y_train))
    .shuffle(1000)
    .batch(32)
)

In [8]:
# Validation batches: same pairing and batch size as training, but unshuffled.
val_features = dict(val_encodings)
val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_features, y_val))
    .batch(32)
)

In [9]:
# Size the classification head from the labels present in the training file
# instead of a hard-coded 8, so the number of output logits follows the data.
# Sparse categorical cross-entropy indexes logits by class id, so the head
# needs (largest id + 1) outputs; the floor of 2 keeps it a classifier even if
# only class 0 appears.
# NOTE(review): assumes `label` holds non-negative integer class ids — confirm.
num_labels = max(int(train_df['label'].max()) + 1, 2)
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Training configuration: Adam at a 5e-5 learning rate, a sparse categorical
# cross-entropy loss configured with from_logits=True, and accuracy as the
# reported metric.
adam = tf.keras.optimizers.Adam(learning_rate=5e-5)
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam, loss=sparse_ce, metrics=['accuracy'])

In [11]:
# Fine-tune for 5 epochs on the training batches, with the validation batches
# supplied as validation data. Kept as the cell's final expression so the
# returned History object is displayed.
model.fit(
    train_dataset,
    epochs=5,
    validation_data=val_dataset,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7bf533298640>

In [13]:
# Read the test tweets from the Colab working directory.
test_csv_path = '/content/test_tweets_anuFYb8.csv'
test_df = pd.read_csv(test_csv_path)

In [14]:
# Tokenize the test tweets with the same helper used for train/validation.
test_tweets = test_df['tweet']
test_encodings = tokenize_tweets(test_tweets)

In [15]:
# Feed the complete encoding (every tensor the tokenizer produced, not just
# `input_ids`) so inference sees the same inputs the model was trained on:
# the training and validation datasets were built from dict(<encodings>).
# Passing only `input_ids` drops the attention mask, so padded positions
# would no longer be masked out at prediction time.
test_predictions = model.predict(dict(test_encodings))



In [16]:
# Predicted class for each tweet = index of the largest logit along axis 1.
test_logits = test_predictions.logits
test_predictions_labels = np.argmax(test_logits, axis=1)

In [38]:
def calculate_metrics(y_true, y_pred):
    """Print accuracy, a classification report and a confusion matrix.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None. The three metrics are written to stdout only.
    """
    # Compute everything first so nothing is printed if a metric call raises.
    acc = accuracy_score(y_true, y_pred)
    report_text = classification_report(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)

    print(f'Accuracy on test set: {acc:.4f}')
    for heading, payload in (
        ('Classification Report:\n', report_text),
        ('Confusion Matrix:\n', matrix),
    ):
        print(heading, payload)


In [39]:
# Branch on whether ground-truth labels came with the test file
if 'label' not in test_df.columns:
    # No ground truth: attach the predictions and preview the first rows
    test_df['predicted_label'] = test_predictions_labels
    preview_cols = ['tweet', 'predicted_label']
    print(test_df[preview_cols].head())
else:
    # Ground truth present: take it as a NumPy array and score the predictions
    y_test = test_df['label'].values
    calculate_metrics(y_test, test_predictions_labels)


                                               tweet  predicted_label
0  #studiolife #aislife #requires #passion #dedic...                0
1   @user #white #supremacists want everyone to s...                1
2  safe ways to heal your #acne!!    #altwaystohe...                0
3  is the hp and the cursed child book up for res...                0
4    3rd #bihday to my amazing, hilarious #nephew...                0


In [40]:
# Attach the predicted labels to the test frame and write it to CSV
# (index column omitted), then confirm on stdout.
test_df['predicted_label'] = test_predictions_labels
predictions_csv = 'bert_tweet_test_predictions.csv'
test_df.to_csv(predictions_csv, index=False)
print("Predictions saved to 'bert_tweet_test_predictions.csv'")

Predictions saved to 'bert_tweet_test_predictions.csv'
