In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from google.colab import drive

# 1. Mount Google Drive at /content/drive; later cells read the dataset from, and write
#    model artifacts to, paths under this mount point.
drive.mount('/content/drive')





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3


In [None]:
# 1. Uninstall the libraries causing conflicts (to be safe)
!pip uninstall -y transformers datasets

# 2. Install datasets and transformers, ignoring deep dependencies that cause the error
!pip install datasets --no-deps
!pip install transformers[tf] --no-deps

Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
Found existing installation: datasets 4.4.0
Uninstalling datasets-4.4.0:
  Successfully uninstalled datasets-4.4.0
Collecting datasets
  Using cached datasets-4.4.0-py3-none-any.whl.metadata (19 kB)
Using cached datasets-4.4.0-py3-none-any.whl (511 kB)
Installing collected packages: datasets
Successfully installed datasets-4.4.0
Collecting transformers[tf]
  Using cached transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Using cached transformers-4.57.1-py3-none-any.whl (12.0 MB)
Installing collected packages: transformers
Successfully installed transformers-4.57.1


In [None]:
# Load the tweets dataset from the mounted Drive (a zipped CSV read directly by pandas).
# NOTE(review): the folder here is "DATASCIENCE " (trailing space) while the cells that save
# artifacts use "DATASCIENCE" — confirm both paths point where intended.
df = pd.read_csv(
    "/content/drive/MyDrive/My_works/DATASCIENCE /datasetoftweets.zip"
)

In [None]:
# Give the two columns short, stable names used by every later cell
df.columns = ['tweet', 'type']

In [None]:
# Lower-case every tweet with the vectorised string accessor. Unlike
# `.apply(lambda x: x.lower())`, this does not raise on missing values (they stay missing).
df['tweet'] = df['tweet'].str.lower()

In [None]:
# Preview the first rows to sanity-check the renamed columns and the lower-cased text
df.head()

Unnamed: 0,tweet,type
0,"in other words #katandandre, your food was cra...",not_cyberbullying
1,why is #aussietv so white? #mkr #theblock #ima...,not_cyberbullying
2,@xochitlsuckkks a classy whore? or more red ve...,not_cyberbullying
3,"@jason_gio meh. :p thanks for the heads up, b...",not_cyberbullying
4,@rudhoeenglish this is an isis account pretend...,not_cyberbullying


#Skipping the removal of special characters: BERT is pretrained to handle such characters.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Column roles: 'tweet' carries the text, 'type' carries the multiclass label
TEXT_COLUMN = 'tweet'
LABEL_COLUMN = 'type'

# 1. Multiclass label encoding: learn the string classes, then map each row to its integer id
label_encoder = LabelEncoder()
label_encoder.fit(df[LABEL_COLUMN])
df['target_id'] = label_encoder.transform(df[LABEL_COLUMN])

# Keep the class names (in encoded order) and the number of classes for later cells
unique_classes = label_encoder.classes_
num_classes = len(unique_classes)

# 2. Stratified train/test split: 20% held out, fixed seed, class proportions preserved
#    in both parts via `stratify`
X_train_text, X_test_text, y_train_id, y_test_id = train_test_split(
    df[TEXT_COLUMN], df['target_id'],
    stratify=df['target_id'],
    random_state=42,
    test_size=0.2,
)

print("Label Encoding and Stratified Split Complete.")
print(f"Total classes: {num_classes}. Class names: {unique_classes}")

Label Encoding and Stratified Split Complete.
Total classes: 6. Class names: ['age' 'ethnicity' 'gender' 'not_cyberbullying' 'other_cyberbullying'
 'religion']


In [None]:
from transformers import AutoTokenizer
import tensorflow as tf

# --- 1. Tokenizer configuration ---
MAX_LENGTH = 100  # sequence length, in tokens, used for every BERT input
MODEL_NAME = 'bert-base-uncased'

# Fetch the tokenizer that belongs to the pretrained checkpoint named above
bert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_data(texts, labels):
    """Tokenize ``texts`` with ``bert_tokenizer`` and pair them with ``labels``.

    Args:
        texts: object exposing ``.tolist()`` (e.g. a pandas Series) that yields the raw strings.
        labels: object exposing ``.values`` (e.g. a pandas Series) with one label per text.

    Returns:
        An unbatched ``tf.data.Dataset`` of ``(features, label)`` pairs, where ``features``
        is the dict form of the tokenizer output.
    """
    raw_texts = texts.tolist()

    # One tokenizer call handles tokenization, truncation and padding to MAX_LENGTH
    tokenized = bert_tokenizer(
        raw_texts,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length',
    )

    features = dict(tokenized)
    return tf.data.Dataset.from_tensor_slices((features, labels.values))

# --- 2. Create Datasets ---
BATCH_SIZE = 32

# Shuffle with a buffer that spans the whole training set: tf.data only shuffles uniformly
# when the buffer is at least as large as the dataset, whereas a 1000-element buffer merely
# mixes neighbouring examples. The seed matches the split's random_state.
train_dataset = (
    tokenize_data(X_train_text, y_train_id)
    .shuffle(buffer_size=len(X_train_text), seed=42)
    .batch(BATCH_SIZE)
)

# The test set is batched but never shuffled, so predictions stay aligned with y_test_id
test_dataset = tokenize_data(X_test_text, y_test_id).batch(BATCH_SIZE)

print("\nBERT Tokenization and Dataset Creation Complete.")
print(f"BERT will use sequences of length: {MAX_LENGTH} tokens.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



BERT Tokenization and Dataset Creation Complete.
BERT will use sequences of length: 100 tokens.


In [None]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification, create_optimizer
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# --- 1. Load the pretrained model with a fresh classification head ---
MODEL_NAME = 'bert-base-uncased'
# Derive the number of labels from the fitted encoder instead of hard-coding 6, so the
# classifier head always matches the classes actually present in the data.
num_classes = len(label_encoder.classes_)
bert_model = TFBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_classes,
    from_pt=False,
    use_safetensors=False
)

# --- 2. Define Learning Rate and Optimizer Steps ---
# The schedule length is (batches per epoch) * (epochs); train_dataset is already batched,
# so its cardinality is the number of optimizer steps per epoch.
EPOCHS_BERT = 3
steps_per_epoch = int(tf.data.experimental.cardinality(train_dataset).numpy())
total_train_steps = steps_per_epoch * EPOCHS_BERT

# --- 3. Define the BERT-Specific Optimizer ---
# create_optimizer returns the optimizer together with its learning-rate schedule.
# NOTE(review): weight_decay_rate is not passed, so the utility's default applies; pass it
# explicitly if weight decay is actually wanted — confirm against the transformers docs.
bert_optimizer, bert_lr_scheduler = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,  # no warm-up phase
    num_train_steps=total_train_steps
)

# --- 4. Compile the Model ---
bert_model.compile(
    optimizer=bert_optimizer,
    # The model outputs raw logits, hence from_logits=True
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

print("BERT Classification Model Architecture Defined and Compiled Successfully.")
bert_model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Classification Model Architecture Defined and Compiled Successfully.
Model: "tf_bert_for_sequence_classification_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_303 (Dropout)       multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  4614      
                                                                 
Total params: 109486854 (417.66 MB)
Trainable params: 109486854 (417.66 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
import os
import tensorflow as tf
from google.colab import drive

# --- 1. Remount Drive (Safety check) ---
drive.mount('/content/drive')

# --- 2. Training Setup (No Callbacks) ---
# Uses the compiled bert_model and the batched train_dataset / test_dataset built in the
# earlier cells (the datasets carry their own batch size, so none is passed to fit()).
# EPOCHS_BERT should equal the value used to size the learning-rate schedule when the
# optimizer was created.
EPOCHS_BERT = 3

# Define Base Path for Saving Weights after each epoch
DRIVE_TARGET_DIR = '/content/drive/MyDrive/My_works/DATASCIENCE'
os.makedirs(DRIVE_TARGET_DIR, exist_ok=True)

print(f"\nStarting BERT Fine-Tuning for fixed {EPOCHS_BERT} epochs (No Callbacks)...")
print("We will manually save the model weights after each epoch.")

# --- 3. Manual Training Loop (Iterating over epochs) ---
epoch_histories = []  # one metrics dict per completed epoch
for epoch in range(EPOCHS_BERT):
    print(f"\n--- Starting Epoch {epoch + 1}/{EPOCHS_BERT} ---")

    # Train for exactly one epoch. In Keras, `epochs` is the index of the FINAL epoch, not a
    # count, so it must be initial_epoch + 1; with epochs=1 every iteration after the first
    # would return immediately without training.
    history = bert_model.fit(
        train_dataset,
        validation_data=test_dataset,
        initial_epoch=epoch,
        epochs=epoch + 1,
    )
    epoch_histories.append(history.history)

    # Manual save after each epoch completes. The model is a subclassed Keras model, which
    # cannot be serialised with model.save() to HDF5 (NotImplementedError); saving only the
    # weights is supported.
    EPOCH_MODEL_NAME = f'bert_epoch_{epoch + 1}_weights.h5'
    EPOCH_SAVE_PATH = os.path.join(DRIVE_TARGET_DIR, EPOCH_MODEL_NAME)
    bert_model.save_weights(EPOCH_SAVE_PATH)
    print(f"✅ Saved weights for Epoch {epoch + 1} to Drive.")

print(f"\nBERT Fine-Tuning Complete. {EPOCHS_BERT} weight files saved.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Starting BERT Fine-Tuning for fixed 3 epochs (No Callbacks)...
We will manually save the model after each epoch.

--- Starting Epoch 1/3 ---


  saving_api.save_model(


NotImplementedError: Saving the model to HDF5 format requires the model to be a Functional model or a Sequential model. It does not work for subclassed models, because such models are defined via the body of a Python method, which isn't safely serializable. Consider saving to the Tensorflow SavedModel format (by setting save_format="tf") or using `save_weights`.

In [None]:
# Assuming the bert_model is still loaded in your current session's memory after the failed save.
from sklearn.metrics import classification_report # Import classification_report

# --- 1. Define Evaluation Function ---
def evaluate_current_bert_model(model_instance, unique_classes, test_data=None, y_true=None):
    """Evaluate an in-memory BERT classifier on held-out data and print a report.

    Args:
        model_instance: model whose ``predict(...)`` result exposes ``.logits``.
        unique_classes: class names, in encoded-label order, used as report row names.
        test_data: batched, unshuffled inputs to predict on. Defaults to the notebook's
            ``test_dataset``.
        y_true: true integer labels in the same order as ``test_data``. Defaults to the
            notebook's ``y_test_id``.

    Returns:
        dict with 'Macro F1', 'Not Cyberbullying F1', 'Other Cyberbullying F1' and
        'Overall Accuracy'.

    Raises:
        KeyError: if 'not_cyberbullying' or 'other_cyberbullying' is not among the classes.
    """
    # Fall back to the datasets created earlier in the notebook rather than relying on
    # tensors that no cell defines.
    if test_data is None:
        test_data = test_dataset
    if y_true is None:
        y_true = y_test_id

    print("\n--- Evaluating BERT Model currently in memory (End of Epoch 1) ---")

    # predict() needs no compile step; the model is deliberately NOT re-compiled here,
    # because compiling without an optimizer would replace the fine-tuning optimizer.
    y_pred_logits = model_instance.predict(test_data, verbose=0).logits
    y_pred_classes = np.argmax(y_pred_logits, axis=1)

    # Dict form of the report, used to pull out individual scores
    report = classification_report(
        y_true, y_pred_classes, target_names=unique_classes, output_dict=True
    )

    # Store key metrics
    metrics = {
        'Macro F1': report['macro avg']['f1-score'],
        'Not Cyberbullying F1': report['not_cyberbullying']['f1-score'],
        'Other Cyberbullying F1': report['other_cyberbullying']['f1-score'],
        'Overall Accuracy': report['accuracy']
    }

    print(f"Overall Accuracy: {metrics['Overall Accuracy']:.4f}")
    print(f"Macro F1: {metrics['Macro F1']:.4f}")
    print("Classification Report:")
    # Text form of the same report for display
    print(classification_report(y_true, y_pred_classes, target_names=unique_classes))
    return metrics

# Run the evaluation on bert_model and keep the returned headline metrics
metrics_epoch_1 = evaluate_current_bert_model(bert_model, unique_classes)


--- Evaluating BERT Model currently in memory (End of Epoch 1) ---
Overall Accuracy: 0.8645
Macro F1: 0.8630
Classification Report:
                     precision    recall  f1-score   support

                age       0.99      0.98      0.98      1598
          ethnicity       0.99      0.97      0.98      1592
             gender       0.87      0.89      0.88      1595
  not_cyberbullying       0.68      0.65      0.66      1589
other_cyberbullying       0.70      0.71      0.71      1565
           religion       0.94      0.98      0.96      1600

           accuracy                           0.86      9539
          macro avg       0.86      0.86      0.86      9539
       weighted avg       0.86      0.86      0.86      9539



In [None]:
from transformers import TFBertForSequenceClassification, create_optimizer
import tensorflow as tf
import pickle
import os
from google.colab import drive

# --- 1. Remount Drive and Setup Paths ---
drive.mount('/content/drive')
DRIVE_TARGET_DIR = '/content/drive/MyDrive/My_works/DATASCIENCE'
HISTORY_SAVE_PATH = os.path.join(DRIVE_TARGET_DIR, 'bert_training_history.pkl')
# Make sure the target folder exists before anything is written into it
os.makedirs(DRIVE_TARGET_DIR, exist_ok=True)

# --- 2. Reload Fresh BERT Model (Resets weights to pre-trained state) ---
# Requires num_classes, train_dataset and test_dataset from the earlier cells.
MODEL_NAME = 'bert-base-uncased'
EPOCHS_BERT_HISTORY = 1 # Only run one epoch for history collection

bert_model_fresh = TFBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_classes,
    from_pt=False,
    use_safetensors=False
)

# --- 3. Define and Compile with the Optimizer Utility ---
# Schedule length = batches per epoch * epochs (train_dataset is already batched)
total_train_steps = int(tf.data.experimental.cardinality(train_dataset).numpy()) * EPOCHS_BERT_HISTORY
bert_optimizer_fixed, _ = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps
)

bert_model_fresh.compile(
    optimizer=bert_optimizer_fixed,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# --- 4. Train and Collect History ---
print("\nStarting FRESH BERT Training run to collect history plot data (1 Epoch)...")

# Train on the batched datasets built earlier; batch_size is not passed because the
# datasets are already batched.
history_bert_plot = bert_model_fresh.fit(
    train_dataset,
    epochs=EPOCHS_BERT_HISTORY,
    validation_data=test_dataset,
)

# --- 5. Save History Plot Data ---
# Only the plain metrics dict is pickled, not the History object itself
with open(HISTORY_SAVE_PATH, 'wb') as file:
    pickle.dump(history_bert_plot.history, file)

print(f"\n✅ BERT Training History for plot successfully saved to Drive: {HISTORY_SAVE_PATH}")
print("You now have all necessary files to complete your project report.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting FRESH BERT Training run to collect history plot data (1 Epoch)...

✅ BERT Training History for plot successfully saved to Drive: /content/drive/MyDrive/My_works/DATASCIENCE/bert_training_history.pkl
You now have all necessary files to complete your project report.


In [None]:
from sklearn.metrics import classification_report
import numpy as np
import tensorflow as tf

# bert_model_fresh (trained in the previous cell) is the model to evaluate.
# test_dataset, y_test_id and unique_classes come from the earlier cells.

def evaluate_bert_champion(model_instance, unique_classes, test_data=None, y_true=None):
    """Run the final evaluation of a BERT classifier on held-out data and print a report.

    Args:
        model_instance: model whose ``predict(...)`` result exposes ``.logits``.
        unique_classes: class names, in encoded-label order, used as report row names.
        test_data: batched, unshuffled inputs to predict on. Defaults to the notebook's
            ``test_dataset``.
        y_true: true integer labels in the same order as ``test_data``. Defaults to the
            notebook's ``y_test_id``.

    Returns:
        dict with 'Macro F1', 'Not Cyberbullying F1', 'Other Cyberbullying F1' and
        'Overall Accuracy'.

    Raises:
        KeyError: if 'not_cyberbullying' or 'other_cyberbullying' is not among the classes.
    """
    # Default to the datasets created earlier in the notebook rather than relying on
    # tensors that no cell defines.
    if test_data is None:
        test_data = test_dataset
    if y_true is None:
        y_true = y_test_id

    print("\n--- Final Evaluation: BERT Champion Model ---")

    # No compile() call: prediction does not need one, and compiling without an optimizer
    # would replace the optimizer the model was trained with.
    y_pred_logits = model_instance.predict(test_data, verbose=0).logits
    y_pred_classes = np.argmax(y_pred_logits, axis=1)

    # Dict form of the report, used to pull out individual scores
    report = classification_report(
        y_true, y_pred_classes, target_names=unique_classes, output_dict=True
    )

    # Store key metrics
    metrics = {
        'Macro F1': report['macro avg']['f1-score'],
        'Not Cyberbullying F1': report['not_cyberbullying']['f1-score'],
        'Other Cyberbullying F1': report['other_cyberbullying']['f1-score'],
        'Overall Accuracy': report['accuracy']
    }

    print(f"Overall Accuracy: {metrics['Overall Accuracy']:.4f}")
    print(f"Macro F1: {metrics['Macro F1']:.4f}")
    print("Classification Report:")
    # Text form of the same report for display
    print(classification_report(y_true, y_pred_classes, target_names=unique_classes))
    return metrics

# Run the final evaluation on bert_model_fresh and keep the returned headline metrics
final_bert_metrics = evaluate_bert_champion(bert_model_fresh, unique_classes)


--- Final Evaluation: BERT Champion Model ---
Overall Accuracy: 0.8682
Macro F1: 0.8646
Classification Report:
                     precision    recall  f1-score   support

                age       0.99      0.98      0.98      1598
          ethnicity       0.98      0.97      0.98      1592
             gender       0.88      0.90      0.89      1595
  not_cyberbullying       0.77      0.55      0.64      1589
other_cyberbullying       0.66      0.82      0.73      1565
           religion       0.94      0.98      0.96      1600

           accuracy                           0.87      9539
          macro avg       0.87      0.87      0.86      9539
       weighted avg       0.87      0.87      0.87      9539



In [None]:
import os
from google.colab import drive
import tensorflow as tf

# --- 1. Remount Drive (Safety Check) ---
# Ensure your drive is mounted before saving
drive.mount('/content/drive')

# --- 2. Define File Path on Drive ---
DRIVE_TARGET_DIR = '/content/drive/MyDrive/My_works/DATASCIENCE'
os.makedirs(DRIVE_TARGET_DIR, exist_ok=True)

CHAMPION_WEIGHTS_NAME = 'bert_champion_epoch_1_weights.h5'
SAVE_PATH = os.path.join(DRIVE_TARGET_DIR, CHAMPION_WEIGHTS_NAME)

# --- 3. Save Weights Manually ---
# bert_model_fresh is the model whose weights are written. save_weights is used because
# the subclassed BERT model cannot be saved as a full HDF5 model.
try:
    bert_model_fresh.save_weights(SAVE_PATH)
except Exception as e:
    # Report the failure, then re-raise so the cell visibly fails with a full traceback
    # instead of looking as if it finished normally.
    print(f"❌ CRITICAL ERROR: Could not save weights. {e}")
    raise
else:
    print(f"✅ CHAMPION BERT weights (Epoch 1) successfully saved to Drive: {SAVE_PATH}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ CHAMPION BERT weights (Epoch 1) successfully saved to Drive: /content/drive/MyDrive/My_works/DATASCIENCE/bert_champion_epoch_1_weights.h5
