In [2]:
# ============================================
# Imports, paths, and load raw data
# ============================================


#import sys
#!{sys.executable} -m pip install --upgrade pip
#!{sys.executable} -m pip install pandas

import os
import sys
import re
from typing import Dict, Tuple, List

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer

# Making sure project root is on path
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from src.config import DATA_DIR 

data_path = DATA_DIR / "CyberBulling_Dataset_Bangla.xlsx"
df_raw = pd.read_excel(data_path)

print("Raw dataframe shape:", df_raw.shape)
print(df_raw.head())

  from .autonotebook import tqdm as notebook_tqdm


Raw dataframe shape: (6010, 3)
   Unnamed: 0                                        Description   Label
0           0  ওই হালার পুত এখন কি মদ খাওয়ার সময় রাতের বেলা...  sexual
1           1  আপনার জন্ম প্রক্রিয়ার সময় আপনার মায়ের ভিতর কি ...  sexual
2           2  ধজভংগ দের আর ভায়াগ্রা লাগবো না। ধংস হোক এই সব ...  sexual
3           3                                     বোকাচোদা একটা।  sexual
4           4  তোর দেশে ফেরার অপেক্ষায় রইলাম। জেলে একটা কামরা...  sexual


In [None]:
# ============================================
#   Text cleaning helpers + quick test
# ============================================

# Regex patterns
URL_PATTERN = re.compile(r"http\S+|www\.\S+")
USERNAME_PATTERN = re.compile(r"@\w+")
MULTI_SPACE_PATTERN = re.compile(r"\s+")

# Decorative / noisy separator patterns:
DECORATIVE_CHARS = "-_=+|<>{}•●♦♠♥★☝✔⚫৤❣॥—০0∆"
SEPARATOR_PATTERN = re.compile(rf"[{re.escape(DECORATIVE_CHARS)}]{{3,}}")
# Long repeats of the SAME non-alphanumeric character
REPEAT_NOISE_PATTERN = re.compile(r"([^A-Za-z0-9\u0980-\u09FF\s])\1{4,}")

def clean_text(text: str) -> str:
    """
    Basic text cleaning for Bangla social media comments.

    Design choices (based on EDA):
    - Remove URLs (they are rarely informative for bullying).
    - Remove @usernames (can leak personal info, not needed for label).
    - Remove decorative separator patterns and long repeated symbols.
    - Preserve emojis and punctuation because they carry
      sentiment/emphasis that may help classification.
    - Preserve numbers (e.g. years, counts) as they may appear in political content.
    - Normalise whitespace.
    """
    if not isinstance(text, str):
        text = str(text)

    text = URL_PATTERN.sub(" ", text)
    text = USERNAME_PATTERN.sub(" ", text)
    text = SEPARATOR_PATTERN.sub(" ", text)
    text = REPEAT_NOISE_PATTERN.sub(" ", text)
    text = text.strip()
    text = MULTI_SPACE_PATTERN.sub(" ", text)

    return text


# Quick test on a few examples
print("=== Cleaning examples ===")
for i in range(5):
    original = str(df_raw["Description"].iloc[i])
    cleaned = clean_text(original)
    print(f"\nOriginal: {original}")
    print(f"Cleaned : {cleaned}")

print("-----------------------------------------------------------------")
print("-----------------------------------------------------------------")

print("=== Cleaning examples (only changed rows) ===")

df_raw["cleaned"] = df_raw["Description"].astype(str).apply(clean_text)
changed = df_raw[df_raw["Description"].astype(str) != df_raw["cleaned"]]

if changed.empty:
    print("No rows changed by cleaning.")
else:
    for _, row in changed.head(5).iterrows():
        original = row["Description"]
        cleaned = row["cleaned"]
        print(f"\nOriginal: {original}")
        print(f"Cleaned : {cleaned}")


=== Cleaning examples ===

Original: ওই হালার পুত এখন কি মদ খাওয়ার সময় রাতের বেলা মদ খাই দিনের বেলাও মাঝেমধ্যে খায় এখন ম*** চ**** সময় safa কে একটু চুদাম যার ইচ্ছা আছে চুদার লাইনে দারা একজন একজন করে জাবি
Cleaned : ওই হালার পুত এখন কি মদ খাওয়ার সময় রাতের বেলা মদ খাই দিনের বেলাও মাঝেমধ্যে খায় এখন ম*** চ**** সময় safa কে একটু চুদাম যার ইচ্ছা আছে চুদার লাইনে দারা একজন একজন করে জাবি

Original: আপনার জন্ম প্রক্রিয়ার সময় আপনার মায়ের ভিতর কি আপনার বাবা হুমায়ুন কবিরের শুক্রাণু ঢুকে ছিল না অন্য কারো । তাইলে আপনি কেম্নে শিউর হইলেন উনিই আপনার বাবা। কারণ ওই কাজের সময়ে তো আপনি দেখেননি।আপনাকে জারজ বা জাউরা বললে কি ভুল হবে? পাক্নামি ছাইড়া দিয়া অভিনয়টা ভাল করে করেন।আমাগো আমজনতারে এইসব শুনাইয়েন না।
Cleaned : আপনার জন্ম প্রক্রিয়ার সময় আপনার মায়ের ভিতর কি আপনার বাবা হুমায়ুন কবিরের শুক্রাণু ঢুকে ছিল না অন্য কারো । তাইলে আপনি কেম্নে শিউর হইলেন উনিই আপনার বাবা। কারণ ওই কাজের সময়ে তো আপনি দেখেননি।আপনাকে জারজ বা জাউরা বললে কি ভুল হবে? পাক্নামি ছাইড়া দিয়া অভিনয়টা ভাল করে করেন।আমাগো আমজনতারে এইসব শুনাইয়েন 

In [8]:
# ============================================
# Label mapping and encoding + test
# ============================================

# Fixed mapping as requested
LABEL2ID: Dict[str, int] = {
    "political": 0,
    "sexual": 1,
    "troll": 2,
    "threat": 3,
    "neutral": 4,
}
ID2LABEL: Dict[int, str] = {v: k for k, v in LABEL2ID.items()}


def encode_labels(labels: pd.Series) -> np.ndarray:
    """Map string labels to numeric ids."""
    labels = labels.astype(str).str.lower()
    encoded = labels.map(LABEL2ID).values
    return encoded


# Quick test on first few labels
print("=== Label encoding test ===")
sample_labels = df_raw["Label"].head(10).astype(str).str.lower()
print("Sample labels:", sample_labels.tolist())
print("Encoded     :", encode_labels(sample_labels))

=== Label encoding test ===
Sample labels: ['sexual', 'sexual', 'sexual', 'sexual', 'sexual', 'sexual', 'sexual', 'sexual', 'sexual', 'sexual']
Encoded     : [1 1 1 1 1 1 1 1 1 1]


In [9]:
# ============================================
# prepare_dataframe() definition + run
# ============================================

def prepare_dataframe(
    df: pd.DataFrame,
    text_col: str = "Description",
    label_col: str = "Label"
) -> pd.DataFrame:
    """
    Select relevant columns, clean text, drop empty/nulls and duplicates,
    and ensure labels are in canonical form.
    """
    # Select relevant columns
    df = df[[text_col, label_col]].copy()

    # Normalise labels to lower-case strings
    df[label_col] = df[label_col].astype(str).str.lower()

    # Text cleaning
    df[text_col] = df[text_col].astype(str).apply(clean_text)

    # Drop rows with empty text or null labels
    df[text_col].replace("", np.nan, inplace=True)
    df.dropna(subset=[text_col, label_col], inplace=True)

    # Drop exact duplicate entries (text + label)
    df.drop_duplicates(subset=[text_col, label_col], inplace=True)

    # Reset index
    df.reset_index(drop=True, inplace=True)

    return df


df = prepare_dataframe(df_raw, text_col="Description", label_col="Label")

print("Prepared dataframe shape:", df.shape)
print(df.head())
print("\nLabel distribution:")
print(df["Label"].value_counts())

Prepared dataframe shape: (5860, 2)
                                         Description   Label
0  ওই হালার পুত এখন কি মদ খাওয়ার সময় রাতের বেলা...  sexual
1  আপনার জন্ম প্রক্রিয়ার সময় আপনার মায়ের ভিতর কি ...  sexual
2  ধজভংগ দের আর ভায়াগ্রা লাগবো না। ধংস হোক এই সব ...  sexual
3                                     বোকাচোদা একটা।  sexual
4  তোর দেশে ফেরার অপেক্ষায় রইলাম। জেলে একটা কামরা...  sexual

Label distribution:
Label
neutral      1200
troll        1197
sexual       1195
threat       1192
political    1076
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[text_col].replace("", np.nan, inplace=True)


In [10]:
# ============================================
# stratified_splits() definition + run
# ============================================

def stratified_splits(
    df: pd.DataFrame,
    text_col: str = "Description",
    label_col: str = "Label",
    train_size: float = 0.70,
    val_size: float = 0.15,
    test_size: float = 0.15,
    random_state: int = 42,
) -> Tuple[pd.Series, pd.Series, pd.Series, np.ndarray, np.ndarray, np.ndarray]:
    """
    Create stratified train/val/test splits.

    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test
    """
    assert np.isclose(train_size + val_size + test_size, 1.0), \
        "train_size + val_size + test_size must equal 1.0"

    # Encode labels to ids
    y_all = encode_labels(df[label_col])
    X_all = df[text_col].values

    # First split: train vs temp
    X_train, X_temp, y_train, y_temp = train_test_split(
        X_all,
        y_all,
        test_size=(1.0 - train_size),
        stratify=y_all,
        random_state=random_state,
    )

    # Second split: val vs test from temp
    relative_test_size = test_size / (test_size + val_size)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp,
        y_temp,
        test_size=relative_test_size,
        stratify=y_temp,
        random_state=random_state,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test


X_train, X_val, X_test, y_train, y_val, y_test = stratified_splits(df)

print("Train size:", len(X_train))
print("Val size  :", len(X_val))
print("Test size :", len(X_test))

# Check label balance in each split
print("\nTrain label distribution:")
print(pd.Series(y_train).value_counts().sort_index())
print("\nVal label distribution:")
print(pd.Series(y_val).value_counts().sort_index())
print("\nTest label distribution:")
print(pd.Series(y_test).value_counts().sort_index())

Train size: 4101
Val size  : 879
Test size : 880

Train label distribution:
0    753
1    836
2    838
3    834
4    840
Name: count, dtype: int64

Val label distribution:
0    162
1    179
2    179
3    179
4    180
Name: count, dtype: int64

Test label distribution:
0    161
1    180
2    180
3    179
4    180
Name: count, dtype: int64


In [11]:
# ============================================
# Keras tokenizer + padded sequences
# ============================================

def build_keras_tokenizer(
    texts: List[str],
    num_words: int = None,
    oov_token: str = "[OOV]"
) -> Tokenizer:
    """
    Fit a Keras Tokenizer on the training texts.
    """
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(texts)
    return tokenizer


def texts_to_padded_sequences(
    tokenizer: Tokenizer,
    texts: List[str],
    max_len: int = 128
) -> np.ndarray:
    """
    Convert a list/array of texts into padded sequences.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")
    return padded


max_len = 128
tokenizer = build_keras_tokenizer(X_train)

X_train_pad = texts_to_padded_sequences(tokenizer, X_train, max_len=max_len)
X_val_pad = texts_to_padded_sequences(tokenizer, X_val, max_len=max_len)
X_test_pad = texts_to_padded_sequences(tokenizer, X_test, max_len=max_len)

print("Padded train shape:", X_train_pad.shape)
print("Padded val shape  :", X_val_pad.shape)
print("Padded test shape :", X_test_pad.shape)

# Look at one example
idx = 0
print("\nExample text:", X_train[idx])
print("Example sequence:", X_train_pad[idx][:30])

Padded train shape: (4101, 128)
Padded val shape  : (879, 128)
Padded test shape : (880, 128)

Example text: এই নাস্তিকের বাচ্চা নাস্তিক জায়স সন্তান তুই কেনো বললি যে তুই পরকাল বিস্বাস করস না তসলিমা তসলিমি নাসরিন যেমন দেশ থেকে বিতারিত করা হয়েছিল তোকেও তেমন এ দেশ থেকে বিতারিত করবো তুই তো একটা বেস্য মাগি তর মুখ দিয়ে তো এই কথাই সোভা পায় কারন মাগি রা আল্লাহ বিস্বাস করে না তুইও বাজারের মাগি
Example sequence: [   5  305  114   78 5835  311   16  566 1823   13   16  109 1323  960
    2  686 5836  803  542   70   14  543   31 2239 5837  746   71   70
   14  543]


In [12]:
# ============================================
# Transformer tokenizer helpers + test
# ============================================

def load_transformer_tokenizer(model_name: str = "xlm-roberta-base"):
    """
    Load a pretrained Hugging Face tokenizer.

    Based on EDA (Bangla + code-mixing), a multilingual subword model such as
    'xlm-roberta-base' or 'bert-base-multilingual-cased' is appropriate.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return tokenizer


def encode_texts_transformer(
    tokenizer,
    texts: List[str],
    max_len: int = 128
) -> Dict[str, np.ndarray]:
    """
    Encode texts using a Hugging Face tokenizer.

    Returns numpy arrays:
        {
            "input_ids": shape (N, max_len),
            "attention_mask": shape (N, max_len)
        }
    """
    encodings = tokenizer(
        list(texts),
        padding="max_length",
        truncation=True,
        max_length=max_len,
        return_tensors=None,
    )

    input_ids = np.array(encodings["input_ids"], dtype=np.int64)
    attention_mask = np.array(encodings["attention_mask"], dtype=np.int64)

    return {"input_ids": input_ids, "attention_mask": attention_mask}


# Load tokenizer and encode a small subset to test (to save time)
hf_model_name = "xlm-roberta-base"
hf_tokenizer = load_transformer_tokenizer(hf_model_name)

sample_texts = list(X_train[:8])
enc_sample = encode_texts_transformer(hf_tokenizer, sample_texts, max_len=128)

print("Sample input_ids shape    :", enc_sample["input_ids"].shape)
print("Sample attention_mask shape:", enc_sample["attention_mask"].shape)
print("\nFirst row input_ids (first 20 tokens):")
print(enc_sample["input_ids"][0][:20])

Sample input_ids shape    : (8, 128)
Sample attention_mask shape: (8, 128)

First row input_ids (first 20 tokens):
[     0   6386   4480 212478  75011   7802 240873   4691   4480  34619
   9976  30414   2801   3458 147345      6  57158   2730  65828   9445]


In [13]:
# ============================================
# wrap up / summary print
# ============================================

print("=== Summary ===")
print("Final prepared df shape:", df.shape)
print("Train/Val/Test sizes:", len(X_train), len(X_val), len(X_test))
print("Keras padded train shape:", X_train_pad.shape)
print("Transformer sample Enc shape:", enc_sample["input_ids"].shape)

=== Summary ===
Final prepared df shape: (5860, 2)
Train/Val/Test sizes: 4101 879 880
Keras padded train shape: (4101, 128)
Transformer sample Enc shape: (8, 128)
