In [1]:
!pip install num2words
!pip install emoji

Collecting num2words
  Downloading num2words-0.5.14-py3-none-any.whl.metadata (13 kB)
Collecting docopt>=0.6.2 (from num2words)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading num2words-0.5.14-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.5/163.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=c336b7c8237d975abd017f7c90076308043f39e1db960da3bcb3744e7810bf1c
  Stored in directory: /root/.cache/pip/wheels/1a/bf/a1/4cee4f7678c68c5875ca89eaccf460593539805c3906722228
Successfully built docopt
Installing collected packages: docopt, num2words
Successfully installed docopt-0.6.2 num2words-0.5.14
Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.1

In [2]:
import re
import emoji
import torch
import pandas as pd
import matplotlib.pyplot as plt

from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import classification_report, f1_score, confusion_matrix, ConfusionMatrixDisplay
from num2words import num2words

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [3]:
import re
import emoji

def normalize_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()

def remove_urls_handles_numbers(text):
    return re.sub(
        r'@\w+|https?://\S+|www\.\S+|\S+@\S+|\d+',
        ' ',
        text
    )

def remove_punctuation_ur(text):
    punct = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~“”‘’،؛؟…«»"""
    return re.sub(f"[{re.escape(punct)}]", " ", text)

def clean_text_ur(text):
    text = str(text)
    text = emoji.replace_emoji(text, replace="")
    text = remove_urls_handles_numbers(text)
    text = remove_punctuation_ur(text)

    # keep only Arabic–Urdu unicode block
    text = re.sub(r"[^\u0600-\u06FF\s]", " ", text)

    text = normalize_whitespace(text)
    return text


In [4]:
import pandas as pd
import torch

df = pd.read_csv("urdu_train.csv" , encoding="utf-8-sig")


df["text"] = df["text"].astype(str).apply(clean_text_ur)
labels = torch.tensor(df["label"].values, dtype=torch.long)


In [5]:
print(df[df['label'] == 0]['text'].head(5))
print(df[df['label'] == 1]['text'].head(5))


0    اس ٹرین پر آپ دس روپے ٹکٹ میں باآسانی عام ساما...
1    ان سیاحتی مقامات میں ہزاروں ہوٹل اور رہائش گاہ...
2    جس دن انھوں نے آخری بار یونیفارم پہنا اس دن وہ...
3    اس نے ٹی وی پر ایک دو پروگرامز میں کہا تھا کہ ...
4    رات پھر پیراسیٹامول کھائی اور سو گیا۔ صبح اٹھا...
Name: text, dtype: object
5955    منی بجٹ کے حوالے سے آنے والی خبروں کے مطابق حک...
5956    تاہم یہ تجویز بھی اپنی جگہ ایک چیلنج ہے کیونکہ...
5957    نعیم کی دکان میں اس دوران ایک گاہک آتا ہے ہاتھ...
5958    امریکہ پہنچنے کے بعد ایمل کانسی کے خلاف باقاعد...
5959    چاہ بہار بندرگاہ اس نئی حکمتِ عملی کی علامت بن...
Name: text, dtype: object


In [6]:
print(df['label'].unique())  # should be only 0 and 1


[0 1]


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [8]:
encodings = tokenizer(
    df["text"].tolist(),
    padding=True,
    truncation=True,
    max_length=128
)


In [9]:
from torch.utils.data import Dataset, DataLoader

class UrduDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

dataset = UrduDataset(encodings, labels)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)


In [10]:
import torch.nn as nn
from transformers import AutoModel

class BertClassifier(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, 0, :]
        return self.fc(self.dropout(cls))


In [11]:
import os
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertClassifier("bert-base-multilingual-cased").to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [13]:
for epoch in range(6):
    model.train()
    total_loss, correct, total = 0, 0, 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        y = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, y)

        loss.backward()
        optimizer.step()

        preds = logits.argmax(dim=1)
        correct += (preds == y).sum().item()
        total += y.size(0)
        total_loss += loss.item()

    print(f"Epoch {epoch+1} | Loss {total_loss:.3f} | Acc {correct/total:.3f}")


Epoch 1 | Loss 96.112 | Acc 0.885
Epoch 2 | Loss 38.968 | Acc 0.961
Epoch 3 | Loss 18.940 | Acc 0.980
Epoch 4 | Loss 12.930 | Acc 0.988
Epoch 5 | Loss 8.433 | Acc 0.992
Epoch 6 | Loss 10.010 | Acc 0.991


In [15]:
from google.colab import drive
import os

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Define the save path
output_dir = '/content/drive/My Drive/abjadgeneval/urdu'

# Create the directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")

# 3. Save the Model Weights (state_dict)
model_save_path = os.path.join(output_dir, "model_weights.pth")
torch.save(model.state_dict(), model_save_path)
print(f"Model weights saved to: {model_save_path}")

# 4. Save the Tokenizer
# It's crucial to save the tokenizer to ensure you use the exact same vocabulary for inference later
tokenizer.save_pretrained(output_dir)
print(f"Tokenizer saved to: {output_dir}")

Mounted at /content/drive
Model weights saved to: /content/drive/My Drive/abjadgeneval/urdu/model_weights.pth
Tokenizer saved to: /content/drive/My Drive/abjadgeneval/urdu


In [19]:
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for batch in train_loader:
        # Move inputs to GPU
        logits = model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device)
        )

        # 1. Get predictions
        preds = logits.argmax(dim=1)

        # 2. FIX: Move predictions to CPU before converting to numpy
        y_pred.extend(preds.cpu().numpy())

        # 3. Handle labels (ensure they are on CPU too just in case)
        # Note: batch["labels"] is likely already on CPU, but .cpu() is safe to add
        y_true.extend(batch["labels"].cpu().numpy())

print(classification_report(y_true, y_pred, target_names=["human", "machine"]))
print("Macro F1:", f1_score(y_true, y_pred, average="macro"))

              precision    recall  f1-score   support

       human       1.00      0.99      0.99      5955
     machine       0.99      1.00      0.99      5955

    accuracy                           0.99     11910
   macro avg       0.99      0.99      0.99     11910
weighted avg       0.99      0.99      0.99     11910

Macro F1: 0.9934506149236528


In [24]:
# independent code for evaluation

# ==========================================
# 1. SETUP & IMPORTS
# ==========================================
!pip install emoji num2words transformers
import torch
import torch.nn as nn
import pandas as pd
import re
import emoji
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import classification_report, f1_score
from google.colab import drive

# Mount Drive
drive.mount('/content/drive')

# Set Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ==========================================
# 2. DEFINE CLASSES & FUNCTIONS
# ==========================================
class BertClassifier(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, 0, :]
        return self.fc(self.dropout(cls))

class UrduDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self): return len(self.labels)
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

def clean_text_ur(text):
    text = str(text)
    text = emoji.replace_emoji(text, replace="")
    text = re.sub(r'@\w+|https?://\S+|www\.\S+|\S+@\S+|\d+', ' ', text)
    text = re.sub(r"[^\u0600-\u06FF\s]", " ", text) # Keep Arabic/Urdu only
    return re.sub(r'\s+', ' ', text).strip()

# ==========================================
# 3. LOAD MODEL & TOKENIZER FROM DRIVE
# ==========================================
model_path = '/content/drive/My Drive/abjadgeneval/urdu'

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path)

print("Loading model...")
model = BertClassifier("bert-base-multilingual-cased")
model.load_state_dict(torch.load(f"{model_path}/model_weights.pth", map_location=device))
model.to(device)
print("✅ Model loaded successfully from Drive.")

# ==========================================
# 4. PREPARE DATA
# ==========================================
# NOTE: If you need to predict on a Test Set, change "urdu_train.csv" to "urdu_test.csv"
target_file = "urdu_train.csv"

try:
    print(f"Reading {target_file}...")
    df = pd.read_csv(target_file, encoding="utf-8-sig")
    df["text"] = df["text"].astype(str).apply(clean_text_ur)

    encodings = tokenizer(
        df["text"].tolist(),
        padding=True, truncation=True, max_length=128
    )

    # If using a blind test file that has no 'label' column, create dummy labels
    if "label" in df.columns:
        labels = torch.tensor(df["label"].values, dtype=torch.long)
    else:
        labels = torch.zeros(len(df), dtype=torch.long) # Dummy labels

    dataset = UrduDataset(encodings, labels)
    # Shuffle MUST be False to keep order for submission
    train_loader = DataLoader(dataset, batch_size=32, shuffle=False)
    print("✅ Data loaded successfully.")

except FileNotFoundError:
    print(f"❌ ERROR: '{target_file}' not found. Please upload it.")

# ==========================================
# 5. RUN EVALUATION & PREDICTION
# ==========================================
if 'train_loader' in locals():
    model.eval()
    y_true, y_pred = [], []

    print("Starting inference...")
    with torch.no_grad():
        for batch in train_loader:
            logits = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device)
            )
            preds = logits.argmax(dim=1)

            # Move to CPU before converting to numpy
            y_pred.extend(preds.cpu().numpy())
            y_true.extend(batch["labels"].cpu().numpy())

    # Only print metrics if we actually had real labels (not dummy ones)
    if "label" in df.columns:
        print("\n" + classification_report(y_true, y_pred, target_names=["human", "machine"]))
        print("Macro F1:", f1_score(y_true, y_pred, average="macro"))

    # ==========================================
    # 6. GENERATE SUBMISSION FILE
    # ==========================================
    print("\nGenering submission files...")

    # Map 0/1 back to human/machine
    # Ensure this matches your training: 0=human, 1=machine
    label_map = {0: 0, 1: 1}
    str_predictions = [label_map[p] for p in y_pred]

    # Create DataFrame
    submission = pd.DataFrame()
    submission['label'] = str_predictions

    # Save to CSV
    submission.to_csv("predictions.csv", index=False)
    print("✅ Saved 'predictions.csv'")

    # Compress to ZIP
    !zip predictions.zip predictions.csv
    print("✅ Created 'predictions.zip' ready for upload.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Loading tokenizer...
Loading model...
✅ Model loaded successfully from Drive.
Reading urdu_train.csv...
✅ Data loaded successfully.
Starting inference...

              precision    recall  f1-score   support

       human       1.00      0.99      0.99      5955
     machine       0.99      1.00      0.99      5955

    accuracy                           0.99     11910
   macro avg       0.99      0.99      0.99     11910
weighted avg       0.99      0.99      0.99     11910

Macro F1: 0.9925269073530462

Genering submission files...
✅ Saved 'predictions.csv'
updating: predictions.csv (deflated 99%)
✅ Created 'predictions.zip' ready for upload.
