<a href="https://colab.research.google.com/github/BootCamp-BMA/colabs/blob/main/extract_embedding_dziriBert_classify_byRandomForest_SVM_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import torch
from transformers import BertTokenizer, BertModel
from google.colab import drive
import os

# ===========================
# 1️⃣ MOUNT GOOGLE DRIVE
# ===========================
# Expose the Drive contents under /content/drive (Colab runtime only).
drive.mount('/content/drive')

# Drive folder that holds the three CSV splits, and the path of each split.
base_path = "/content/drive/My Drive/fake-news/"
train_path, test_path, val_path = (
    os.path.join(base_path, csv_name)
    for csv_name in ("train_set.csv", "test_set.csv", "validation_set.csv")
)

# ===========================
# 2️⃣ LOAD DATASETS
# ===========================
# One DataFrame per split, read in train / test / validation order.
train_df, test_df, val_df = (
    pd.read_csv(split_path) for split_path in (train_path, test_path, val_path)
)

# ===========================
# 3️⃣ CLEAN ARABIC TEXT
# ===========================
def clean_text(text):
    """Return ``text`` without punctuation or digits, trimmed at both ends.

    Every character that is neither a word character nor whitespace is
    dropped first, then every run of digits; finally leading and trailing
    whitespace is stripped. Whitespace inside the string is left untouched.
    """
    without_punct = re.sub(r"[^\w\s]", "", text)
    without_digits = re.sub(r"\d+", "", without_punct)
    return without_digits.strip()

# Apply cleaning.
# Missing texts are replaced with "" before the string cast: `.astype(str)` on
# its own turns NaN into the literal string "nan", which would then be
# tokenized and embedded as if it were a real word.
train_df["text"] = train_df["text"].fillna("").astype(str).apply(clean_text)
test_df["text"] = test_df["text"].fillna("").astype(str).apply(clean_text)
val_df["text"] = val_df["text"].fillna("").astype(str).apply(clean_text)

# ===========================
# 4️⃣ LOAD DZIRIBERT TOKENIZER & MODEL
# ===========================
model_name = "alger-ia/dziribert"

# Use CUDA when the runtime offers it, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and encoder come from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Place the encoder on the chosen device and switch it to evaluation mode.
model.to(device)
model.eval()

# ===========================
# 5️⃣ EXTRACT BERT EMBEDDINGS
# ===========================
import numpy as np

# Function to process embeddings in small batches
def extract_embeddings(texts, batch_size=16):
    """Embed texts with the notebook's encoder, one [CLS] vector per text.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. Texts are
    tokenized batch by batch (padded to the longest item of the batch and
    truncated to 512 tokens), run through the model without gradient
    tracking, and the hidden state at position 0 of each sequence is kept.

    Args:
        texts: Iterable of strings (a list, tuple, pandas Series, ...).
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per input text, in input order.
        Empty input yields an array with zero rows instead of raising.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Materialize once so slicing always produces plain lists for the tokenizer,
    # whatever kind of iterable the caller passed in.
    texts = list(texts)

    if not texts:
        # np.vstack([]) would raise; return an empty matrix instead. The width
        # comes from the model config when it is available, else 0.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    num_batches = (len(texts) + batch_size - 1) // batch_size  # Ceiling division
    all_embeddings = []

    for i in range(num_batches):
        batch_texts = texts[i * batch_size:(i + 1) * batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
        # Tensors must live on the same device as the model.
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = model(**inputs)

        # Keep the vector at sequence position 0 (the CLS token) and move it
        # back to host memory so device memory is not held across batches.
        embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(embeddings)

        print(f"Processed batch {i+1}/{num_batches}")  # Track progress

    return np.vstack(all_embeddings)  # One row per text, batches in order

# Run extraction with batch processing.
# Each split is embedded exactly once: a second pass over the same texts would
# produce the same arrays while doubling the most expensive step of the cell.
train_embeddings = extract_embeddings(train_df["text"].tolist(), batch_size=16)
test_embeddings = extract_embeddings(test_df["text"].tolist(), batch_size=16)
val_embeddings = extract_embeddings(val_df["text"].tolist(), batch_size=16)

# ===========================
# 6️⃣ SAVE EMBEDDINGS TO GOOGLE DRIVE
# ===========================
save_path = "/content/drive/My Drive/fake-news-embeddings/"

# Create the output directory if it does not exist yet
os.makedirs(save_path, exist_ok=True)

# One CSV per split; the row index is not written to the file.
pd.DataFrame(train_embeddings).to_csv(os.path.join(save_path, "train_embeddings.csv"), index=False)
pd.DataFrame(test_embeddings).to_csv(os.path.join(save_path, "test_embeddings.csv"), index=False)
pd.DataFrame(val_embeddings).to_csv(os.path.join(save_path, "val_embeddings.csv"), index=False)

print("✅ Embeddings saved successfully in:", save_path)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of BertModel were not initialized from the model checkpoint at alger-ia/dziribert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed batch 1/505
Processed batch 2/505
Processed batch 3/505
Processed batch 4/505
Processed batch 5/505
Processed batch 6/505
Processed batch 7/505
Processed batch 8/505
Processed batch 9/505
Processed batch 10/505
Processed batch 11/505
Processed batch 12/505
Processed batch 13/505
Processed batch 14/505
Processed batch 15/505
Processed batch 16/505
Processed batch 17/505
Processed batch 18/505
Processed batch 19/505
Processed batch 20/505
Processed batch 21/505
Processed batch 22/505
Processed batch 23/505
Processed batch 24/505
Processed batch 25/505
Processed batch 26/505
Processed batch 27/505
Processed batch 28/505
Processed batch 29/505
Processed batch 30/505
Processed batch 31/505
Processed batch 32/505
Processed batch 33/505
Processed batch 34/505
Processed batch 35/505
Processed batch 36/505
Processed batch 37/505
Processed batch 38/505
Processed batch 39/505
Processed batch 40/505
Processed batch 41/505
Processed batch 42/505
Processed batch 43/505
Processed batch 44/5

In [None]:
import os
import pandas as pd

# Folder the embedding CSVs were written to
save_path = "/content/drive/My Drive/fake-news-embeddings/"

# Feature matrices: one array per split, read back from the saved CSV files
X_train, X_test, X_val = (
    pd.read_csv(os.path.join(save_path, embedding_file)).values
    for embedding_file in ("train_embeddings.csv", "test_embeddings.csv", "val_embeddings.csv")
)

# The labels live in the original dataset files, so those are re-read here
base_path = "/content/drive/My Drive/fake-news/"
train_df, test_df, val_df = (
    pd.read_csv(os.path.join(base_path, dataset_file))
    for dataset_file in ("train_set.csv", "test_set.csv", "validation_set.csv")
)

# Target vectors taken from the "label" column of each split
y_train, y_test, y_val = (
    split_df["label"].values for split_df in (train_df, test_df, val_df)
)

print("✅ Embeddings & Labels Loaded Successfully!")


✅ Embeddings & Labels Loaded Successfully!


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a 100-tree random forest on the training embeddings (fixed seed)
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the test split and report accuracy plus per-class metrics
y_pred = clf.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))


✅ Accuracy: 0.6372646184340932
              precision    recall  f1-score   support

           0       0.63      0.76      0.69       540
           1       0.64      0.49      0.56       469

    accuracy                           0.64      1009
   macro avg       0.64      0.63      0.63      1009
weighted avg       0.64      0.64      0.63      1009



In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel SVM (C=1.0) on the training embeddings
svm_clf = SVC(kernel="linear", C=1.0, random_state=42).fit(X_train, y_train)

# Predict the test split and report accuracy plus per-class metrics
y_pred_svm = svm_clf.predict(X_test)
print("✅ SVM Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_svm))
print(classification_report(y_true=y_test, y_pred=y_pred_svm))


✅ SVM Accuracy: 0.624380574826561
              precision    recall  f1-score   support

           0       0.65      0.65      0.65       540
           1       0.60      0.59      0.60       469

    accuracy                           0.62      1009
   macro avg       0.62      0.62      0.62      1009
weighted avg       0.62      0.62      0.62      1009



In [None]:
# Score the validation split with `clf` and report the same metrics as for
# the test split: overall accuracy followed by the per-class report.
y_val_pred = clf.predict(X_val)
print("✅ Validation Accuracy:", accuracy_score(y_true=y_val, y_pred=y_val_pred))
print(classification_report(y_true=y_val, y_pred=y_val_pred))


✅ Validation Accuracy: 0.6313181367690783
              precision    recall  f1-score   support

           0       0.63      0.75      0.69       539
           1       0.63      0.49      0.55       470

    accuracy                           0.63      1009
   macro avg       0.63      0.62      0.62      1009
weighted avg       0.63      0.63      0.62      1009

