# Run 2

In [None]:
!pip install transformers



In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm  # Import tqdm for progress tracking

# Load the dataset from your .xlsx file
input_file = "/content/final_en.xlsx"
df = pd.read_excel(input_file)

# Only the first N_SAMPLES rows are embedded to keep the run time manageable.
# NOTE(review): any cell that later pairs labels with the saved embeddings
# must work on the same leading rows of the spreadsheet.
N_SAMPLES = 2000
df = df.head(N_SAMPLES)

# Specify the column name containing the text
column_to_process = "text"
# The tokenizer only accepts strings: turn empty cells (NaN) into "" and any
# non-string cell into its text form so the long embedding loop cannot crash
# part-way through on a single bad row. The row count is unchanged.
text_data = df[column_to_process].fillna("").astype(str)

# Load the pretrained BERT (cased) tokenizer and encoder that are used to
# embed the texts.
tokenizer_fake = AutoTokenizer.from_pretrained("bert-base-cased")
model_fake = AutoModel.from_pretrained("bert-base-cased")


# Define a function to extract embeddings
def get_sentence_embeddings(text, tokenizer=None, model=None):
    """Return mean-pooled sentence embeddings for ``text``.

    Args:
        text: A single string or a list of strings to embed.
        tokenizer: Callable tokenizer returning at least ``attention_mask``
            alongside the model inputs. Defaults to the notebook-level
            ``tokenizer_fake``.
        model: Encoder whose output exposes ``last_hidden_state``. Defaults
            to the notebook-level ``model_fake``.

    Returns:
        A tensor with one row per input text, obtained by averaging the
        last hidden states over the real (non-padding) tokens only.
    """
    tokenizer = tokenizer_fake if tokenizer is None else tokenizer
    model = model_fake if model is None else model

    # Tokenize the text
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Forward pass through the model (inference only, no gradients needed)
    with torch.no_grad():
        output = model(**tokens)

    hidden_states = output.last_hidden_state
    # Padding positions must not contribute to the average, otherwise the
    # embedding of a text would depend on what it was batched with.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    token_sum = (hidden_states * mask).sum(dim=1)
    # clamp guards against division by zero for an all-padding row
    token_count = mask.sum(dim=1).clamp(min=1)
    return token_sum / token_count

# Calculate embeddings for the text data, one text at a time
embeddings_list = []
# Use tqdm to track progress
for text in tqdm(text_data, desc="Calculating embeddings", unit=" texts"):
    embeddings = get_sentence_embeddings(text)
    embeddings_list.append(embeddings)

# torch.cat on an empty list fails with an unhelpful message; say what is wrong.
if not embeddings_list:
    raise ValueError("No texts to embed: the selected text column is empty.")

# Convert the list of per-text embeddings to a single tensor (one row per text)
all_embeddings = torch.cat(embeddings_list)

# The labels are matched to these rows by position later on, so make sure
# nothing was dropped or duplicated before the file is written.
assert all_embeddings.shape[0] == len(text_data), (
    f"expected {len(text_data)} embedding rows, got {all_embeddings.shape[0]}"
)

# Save the embeddings as a PyTorch tensor
torch.save(all_embeddings, "fake_embeddings_bert.pt")

print("Embeddings saved as fake_embeddings_bert.pt")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Calculating embeddings:  21%|██        | 416/2000 [09:07<34:44,  1.32s/ texts]


KeyboardInterrupt: 

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

# Load the dataset from your .xlsx file
input_file = "/content/final_en.xlsx"  # Replace with your file path
df = pd.read_excel(input_file)

# Load the sentence embeddings from "fake_embeddings_bert.pt"
all_embeddings = torch.load("fake_embeddings_bert.pt")

# Use at most MAX_ROWS rows, but never more rows than there are embeddings
# (or spreadsheet rows), so that labels and embeddings stay aligned.
# Assumes embedding row i was computed from spreadsheet row i - TODO confirm.
MAX_ROWS = 5000
n_rows = min(MAX_ROWS, len(df), len(all_embeddings))
df = df.head(n_rows)
all_embeddings = all_embeddings[:n_rows]

# Prepare the labels.
# NOTE(review): 'lebel' is used exactly as written here; presumably it is the
# column name as spelled in the spreadsheet and already holds numeric 0/1
# labels - confirm against the file.
labels_fake = torch.tensor(df['lebel'].values)

assert len(labels_fake) == len(all_embeddings), "labels and embeddings are misaligned"


In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from sklearn.model_selection import train_test_split
import numpy as np

# Load the dataset from your .xlsx file
input_file = "/content/final_en.xlsx"  # Replace with your file path
df = pd.read_excel(input_file)

# Load the sentence embeddings from "fake_embeddings_bert.pt"
all_embeddings = torch.load("fake_embeddings_bert.pt")

# Use at most MAX_ROWS rows, but never more rows than there are embeddings
# (or spreadsheet rows); train_test_split rejects inputs of different length.
# Assumes embedding row i was computed from spreadsheet row i - TODO confirm.
MAX_ROWS = 5000
n_rows = min(MAX_ROWS, len(df), len(all_embeddings))
df = df.head(n_rows)
all_embeddings = all_embeddings[:n_rows]

# Prepare the labels for the classifier.
# NOTE(review): 'lebel' is used exactly as written here; presumably it is the
# column name as spelled in the spreadsheet and already holds numeric 0/1
# labels - confirm against the file.
labels_hate = torch.tensor(df['lebel'].values)

assert len(labels_hate) == len(all_embeddings), "labels and embeddings are misaligned"

# Split the data into train and test sets (80/20, fixed seed for repeatability)
train_embeddings, test_embeddings, train_labels_hate, test_labels_hate = train_test_split(all_embeddings, labels_hate, test_size=0.2, random_state=42)

# Define a custom dataset pairing each embedding with its label
class HateDetectionDataset(Dataset):
    """Map-style dataset yielding ``(embedding, label)`` pairs by position.

    Args:
        embeddings: Indexable collection of embeddings (e.g. a 2-D tensor
            with one row per sample).
        labels_hate: Indexable collection of labels, one per embedding.

    Raises:
        ValueError: If ``embeddings`` and ``labels_hate`` differ in length.
    """

    def __init__(self, embeddings, labels_hate):
        # Without this check a length mismatch goes unnoticed: __len__ only
        # looks at the embeddings, so surplus labels would be ignored and
        # missing labels would surface as an IndexError deep inside a loader.
        if len(embeddings) != len(labels_hate):
            raise ValueError(
                f"embeddings ({len(embeddings)}) and labels ({len(labels_hate)}) "
                "must have the same length"
            )
        self.embeddings = embeddings
        self.labels_hate = labels_hate

    def __len__(self):
        """Return the number of samples."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` pair stored at position ``idx``."""
        return self.embeddings[idx], self.labels_hate[idx]

# Wrap both splits in datasets, then expose them through mini-batch loaders
batch_size = 32  # Set your batch size

train_dataset = HateDetectionDataset(
    embeddings=train_embeddings,
    labels_hate=train_labels_hate,
)
test_dataset = HateDetectionDataset(
    embeddings=test_embeddings,
    labels_hate=test_labels_hate,
)

# Training batches are reshuffled every epoch; test batches keep their order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Define the classification head that runs on top of the sentence embeddings
class HateDetectionModel(nn.Module):
    """Two-layer feed-forward binary classifier over fixed-size embeddings.

    Args:
        input_dim: Size of each input embedding vector.
        hidden_dim: Width of the hidden layer. Defaults to 128.
    """

    def __init__(self, input_dim, hidden_dim=128):
        super().__init__()
        # Attribute name and layer order are kept as-is so that previously
        # saved state_dicts (keys "hate_classifier.0.*" / "hate_classifier.2.*")
        # still load.
        self.hate_classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, embeddings):
        """Return one raw logit per input row (no sigmoid applied)."""
        hate_logits = self.hate_classifier(embeddings)
        return hate_logits

# Initialize and train the classifier

# Seed PyTorch so that weight initialisation and the shuffling order of the
# training loader are repeatable across "Restart & Run All" (42 matches the
# random_state used for the train/test split).
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HateDetectionModel(all_embeddings.shape[1])
model.to(device)

# The model outputs raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 9  # Set the number of training epochs
for epoch in range(num_epochs):
    model.train()
    # Batch-local names are used on purpose: unpacking into `labels_hate`
    # would overwrite the full label tensor held in the notebook namespace.
    for batch_embeddings, batch_labels in train_loader:
        batch_embeddings = batch_embeddings.to(device)
        batch_labels = batch_labels.to(device)

        hate_logits = model(batch_embeddings)

        # Logits come out as (batch, 1); flatten to match the 1-D labels.
        loss_hate = criterion(hate_logits.view(-1), batch_labels.float())

        optimizer.zero_grad()
        loss_hate.backward()
        optimizer.step()

# Save the model if needed. Despite the ".h5" extension, the file holds a
# PyTorch state_dict written by torch.save.
model_path = "fake_detection_model_bertc.h5"
torch.save(model.state_dict(), model_path)
# Report the path that was actually written.
print(f"Trained model saved to {model_path}")


Trained model saved to hate_detection_model.h5


In [None]:
# New cell for classification report
from sklearn.metrics import classification_report

model.eval()
with torch.no_grad():
    # Collect the predicted probability of the positive class for every
    # test sample, in loader order (the test loader does not shuffle).
    hate_preds = []
    for batch_embeddings, _ in test_loader:
        # Only the inputs are needed on the device; labels are not used here.
        hate_logits = model(batch_embeddings.to(device))
        # Flatten (batch, 1) -> (batch,) so predictions form a plain 1-D
        # sequence, the shape sklearn's metrics expect.
        hate_preds.extend(torch.sigmoid(hate_logits).view(-1).cpu().tolist())

# Generate classification report
class_names = ["not-fake", "fake"]

classification_rep = classification_report(
    np.array(test_labels_hate),
    # A probability of at least 0.5 counts as the positive ("fake") class.
    (np.array(hate_preds) >= 0.5).astype(int),
    target_names=class_names,
)
print("Classification Report for Fake Detection Bert (Cased) :")
print(classification_rep)


Classification Report for Fake Detection Bert (Cased) :
              precision    recall  f1-score   support

    not-fake       0.99      0.98      0.98       524
        fake       0.98      0.99      0.98       476

    accuracy                           0.98      1000
   macro avg       0.98      0.98      0.98      1000
weighted avg       0.98      0.98      0.98      1000

