In [1]:
import datasets
from torch.utils.data import Dataset,DataLoader
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
train_data = datasets.load_dataset("social_bias_frames" , split = "train[:12000]")
val_data = datasets.load_dataset("social_bias_frames" , split= "validation[:7000]")
test_data = datasets.load_dataset("social_bias_frames" , split = "test[:1700]")

Downloading data: 100%|██████████| 974k/974k [00:00<00:00, 6.91MB/s]
Downloading data: 100%|██████████| 941k/941k [00:00<00:00, 6.56MB/s]
Downloading data: 100%|██████████| 6.61M/6.61M [00:00<00:00, 24.7MB/s]


Generating test split:   0%|          | 0/17501 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16738 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/112900 [00:00<?, ? examples/s]

In [3]:
# train_df = train_data.to_pandas()

# # Drop rows with empty strings in the 'offensiveYN' column
# train_df = train_df[train_df['offensiveYN'] != '']

# train_df.loc[train_df['offensiveYN'] == '0.5', 'offensiveYN'] = '1.0'

# # Convert back to datasets.Dataset
# train_data = datasets.Dataset.from_pandas(train_df)

In [4]:
val_df = val_data.to_pandas()

# Drop rows with empty strings in the 'offensiveYN' column
val_df = val_df[val_df['offensiveYN'] != '']

val_df.loc[val_df['offensiveYN'] == '0.5', 'offensiveYN'] = '1.0'

# Convert back to datasets.Dataset
val_data = datasets.Dataset.from_pandas(val_df)

In [5]:
train_data[0]

{'whoTarget': '0.0',
 'intentYN': '0.66',
 'sexYN': '0.0',
 'sexReason': '',
 'offensiveYN': '1.0',
 'annotatorGender': 'woman',
 'annotatorMinority': '',
 'sexPhrase': '',
 'speakerMinorityYN': '',
 'WorkerId': '-8935932304856669427',
 'HITId': '363A7XIFV4G2799C5V96YERJA9AVAM',
 'annotatorPolitics': 'liberal',
 'annotatorRace': 'white',
 'annotatorAge': '45.0',
 'post': 'RT @_LexC__: I\'m convinced that some of y\'all bitches get pregnant purposely because "birth control &amp; plan b pills" are effective &#128533;&#128056;&#9749;&#65039;',
 'targetMinority': '',
 'targetCategory': '',
 'targetStereotype': '',
 'dataSource': 't/davidson'}

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [7]:
class HateDataset(Dataset):
    # def __init__(self, dataset, tokenizer, model):
    #     self.label = dataset['offensiveYN']
    #     self.post = dataset['post']
    #     self.ann_gender = dataset['annotatorGender']
    #     self.ann_minority = dataset['annotatorMinority']
    #     self.ann_politics = dataset['annotatorPolitics']
    #     self.ann_age = dataset['annotatorAge']
        
    #     self.tokenizer = tokenizer
    #     self.model = model

    #     self.gender_encoder = LabelEncoder()
    #     self.ann_gender_encoded = self.gender_encoder.fit_transform(self.ann_gender)
        
    #     self.labels_encoder = LabelEncoder()
    #     self.labels_encoded =  self.labels_encoder.fit_transform(self.label)

    def __init__(self, dataset, tokenizer, model):
        dataset = dataset.to_pandas()
        dataset = dataset[dataset['offensiveYN'] != '']
        dataset.loc[dataset['offensiveYN'] == '0.5', 'offensiveYN'] = '1.0'
        dataset = dataset.groupby(['post','offensiveYN']).size().reset_index(name='counts')
        dataset = dataset.sort_values('counts', ascending=False).drop_duplicates('post')

        dataset = datasets.Dataset.from_pandas(dataset)
        
        label_encoder = LabelEncoder()
        
        self.label = label_encoder.fit_transform(dataset['offensiveYN'])
        
        self.post = dataset['post']
        
        self.tokenizer = tokenizer
        self.model = model

    def __len__(self):
        return len(self.label)
        
    def __getitem__(self, idx):
        # Tokenize the text
        tokenized_post = self.tokenizer(self.post[idx], return_tensors='pt',max_length=50, padding='max_length', truncation=True)
        
        # Forward pass through the model
        with torch.no_grad():
            model_output = self.model(**tokenized_post)

        # Extract the embeddings from the model output
        last_hidden_state = model_output.last_hidden_state

        # Return label and input features (including encoded gender)
        return self.label[idx], last_hidden_state


In [8]:
from torch.nn.utils.rnn import pad_sequence

def custom_collate_fn(batch):
    # Separate inputs and triggers
    targets, features = zip(*batch)
    # print(features)
    # Pad inputs to the length of the longest sequence in the batch
    padded_input_ids = pad_sequence(features[0][0], padding_value=0)

    return padded_input_ids,torch.tensor(features[0][0])

In [9]:
training_data = HateDataset(train_data,tokenizer,bert_model)
train_dataloader = DataLoader(training_data , batch_size=1 , shuffle=True)

val_data = HateDataset(val_data,tokenizer,bert_model)
val_dataloader = DataLoader(val_data , batch_size=1 , shuffle=True)

In [10]:
class BERT_FC(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FullyConnectedClassifier, self).__init__()
        self.fc1 = nn.Linear(50 * input_size, hidden_size)  # Input layer
        self.fc2 = nn.Linear(hidden_size, hidden_size)  # Hidden layer (you can add more layers if needed)
        self.fc3 = nn.Linear(hidden_size, output_size)  # Output layer

    def forward(self, x):
        # Forward pass through the network
        # print(x.shape)
        x = x.view(-1)
        x = nn.functional.relu(self.fc1(x))  # Apply ReLU activation to the first layer
        x = nn.functional.relu(self.fc2(x))  # Apply ReLU activation to the second layer
        x = self.fc3(x)  # Final layer without activation function (useful for multi-class classification)
        return x

In [11]:
model=BERT_FC(768,128,2)
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
model=model.to(device)

In [12]:
from sklearn.metrics import accuracy_score, f1_score

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    all_predictions = []
    all_labels = []

    for labels, inputs in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # print(labels)
        # print(inputs)
        inputs = torch.tensor(inputs)
        labels = torch.tensor([labels[0]])
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs.squeeze())
        # print(labels)
        one_hot_targets = torch.zeros(1, 2).to(device)
        one_hot_targets.scatter_(1, labels.unsqueeze(1), 1)
        # print(outputs)
        # print(one_hot_targets)
        loss = criterion(outputs, one_hot_targets.squeeze())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Append predictions and labels for computing accuracy and F1-score
        # print(outputs)
        predicted = torch.argmax(outputs.unsqueeze(0), dim=1)
        # print(predicted)
        all_predictions.extend(predicted.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

    # Compute metrics after each epoch
    torch.save(model.state_dict(),f'Model_{epoch+1}')
    epoch_loss = running_loss / len(train_dataloader)
    epoch_accuracy = accuracy_score(all_labels, all_predictions)
    epoch_f1 = f1_score(all_labels, all_predictions, average='macro')

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}, F1: {epoch_f1:.4f}")
    
    model.train()
    running_loss = 0.0
    all_predictions = []
    all_labels = []

    for labels, inputs in tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # print(labels)
        # print(inputs)
        inputs = torch.tensor(inputs[0])
        labels = torch.tensor([labels[0]])
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs.squeeze())
        # print(labels)
        one_hot_targets = torch.zeros(1, 2).to(device)
        one_hot_targets.scatter_(1, labels.unsqueeze(1), 1)
        # print(outputs)
        # print(one_hot_targets)
        loss = criterion(outputs, one_hot_targets.squeeze())
    #     loss.backward()
    #     optimizer.step()

        running_loss += loss.item()

        # Append predictions and labels for computing accuracy and F1-score
        # print(outputs)
        predicted = torch.argmax(outputs.unsqueeze(0), dim=1)
        # print(predicted)
        all_predictions.extend(predicted.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

    # Compute metrics after each epoch
    torch.save(model.state_dict(),f'Model_{epoch+1}.pt')
    epoch_loss = running_loss / len(train_dataloader)
    epoch_accuracy = accuracy_score(all_labels, all_predictions)
    epoch_f1 = f1_score(all_labels, all_predictions, average='macro')

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}, F1: {epoch_f1:.4f}")


  inputs = torch.tensor(inputs)
Epoch 1/10: 100%|██████████| 4083/4083 [06:27<00:00, 10.55it/s]


Epoch 1/10, Loss: 0.5846, Accuracy: 0.7183, F1: 0.6174


  inputs = torch.tensor(inputs[0])
Epoch 1/10: 100%|██████████| 2037/2037 [03:05<00:00, 10.98it/s]


Epoch 1/10, Loss: 0.3363, Accuracy: 0.5989, F1: 0.5352


  inputs = torch.tensor(inputs)
Epoch 2/10: 100%|██████████| 4083/4083 [06:19<00:00, 10.77it/s]


Epoch 2/10, Loss: 0.5163, Accuracy: 0.7690, F1: 0.7050


  inputs = torch.tensor(inputs[0])
Epoch 2/10:   3%|▎         | 69/2037 [00:06<03:01, 10.87it/s]


KeyboardInterrupt: 