In [1]:
pip install openai



In [2]:
pip install nltk




In [3]:
pip install --upgrade openai

Collecting openai
  Downloading openai-1.54.3-py3-none-any.whl.metadata (24 kB)
Downloading openai-1.54.3-py3-none-any.whl (389 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.6/389.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.52.2
    Uninstalling openai-1.52.2:
      Successfully uninstalled openai-1.52.2
Successfully installed openai-1.54.3


In [4]:
pip install sentence-transformers



In [5]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, RepeatVector, TimeDistributed, LSTM
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from collections import Counter
from sklearn.metrics import accuracy_score

In [8]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
data = pd.read_csv('embedded_gptneo.csv')
data['embedding'] = data['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=','))
X = np.array(data['embedding'].tolist())

# Define columns and class counts
target_columns = {
    'provokingviolence': 4,
    'individualharrassment': 4,
    'emotionaldistress': 3
}

# One-hot encode each target column and prepare the dataset
encoded_targets = {}
for col, num_classes in target_columns.items():
    encoder = OneHotEncoder(sparse_output=False)
    encoded_targets[col] = encoder.fit_transform(data[col].values.reshape(-1, 1))

# Concatenate one-hot encoded targets into a single array for easy indexing
y = np.hstack([encoded_targets[col] for col in target_columns])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape X to add a time dimension for LSTM (seq_len=1 for static embeddings)
X_train = X_train[:, np.newaxis, :]
X_val = X_val[:, np.newaxis, :]

# Create a custom Dataset class
class MultiOutputDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input': torch.tensor(self.features[idx], dtype=torch.float32),
            'label': torch.tensor(self.labels[idx], dtype=torch.float32)
        }

# Create DataLoaders
train_dataset = MultiOutputDataset(X_train, y_train)
val_dataset = MultiOutputDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define the LSTM model with multiple output heads
class MultiOutputLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dims, num_layers=1):
        super(MultiOutputLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)

        # Separate output layers for each target
        self.output_heads = nn.ModuleDict({
            target: nn.Linear(hidden_dim, output_dim) for target, output_dim in output_dims.items()
        })
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        x = self.dropout(hn[-1])

        # Return output from each output head
        outputs = {target: head(x) for target, head in self.output_heads.items()}
        return outputs

# Instantiate the model with separate output layers for each target
input_dim = X.shape[1]  # Number of features per timestep
hidden_dim = 64         # Number of features in LSTM hidden state
output_dims = {target: num_classes for target, num_classes in target_columns.items()}
model = MultiOutputLSTM(input_dim, hidden_dim, output_dims)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # Suitable for multi-class with logits
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training function for multi-output model
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_data = batch['input'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_data)
            # Compute loss separately for each output head
            losses = [criterion(outputs[target], labels[:, start:end])
                      for target, (start, end) in zip(target_columns.keys(),
                                                      [(0,4), (4,8), (8,11)])]
            loss = sum(losses)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Train the model
train_model(model, train_loader, criterion, optimizer, epochs=10)

# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    predictions, true_labels = {}, {}

    for target in target_columns.keys():
        predictions[target], true_labels[target] = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_data = batch['input'].to(device)
            labels = batch['label'].cpu().numpy()

            outputs = model(input_data)
            for target, pred in outputs.items():
                predictions[target].append(pred.cpu().numpy())
                start, end = (0,4) if target == 'provokingviolence' else ((4,8) if target == 'individualharrassment' else (8,11))
                true_labels[target].append(labels[:, start:end])

    # Concatenate batches
    predictions = {target: np.vstack(preds) for target, preds in predictions.items()}
    true_labels = {target: np.vstack(labels) for target, labels in true_labels.items()}

    return predictions, true_labels

# Evaluate the model
predictions, true_labels = evaluate_model(model, val_loader)

# Apply sigmoid, threshold, and calculate metrics for each output head
for target, num_classes in target_columns.items():
    y_pred_binary = (torch.sigmoid(torch.tensor(predictions[target])) > 0.5).int().numpy()
    y_true_binary = true_labels[target]

    y_pred_labels = np.argmax(y_pred_binary, axis=1)
    y_true_labels = np.argmax(y_true_binary, axis=1)

    print(f"Classification report for {target}:")
    print(classification_report(y_true_labels, y_pred_labels))

    overall_accuracy = accuracy_score(y_true_labels, y_pred_labels)
    print(f"Overall Accuracy for {target}: {overall_accuracy:.4f}\n")


Epoch 1/10, Loss: 1.2692
Epoch 2/10, Loss: 1.2052
Epoch 3/10, Loss: 1.1952
Epoch 4/10, Loss: 1.1869
Epoch 5/10, Loss: 1.1797
Epoch 6/10, Loss: 1.1771
Epoch 7/10, Loss: 1.1727
Epoch 8/10, Loss: 1.1722
Epoch 9/10, Loss: 1.1678
Epoch 10/10, Loss: 1.1644
Classification report for provokingviolence:
              precision    recall  f1-score   support

           0       0.36      0.36      0.36      1975
           1       0.00      0.00      0.00       966
           2       0.63      0.80      0.71      5855
           3       0.79      0.61      0.69      2191

    accuracy                           0.61     10987
   macro avg       0.45      0.44      0.44     10987
weighted avg       0.56      0.61      0.58     10987

Overall Accuracy for provokingviolence: 0.6098

Classification report for individualharrassment:
              precision    recall  f1-score   support

           0       0.01      0.48      0.02        81
           1       0.57      0.16      0.25      2386
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
data = pd.read_csv('embedded_gptneo.csv')

# Convert the 'embedded_text' column to numpy arrays
data['embedding'] = data['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=','))

X = np.array(data['embedding'].tolist())

# Define target columns and their respective number of classes
target_columns = {
    'provokingviolence': 4,
    'individualharrassment': 4,
    'emotionaldistress': 3
}

# One-hot encode each target column
encoded_targets = {}
encoders = {}
for col, num_classes in target_columns.items():
    encoder = OneHotEncoder(sparse_output=False, categories='auto')
    encoded = encoder.fit_transform(data[col].values.reshape(-1, 1))
    encoded_targets[col] = encoded
    encoders[col] = encoder  # Save encoder for inverse transformations if needed

# Concatenate one-hot encoded targets into a single array
y = np.hstack([encoded_targets[col] for col in target_columns])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape X to add a time dimension for hierarchical LSTM (word-level)
X_train = X_train[:, np.newaxis, :]
X_val = X_val[:, np.newaxis, :]

# Create a custom Dataset class
class MultiOutputDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input': torch.tensor(self.features[idx], dtype=torch.float32),
            'label': torch.tensor(self.labels[idx], dtype=torch.float32)
        }

# Create DataLoaders
train_dataset = MultiOutputDataset(X_train, y_train)
val_dataset = MultiOutputDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define the Hierarchical BiLSTM model with multiple output heads
class HierarchicalBiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dims, word_num_layers=1, sent_num_layers=1):
        super(HierarchicalBiLSTM, self).__init__()

        self.word_bilstm = nn.LSTM(
            input_dim, hidden_dim, word_num_layers,
            batch_first=True, bidirectional=True
        )

        # Sentence-level BiLSTM to capture sentence-level context
        self.sent_bilstm = nn.LSTM(
            hidden_dim * 2, hidden_dim, sent_num_layers,
            batch_first=True, bidirectional=True
        )

        # Define output heads for each target
        self.output_heads = nn.ModuleDict({
            target: nn.Linear(hidden_dim * 2, output_dim)
            for target, output_dim in output_dims.items()
        })

        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        # x: (batch_size, num_words, word_embedding_dim)
        batch_size, num_words, _ = x.size()

        # Word-level BiLSTM
        word_level_outputs, _ = self.word_bilstm(x)  # (batch_size, num_words, hidden_dim * 2)

        # Get the sentence representation by aggregating word-level outputs
        sentence_embedding = word_level_outputs.mean(dim=1)  # (batch_size, hidden_dim * 2)

        # Sentence-level BiLSTM
        sentence_embedding = sentence_embedding.unsqueeze(1)  # Add a pseudo-sequence length of 1
        sentence_output, (hn, cn) = self.sent_bilstm(sentence_embedding)

        # Aggregate forward and backward hidden states
        sentence_output = sentence_output.squeeze(1)

        # Apply dropout
        x = self.dropout(sentence_output)

        # Compute outputs for each target
        outputs = {target: head(x) for target, head in self.output_heads.items()}
        return outputs

# Instantiate the model
input_dim = X.shape[1]  # Number of features per word embedding
hidden_dim = 64         # Number of features in LSTM hidden state
output_dims = {target: num_classes for target, num_classes in target_columns.items()}
model = HierarchicalBiLSTM(input_dim, hidden_dim, output_dims)

# Define loss functions for each output
criteria = {
    target: nn.CrossEntropyLoss()  # Adjust class weights if necessary
    for target in target_columns
}

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training function
def train_model(model, train_loader, criteria, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_data = batch['input'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_data)

            # Compute loss for each target
            loss = 0
            for idx, target in enumerate(target_columns.keys()):
                start = sum(list(target_columns.values())[:idx])
                end = start + target_columns[target]
                target_labels = torch.argmax(labels[:, start:end], dim=1)
                loss += criteria[target](outputs[target], target_labels)

            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

# Train the model
train_model(model, train_loader, criteria, optimizer, epochs=20)

# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    predictions = {target: [] for target in target_columns}
    true_labels = {target: [] for target in target_columns}

    with torch.no_grad():
        for batch in val_loader:
            input_data = batch['input'].to(device)
            labels = batch['label'].cpu().numpy()

            outputs = model(input_data)
            for idx, target in enumerate(target_columns.keys()):
                preds = outputs[target].cpu().numpy()
                predictions[target].append(preds)

                start = sum(list(target_columns.values())[:idx])
                end = start + target_columns[target]
                true_labels[target].append(labels[:, start:end])

    # Concatenate all batches
    for target in target_columns:
        predictions[target] = np.vstack(predictions[target])
        true_labels[target] = np.vstack(true_labels[target])

    return predictions, true_labels

# Evaluate the model
predictions, true_labels = evaluate_model(model, val_loader)

# Function to compute metrics
def compute_metrics(predictions, true_labels, encoders, target_columns):
    for target, num_classes in target_columns.items():
        y_pred = np.argmax(predictions[target], axis=1)
        y_true = np.argmax(true_labels[target], axis=1)

        encoder = encoders[target]
        target_names = [str(cls) for cls in encoder.categories_[0]]

        print(f"Classification Report for '{target}':")
        print(classification_report(y_true, y_pred, target_names=target_names))

        accuracy = accuracy_score(y_true, y_pred)
        print(f"Overall Accuracy for '{target}': {accuracy:.4f}\n")

# Display the metrics
compute_metrics(predictions, true_labels, encoders, target_columns)


Epoch 1/20, Loss: 2.5110
Epoch 2/20, Loss: 2.4145
Epoch 3/20, Loss: 2.3910
Epoch 4/20, Loss: 2.3732
Epoch 5/20, Loss: 2.3599
Epoch 6/20, Loss: 2.3468
Epoch 7/20, Loss: 2.3372
Epoch 8/20, Loss: 2.3259
Epoch 9/20, Loss: 2.3149
Epoch 10/20, Loss: 2.3086
Epoch 11/20, Loss: 2.2979
Epoch 12/20, Loss: 2.2866
Epoch 13/20, Loss: 2.2773
Epoch 14/20, Loss: 2.2677
Epoch 15/20, Loss: 2.2604
Epoch 16/20, Loss: 2.2506
Epoch 17/20, Loss: 2.2433
Epoch 18/20, Loss: 2.2401
Epoch 19/20, Loss: 2.2315
Epoch 20/20, Loss: 2.2255
Classification Report for 'provokingviolence':
              precision    recall  f1-score   support

           0       0.49      0.23      0.32      1975
           1       0.00      0.00      0.00       966
           2       0.62      0.86      0.72      5855
           3       0.75      0.67      0.71      2191

    accuracy                           0.63     10987
   macro avg       0.46      0.44      0.44     10987
weighted avg       0.57      0.63      0.58     10987

Overall

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd

# Load the dataset
data = pd.read_csv('embedded_gptneo.csv')
data['embedding'] = data['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=','))

X = np.array(data['embedding'].tolist())

# Define target columns and their respective number of classes
target_columns = {
    'provokingviolence': 4,
    'individualharrassment': 4,
    'emotionaldistress': 3
}

# One-hot encode each target column
encoded_targets = {}
encoders = {}
for col, num_classes in target_columns.items():
    encoder = OneHotEncoder(sparse_output=False, categories='auto')
    encoded = encoder.fit_transform(data[col].values.reshape(-1, 1))
    encoded_targets[col] = encoded
    encoders[col] = encoder

# Concatenate one-hot encoded targets into a single array
y = np.hstack([encoded_targets[col] for col in target_columns])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape X to add a time dimension for MTM LSTM
X_train = X_train[:, np.newaxis, :]
X_val = X_val[:, np.newaxis, :]

# Create a custom Dataset class
class MultiOutputDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input': torch.tensor(self.features[idx], dtype=torch.float32),
            'label': torch.tensor(self.labels[idx], dtype=torch.float32)
        }

# Create DataLoaders
train_dataset = MultiOutputDataset(X_train, y_train)
val_dataset = MultiOutputDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define the MTM LSTM model
class MTMLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dims, num_layers=1):
        super(MTMLSTM, self).__init__()

        # LSTM layer
        self.lstm = nn.LSTM(
            input_dim, hidden_dim, num_layers,
            batch_first=True, bidirectional=True
        )

        # Define output heads for each target
        self.output_heads = nn.ModuleDict({
            target: nn.Linear(hidden_dim * 2, output_dim)
            for target, output_dim in output_dims.items()
        })

        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        # LSTM output
        lstm_out, _ = self.lstm(x)

        # Apply dropout
        lstm_out = self.dropout(lstm_out)

        # Compute outputs for each target
        outputs = {target: head(lstm_out[:, -1, :]) for target, head in self.output_heads.items()}
        return outputs

# Instantiate the model
input_dim = X.shape[1]  # Number of features per word embedding
hidden_dim = 64         # Number of features in LSTM hidden state
output_dims = {target: num_classes for target, num_classes in target_columns.items()}
model = MTMLSTM(input_dim, hidden_dim, output_dims)

# Define loss functions for each output
criteria = {
    target: nn.CrossEntropyLoss()
    for target in target_columns
}

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training function
def train_model(model, train_loader, criteria, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_data = batch['input'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_data)

            # Compute loss for each target
            loss = 0
            for idx, target in enumerate(target_columns.keys()):
                start = sum(list(target_columns.values())[:idx])
                end = start + target_columns[target]
                target_labels = torch.argmax(labels[:, start:end], dim=1)
                loss += criteria[target](outputs[target], target_labels)

            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

# Train the model
train_model(model, train_loader, criteria, optimizer, epochs=10)

# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    predictions = {target: [] for target in target_columns}
    true_labels = {target: [] for target in target_columns}

    with torch.no_grad():
        for batch in val_loader:
            input_data = batch['input'].to(device)
            labels = batch['label'].cpu().numpy()

            outputs = model(input_data)
            for idx, target in enumerate(target_columns.keys()):
                preds = outputs[target].cpu().numpy()
                predictions[target].append(preds)

                start = sum(list(target_columns.values())[:idx])
                end = start + target_columns[target]
                true_labels[target].append(labels[:, start:end])

    for target in target_columns:
        predictions[target] = np.vstack(predictions[target])
        true_labels[target] = np.vstack(true_labels[target])

    return predictions, true_labels

# Evaluate the model
predictions, true_labels = evaluate_model(model, val_loader)

# Function to compute metrics
def compute_metrics(predictions, true_labels, encoders, target_columns):
    for target, num_classes in target_columns.items():
        y_pred = np.argmax(predictions[target], axis=1)
        y_true = np.argmax(true_labels[target], axis=1)

        encoder = encoders[target]
        target_names = [str(cls) for cls in encoder.categories_[0]]

        print(f"Classification Report for '{target}':")
        print(classification_report(y_true, y_pred, target_names=target_names))

        accuracy = accuracy_score(y_true, y_pred)
        print(f"Overall Accuracy for '{target}': {accuracy:.4f}\n")

# Display the metrics
compute_metrics(predictions, true_labels, encoders, target_columns)


Epoch 1/10, Loss: 2.5366
Epoch 2/10, Loss: 2.4455
Epoch 3/10, Loss: 2.4235
Epoch 4/10, Loss: 2.4088
Epoch 5/10, Loss: 2.3971
Epoch 6/10, Loss: 2.3916
Epoch 7/10, Loss: 2.3766
Epoch 8/10, Loss: 2.3785
Epoch 9/10, Loss: 2.3702
Epoch 10/10, Loss: 2.3657
Classification Report for 'provokingviolence':
              precision    recall  f1-score   support

           0       0.47      0.29      0.36      1975
           1       0.00      0.00      0.00       966
           2       0.62      0.85      0.72      5855
           3       0.77      0.64      0.70      2191

    accuracy                           0.63     10987
   macro avg       0.47      0.44      0.44     10987
weighted avg       0.57      0.63      0.59     10987

Overall Accuracy for 'provokingviolence': 0.6314

Classification Report for 'individualharrassment':
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        81
           1       0.50      0.36      0.42      2386
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
data = pd.read_csv('embedded_gptneo.csv')

# Convert the 'embedding' column to numpy arrays
data['embedding'] = data['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=','))
X = np.array(data['embedding'].tolist())

# Define target columns and their respective number of classes
target_columns = {
    'provokingviolence': 4,
    'individualharrassment': 4,
    'emotionaldistress': 3
}

# One-hot encode each target column
encoded_targets = {}
encoders = {}
for col, num_classes in target_columns.items():
    encoder = OneHotEncoder(sparse_output=False)
    encoded = encoder.fit_transform(data[col].values.reshape(-1, 1))
    encoded_targets[col] = encoded
    encoders[col] = encoder

# Concatenate one-hot encoded targets into a single array
y = np.hstack([encoded_targets[col] for col in target_columns])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Custom Dataset class
class MultiOutputDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input': torch.tensor(self.features[idx], dtype=torch.float32),
            'label': torch.tensor(self.labels[idx], dtype=torch.float32)
        }

# Create DataLoaders
train_dataset = MultiOutputDataset(X_train, y_train)
val_dataset = MultiOutputDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define the Multi-Output MLP model
class MultiOutputMLPClassifier(nn.Module):
    def __init__(self, input_dim, output_dims):
        super(MultiOutputMLPClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # First hidden layer
        self.fc2 = nn.Linear(128, 64)          # Second hidden layer
        self.dropout = nn.Dropout(0.3)         # Dropout for regularization

        # Separate output heads for each target
        self.output_heads = nn.ModuleDict({
            target: nn.Linear(64, output_dim)
            for target, output_dim in output_dims.items()
        })

    def forward(self, x):
        x = torch.relu(self.fc1(x))            # Activation function
        x = self.dropout(x)                     # Apply dropout
        x = torch.relu(self.fc2(x))            # Activation function

        # Compute outputs for each target
        outputs = {target: head(x) for target, head in self.output_heads.items()}
        return outputs

# Instantiate the model
input_dim = X.shape[1]  # Number of features
output_dims = {col: num_classes for col, num_classes in target_columns.items()}
model = MultiOutputMLPClassifier(input_dim, output_dims)

# Define loss functions for each output
criteria = {
    target: nn.BCEWithLogitsLoss()
    for target in target_columns
}

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training function
def train_model(model, train_loader, criteria, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_data = batch['input'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            outputs = model(input_data)

            # Compute loss for each target
            loss = 0
            for idx, target in enumerate(target_columns.keys()):
                start = sum(list(target_columns.values())[:idx])
                end = start + target_columns[target]
                target_labels = labels[:, start:end]
                loss += criteria[target](outputs[target], target_labels)

            total_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

# Train the model
train_model(model, train_loader, criteria, optimizer, epochs=10)

# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    predictions = {target: [] for target in target_columns}
    true_labels = {target: [] for target in target_columns}

    with torch.no_grad():
        for batch in val_loader:
            input_data = batch['input'].to(device)
            labels = batch['label'].cpu().numpy()

            outputs = model(input_data)
            for idx, target in enumerate(target_columns.keys()):
                preds = outputs[target].cpu().numpy()
                predictions[target].append(preds)

                start = sum(list(target_columns.values())[:idx])
                end = start + target_columns[target]
                true_labels[target].append(labels[:, start:end])

    for target in target_columns:
        predictions[target] = np.vstack(predictions[target])
        true_labels[target] = np.vstack(true_labels[target])

    return predictions, true_labels

# Evaluate the model
predictions, true_labels = evaluate_model(model, val_loader)

# Compute metrics for each target
def compute_metrics(predictions, true_labels, encoders, target_columns):
    for target, num_classes in target_columns.items():
        y_pred = (torch.sigmoid(torch.tensor(predictions[target])) > 0.5).int().numpy()
        y_true = true_labels[target]

        y_pred_labels = np.argmax(y_pred, axis=1)
        y_true_labels = np.argmax(y_true, axis=1)

        encoder = encoders[target]
        target_names = [str(cls) for cls in encoder.categories_[0]]

        print(f"Classification Report for '{target}':")
        print(classification_report(y_true_labels, y_pred_labels, target_names=target_names))

        accuracy = accuracy_score(y_true_labels, y_pred_labels)
        print(f"Overall Accuracy for '{target}': {accuracy:.4f}\n")

# Display the metrics
compute_metrics(predictions, true_labels, encoders, target_columns)


Epoch 1/10, Loss: 1.2254
Epoch 2/10, Loss: 1.1850
Epoch 3/10, Loss: 1.1746
Epoch 4/10, Loss: 1.1678
Epoch 5/10, Loss: 1.1631
Epoch 6/10, Loss: 1.1583
Epoch 7/10, Loss: 1.1551
Epoch 8/10, Loss: 1.1525
Epoch 9/10, Loss: 1.1505
Epoch 10/10, Loss: 1.1478
Classification Report for 'provokingviolence':
              precision    recall  f1-score   support

           0       0.42      0.34      0.38      1975
           1       0.00      0.00      0.00       966
           2       0.63      0.80      0.71      5855
           3       0.75      0.68      0.72      2191

    accuracy                           0.63     10987
   macro avg       0.45      0.46      0.45     10987
weighted avg       0.56      0.63      0.59     10987

Overall Accuracy for 'provokingviolence': 0.6250

Classification Report for 'individualharrassment':
              precision    recall  f1-score   support

           0       0.01      0.42      0.02        81
           1       0.57      0.19      0.29      2386
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb

# Load the dataset
data = pd.read_csv('embedded_gptneo.csv')

# Convert the 'embedded_text' column to numpy arrays
data['embedding'] = data['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=','))
X = np.array(data['embedding'].tolist())

# Define the target columns and initialize label encoders for each
target_columns = ['provokingviolence', 'individualharrassment', 'emotionaldistress']
label_encoders = {col: LabelEncoder() for col in target_columns}

# Encode the labels for each target column
y_encoded = {}
for col in target_columns:
    y_encoded[col] = label_encoders[col].fit_transform(data[col])

# Split data into training and validation sets for each target column
train_test_splits = {}
for col in target_columns:
    X_train, X_val, y_train, y_val = train_test_split(X, y_encoded[col], test_size=0.2, random_state=42)
    train_test_splits[col] = (X_train, X_val, y_train, y_val)

# Function to train and evaluate XGBoost for each target
def train_evaluate_xgboost(target_column):
    X_train, X_val, y_train, y_val = train_test_splits[target_column]

    # Initialize XGBoost classifier with suitable parameters
    model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=len(label_encoders[target_column].classes_),  # Number of classes for the target
        eval_metric='mlogloss',
        use_label_encoder=False,
        max_depth=6,
        learning_rate=0.1,
        n_estimators=100,
        random_state=42
    )

    # Train the model
    model.fit(X_train, y_train)

    # Predict on validation data
    y_pred = model.predict(X_val)

    # Convert predictions and true labels back to original labels
    y_pred_labels = label_encoders[target_column].inverse_transform(y_pred)
    y_val_labels = label_encoders[target_column].inverse_transform(y_val)

    # Print classification report and accuracy
    print(f"Classification Report for '{target_column}':")
    print(classification_report(y_val_labels, y_pred_labels))
    accuracy = accuracy_score(y_val_labels, y_pred_labels)
    print(f"Overall Accuracy for '{target_column}': {accuracy:.4f}\n")

    return model

# Train and evaluate XGBoost model for each target column
models = {}
for col in target_columns:
    print(f"Training and evaluating model for target: {col}")
    models[col] = train_evaluate_xgboost(col)


Training and evaluating model for target: provokingviolence


Parameters: { "use_label_encoder" } are not used.



Classification Report for 'provokingviolence':
              precision    recall  f1-score   support

           0       0.50      0.22      0.31      1975
           1       0.15      0.01      0.01       966
           2       0.62      0.86      0.72      5855
           3       0.75      0.69      0.72      2191

    accuracy                           0.63     10987
   macro avg       0.51      0.44      0.44     10987
weighted avg       0.58      0.63      0.58     10987

Overall Accuracy for 'provokingviolence': 0.6347

Training and evaluating model for target: individualharrassment


Parameters: { "use_label_encoder" } are not used.



Classification Report for 'individualharrassment':
              precision    recall  f1-score   support

           0       0.20      0.01      0.02        81
           1       0.51      0.29      0.37      2386
           2       0.53      0.77      0.63      5430
           3       0.56      0.32      0.40      3090

    accuracy                           0.53     10987
   macro avg       0.45      0.35      0.36     10987
weighted avg       0.53      0.53      0.51     10987

Overall Accuracy for 'individualharrassment': 0.5342

Training and evaluating model for target: emotionaldistress


Parameters: { "use_label_encoder" } are not used.



Classification Report for 'emotionaldistress':
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       100
           1       0.59      0.40      0.48      3151
           2       0.78      0.90      0.84      7736

    accuracy                           0.75     10987
   macro avg       0.46      0.43      0.44     10987
weighted avg       0.72      0.75      0.73     10987

Overall Accuracy for 'emotionaldistress': 0.7477



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
!pip install keras tensorflow



In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Bidirectional, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Load your dataset
data_file = 'embedded_gptneo.csv'  # Adjust this to your dataset path
data = pd.read_csv(data_file)

# Assume your dataset has the text and the labels in the following columns
texts = data['embedding'].tolist()  # Column with your input text
labels = data[['provokingviolence', 'individualharrassment', 'emotionaldistress']]  # Adjust based on your actual column names

# Tokenization parameters
MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 350
EMBEDDING_DIM = 100

# Tokenization and padding
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

# Convert labels to categorical for each output
Y_provoking = pd.get_dummies(labels['provokingviolence']).values
Y_harassment = pd.get_dummies(labels['individualharrassment']).values
Y_distress = pd.get_dummies(labels['emotionaldistress']).values

# Split data into training and test sets
X_train, X_test, Y_train_provoking, Y_test_provoking = train_test_split(X, Y_provoking, test_size=0.30, random_state=1)
_, _, Y_train_harassment, Y_test_harassment = train_test_split(X, Y_harassment, test_size=0.30, random_state=1)
_, _, Y_train_distress, Y_test_distress = train_test_split(X, Y_distress, test_size=0.30, random_state=1)

# Model architecture with multiple outputs
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input_layer)
x = SpatialDropout1D(0.2)(embedding_layer)
x = Bidirectional(LSTM(200, dropout=0.2, recurrent_dropout=0.2))(x)

# Define separate output layers for each label
output_provoking = Dense(4, activation='softmax', name='provokingviolence')(x)
output_harassment = Dense(4, activation='softmax', name='individualharrassment')(x)
output_distress = Dense(3, activation='softmax', name='emotionaldistress')(x)

# Model architecture with multiple outputs
model = Model(inputs=input_layer, outputs=[output_provoking, output_harassment, output_distress])

# Compile multi-output model with separate metrics for each output
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', 'accuracy', 'accuracy'])  # One metric for each output
print(model.summary())

# Train the model
epochs = 10  # Adjust the number of epochs as needed
batch_size = 64
history = model.fit(
    X_train,
    [Y_train_provoking, Y_train_harassment, Y_train_distress],
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)]
)

# Evaluate the model
test_results = model.evaluate(X_test, [Y_test_provoking, Y_test_harassment, Y_test_distress])
print(f"Evaluation Results: {test_results}")

# Predict and evaluate each output independently
preds_provoking, preds_harassment, preds_distress = model.predict(X_test)

# Convert predictions to binary for each label
preds_provoking_binary = (preds_provoking == preds_provoking.max(axis=1, keepdims=1)).astype(int)
preds_harassment_binary = (preds_harassment == preds_harassment.max(axis=1, keepdims=1)).astype(int)
preds_distress_binary = (preds_distress == preds_distress.max(axis=1, keepdims=1)).astype(int)

# Evaluate classification metrics
print("Classification Report for Provoking Violence:")
print(classification_report(Y_test_provoking.argmax(axis=1), preds_provoking_binary.argmax(axis=1)))

print("Classification Report for Individual Harassment:")
print(classification_report(Y_test_harassment.argmax(axis=1), preds_harassment_binary.argmax(axis=1)))

print("Classification Report for Emotional Distress:")
print(classification_report(Y_test_distress.argmax(axis=1), preds_distress_binary.argmax(axis=1)))




None
Epoch 1/10
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m699s[0m 1s/step - emotionaldistress_accuracy: 0.6994 - individualharrassment_accuracy: 0.5119 - loss: 2.7548 - provokingviolence_accuracy: 0.5790 - val_emotionaldistress_accuracy: 0.7101 - val_individualharrassment_accuracy: 0.5257 - val_loss: 2.5268 - val_provokingviolence_accuracy: 0.6183
Epoch 2/10
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m744s[0m 1s/step - emotionaldistress_accuracy: 0.7108 - individualharrassment_accuracy: 0.5261 - loss: 2.5461 - provokingviolence_accuracy: 0.6111 - val_emotionaldistress_accuracy: 0.7028 - val_individualharrassment_accuracy: 0.5255 - val_loss: 2.5349 - val_provokingviolence_accuracy: 0.6157
[1m515/515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 281ms/step - emotionaldistress_accuracy: 0.7004 - individualharrassment_accuracy: 0.5152 - loss: 2.5588 - provokingviolence_accuracy: 0.6129
Evaluation Results: [2.5607388019561768, 0.7003033757209778,

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
import pandas as pd
import numpy as np
from keras.models import Model
from keras.layers import Input, Dense, Dropout
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from keras import backend as K
from sklearn.metrics import classification_report

# Load your dataset
data = pd.read_csv('embedded_gptneo.csv')  # Your dataset with precomputed embeddings

# Assuming 'embedded_text' contains lists of embeddings as strings
# Convert the string representations of lists to actual lists
X = np.array(data['embedding'].apply(lambda x: np.fromstring(x.strip("[]"), sep=',')).tolist())

# Check the shape of X after conversion
print(f"Shape of X after converting: {X.shape}")

# Prepare target variables as one-hot encoded arrays
Y_provoking = pd.get_dummies(labels['provokingviolence']).values
Y_harassment = pd.get_dummies(labels['individualharrassment']).values
Y_distress = pd.get_dummies(labels['emotionaldistress']).values

# Check shapes of Y as well
print(f"Shapes of Y: Provoking: {Y_provoking.shape}, Harassment: {Y_harassment.shape}, Distress: {Y_distress.shape}")

# Split the data
X_train, X_test, Y_train_provoking, Y_test_provoking = train_test_split(X, Y_provoking, test_size=0.3, random_state=42)
_, _, Y_train_harassment, Y_test_harassment = train_test_split(X, Y_harassment, test_size=0.3, random_state=42)
_, _, Y_train_distress, Y_test_distress = train_test_split(X, Y_distress, test_size=0.3, random_state=42)

# Build the multi-task LSTM model
input_layer = Input(shape=(X.shape[1],))  # Adjust input shape based on your embeddings
x = Dense(256, activation='relu')(input_layer)
x = Dropout(0.5)(x)
x = Dense(128, activation='relu')(x)

# Output layers for each task
output_provoking = Dense(4, activation='softmax', name='provokingviolence')(x)
output_harassment = Dense(4, activation='softmax', name='individualharrassment')(x)
output_distress = Dense(3, activation='softmax', name='emotionaldistress')(x)

metrics = ['accuracy'] * 3
# Compile the model
model = Model(inputs=input_layer, outputs=[output_provoking, output_harassment, output_distress])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = metrics)

# Train the model
epochs = 10
batch_size = 64
history = model.fit(
    X_train,
    [Y_train_provoking, Y_train_harassment, Y_train_distress],
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)]
)

# Evaluate the model
test_results = model.evaluate(X_test, [Y_test_provoking, Y_test_harassment, Y_test_distress])
print(f"Evaluation Results: {test_results}")

# Predict and evaluate
preds_provoking, preds_harassment, preds_distress = model.predict(X_test)

# Classification reports
print("Classification Report for Provoking Violence:")
print(classification_report(Y_test_provoking.argmax(axis=1), preds_provoking.argmax(axis=1)))

print("Classification Report for Individual Harassment:")
print(classification_report(Y_test_harassment.argmax(axis=1), preds_harassment.argmax(axis=1)))

print("Classification Report for Emotional Distress:")
print(classification_report(Y_test_distress.argmax(axis=1), preds_distress.argmax(axis=1)))

# Define the squared Euclidean distance function
def squared_euclidean_distance(y_true, y_pred):
    return K.sum(K.square(y_true - y_pred), axis=-1)

# Example usage of squared Euclidean distance
# This should be part of a custom metric if needed
# distance = squared_euclidean_distance(Y_test_provoking, preds_provoking)


Shape of X after converting: (54932, 768)
Shapes of Y: Provoking: (54932, 4), Harassment: (54932, 4), Distress: (54932, 3)
Epoch 1/10
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - emotionaldistress_accuracy: 0.6845 - individualharrassment_accuracy: 0.4598 - loss: 3.1622 - provokingviolence_accuracy: 0.5359 - val_emotionaldistress_accuracy: 0.7327 - val_individualharrassment_accuracy: 0.5346 - val_loss: 2.4747 - val_provokingviolence_accuracy: 0.6113
Epoch 2/10
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - emotionaldistress_accuracy: 0.7275 - individualharrassment_accuracy: 0.5185 - loss: 2.5293 - provokingviolence_accuracy: 0.6054 - val_emotionaldistress_accuracy: 0.7379 - val_individualharrassment_accuracy: 0.5473 - val_loss: 2.4286 - val_provokingviolence_accuracy: 0.6188
Epoch 3/10
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - emotionaldistress_accuracy: 0.7294 - individualharrassment_accu

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
#####
###
# Hybrid CNN attention _ Multilabel
###
#####
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
data = pd.read_csv('embedded_gptneo.csv')

# Convert 'embedding' to numpy arrays
data['embedding'] = data['embedding'].apply(lambda x: np.fromstring(x[1:-1], sep=','))
X = np.array(data['embedding'].tolist())
y = data[['emotionaldistress', 'provokingviolence', 'individualharrassment']].values

# Convert y to binary format (multi-label)
y_binary = (y > 0).astype(int)

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Dataset class
class MultilabelDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return {'input_ids': self.X[idx], 'labels': self.y[idx]}

# Create DataLoader
train_dataset = MultilabelDataset(X_train, y_train)
val_dataset = MultilabelDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Hybrid CNN-Attention model
class HybridMultilabelClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(HybridMultilabelClassifier, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=128, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool1d(2)

        # Fully connected layer after flattening
        self.fc1 = nn.Linear(128 * (input_dim // 2), 64)

        # Attention mechanism
        self.attention = nn.Linear(64, 1)

        # Final fully connected layer for classification
        self.fc2 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension for Conv1d, Shape: [batch_size, 1, input_dim]
        x = torch.relu(self.conv1(x))  # Shape: [batch_size, 128, input_dim]
        x = self.pool1(x)  # Shape: [batch_size, 128, input_dim//2]

        x = x.view(x.size(0), -1)  # Flatten for fully connected layer
        x = torch.relu(self.fc1(x))  # Shape: [batch_size, 64]

        attention_weights = torch.softmax(self.attention(x), dim=1)  # Shape: [batch_size, 1]
        x = attention_weights * x  # Apply attention

        x = torch.sigmoid(self.fc2(x))  # Sigmoid for multilabel output
        return x

# Instantiate model, loss function, optimizer
input_dim = X.shape[1]  # Number of features in embeddings
output_dim = y_binary.shape[1]  # Number of labels
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HybridMultilabelClassifier(input_dim=input_dim, output_dim=output_dim).to(device)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for multilabel
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

# Training function
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Train the model
train_model(model, train_loader, criterion, optimizer)

# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids)
            predictions.append(outputs.cpu().numpy())
            true_labels.append(labels.cpu().numpy())

    return np.vstack(predictions), np.vstack(true_labels)

# Evaluate the model
y_pred, y_true = evaluate_model(model, val_loader)

# Binarize predictions
y_pred_binary = (y_pred > 0.5).astype(int)

# Print classification report
print(classification_report(y_true, y_pred_binary, target_names=['Emotional Distress', 'Provoking Violence', 'Individual Harassment']))

# Calculate overall accuracy
overall_accuracy = accuracy_score(y_true, y_pred_binary)
print(f"Overall Accuracy: {overall_accuracy:.4f}")


Epoch 1/10, Loss: 6.2968
Epoch 2/10, Loss: 6.3053
Epoch 3/10, Loss: 6.3092
Epoch 4/10, Loss: 6.3073
Epoch 5/10, Loss: 6.3112
Epoch 6/10, Loss: 6.3112
Epoch 7/10, Loss: 6.3092
Epoch 8/10, Loss: 6.3092
Epoch 9/10, Loss: 6.3092
Epoch 10/10, Loss: 6.3092
                       precision    recall  f1-score   support

   Emotional Distress       0.99      1.00      1.00     10887
   Provoking Violence       0.82      1.00      0.90      9012
Individual Harassment       0.99      1.00      1.00     10906

            micro avg       0.93      1.00      0.97     30805
            macro avg       0.93      1.00      0.96     30805
         weighted avg       0.94      1.00      0.97     30805
          samples avg       0.93      0.99      0.96     30805

Overall Accuracy: 0.8201


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
