In [None]:
import pandas as pd
input_file = 'measuring_hate_speech.csv'
df=pd.read_csv(input_file)

In [None]:
# Calculate 'emotionaldistavg'
df['emotionaldistavg'] = df.apply(
    lambda row: row[['respect', 'sentiment', 'insult', 'dehumanize', 'humiliate', 'status']].mean()
    if row['hate_speech_score'] < 6.5 else None, axis=1)

# Calculate 'provokingviolenceavg'
df['provokingviolenceavg'] = df[['violence', 'genocide', 'dehumanize', 'attack_defend']].mean(axis=1)

# Function to calculate 'provokingviolence'
def calculate_provoking_violence(row):
    avg = row['provokingviolenceavg']
    if 0 <= avg < 1.35:
        return 0
    elif 1.35 <= avg < 1.5:
        return 1
    elif 1.5 <= avg < 3.16:
        return 2
    elif avg >= 3.16:
        return 3
    return None

# Function to calculate 'emotionaldistress'
def calculate_emotional_distress(row):
    avg = row['emotionaldistavg']
    if row['hate_speech_score'] > 6.5:
        return 3
    elif avg is not None:
        if 0 <= avg < 1.35:
            return 0
        elif 1.35 <= avg < 3.1:
            return 1
        elif 3.1 <= avg <= 4:
            return 2
    return None

# Function to adjust 'provokingviolence'
def adjust_provoking_violence(row):
    if row['provokingviolence'] == 2:
        if row['violence'] == 0 and row['genocide'] == 0 and (row['dehumanize'] in [2, 3] or row['attack_defend'] in [2, 3]):
            return 1
        elif row['violence'] == 0 and row['genocide'] == 0 and (row['dehumanize'] in [0, 1] or row['attack_defend'] in [0, 1]):
            return 0
    return row['provokingviolence']

# Apply calculations and adjustments
df['emotionaldistress'] = df.apply(calculate_emotional_distress, axis=1)
df['provokingviolence'] = df.apply(calculate_provoking_violence, axis=1)
df['provokingviolence'] = df.apply(adjust_provoking_violence, axis=1)

# Calculate 'individualharassmentavg'
df['individualharassmentavg'] = df[['status', 'insult', 'attack_defend', 'dehumanize', 'humiliate']].mean(axis=1)

# Function to calculate 'individualharrassment'
def calculate_individual_harassment(row):
    avg = row['individualharassmentavg']
    if 0 <= avg <= 1.1:
        return 0
    elif 1.1 < avg < 2.8:
        return 1
    elif 2.8 <= avg < 3.75:
        return 2
    elif 3.75 <= avg <= 4:
        return 3
    return None

# Apply individual harassment calculation
df['individualharrassment'] = df.apply(calculate_individual_harassment, axis=1)

# Function to calculate 'hatespeechintensity'
def calculate_hatespeech_intensity(row):
    if row['hatespeech'] in [0, 1]:
        return row['hatespeech']
    elif row['hatespeech'] == 2:
        if (row[['sentiment', 'respect', 'insult', 'dehumanize', 'humiliate', 'violence', 'attack_defend']].mean()) >= 3.8:
            return 3
        else:
            return 2
    return None

# Apply the function to generate 'hatespeechintensity'
df['hatespeechintensity'] = df.apply(calculate_hatespeech_intensity, axis=1)

# Select columns for the first output file
columns_to_keep = [
    'comment_id', 'annotator_id', 'platform', 'sentiment', 'respect', 'insult', 'humiliate', 'status',
    'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'text', 'hate_speech_score',
    'emotionaldistavg', 'emotionaldistress', 'provokingviolenceavg', 'provokingviolence',
    'individualharassmentavg', 'individualharrassment', 'hatespeechintensity'
]

# Save the preprocessed dataset into 'updated_dataset_preprocessed.csv'
new_df = df[columns_to_keep]
new_df.to_csv('updated_dataset_preprocessed.csv', index=False)

# Load the new dataframe and select columns for the second output file
new_df = pd.read_csv('updated_dataset_preprocessed.csv')
columns_to_save = ['comment_id', 'text', 'emotionaldistress', 'provokingviolence', 'individualharrassment', 'hatespeechintensity']

# Save the second version into 'dataset_v2.csv'
new_df_v2 = new_df[columns_to_save]
new_df_v2.to_csv('dataset_v2.csv', index=False)

# Print sample data from both datasets
print("Updated Dataset (Preprocessed):")
print(new_df.head())

print("\nDataset V2:")
print(new_df_v2.head())

# Print the length of the new dataset
print(f"\nLength of Dataset V2: {len(new_df_v2)}")


Updated Dataset (Preprocessed):
   comment_id  annotator_id  platform  sentiment  respect  insult  humiliate  \
0       47777         10873         3          0        0       0          0   
1       39773          2790         2          0        0       0          0   
2       47101          3379         3          4        4       4          4   
3       43625          7365         3          2        3       2          1   
4       12538           488         0          4        4       4          4   

   status  dehumanize  violence  ...  hatespeech  \
0       2           0         0  ...           0   
1       2           0         0  ...           0   
2       4           4         0  ...           2   
3       2           0         0  ...           0   
4       4           4         4  ...           2   

                                                text  hate_speech_score  \
0  Yes indeed. She sort of reminds me of the elde...              -3.90   
1  The trans women readi

In [None]:
new_df1 = pd.read_csv('updated_dataset_preprocessed.csv')

new_columns = [
    'emotionaldistavg', 'emotionaldistress', 'provokingviolenceavg', 'provokingviolence',
    'individualharassmentavg', 'individualharrassment', 'hatespeechintensity'
]

nan_counts = new_df1[new_columns].isna().sum()

print("Number of NaN values in each new column:")
print(nan_counts)

Number of NaN values in each new column:
emotionaldistavg           0
emotionaldistress          0
provokingviolenceavg       0
provokingviolence          0
individualharassmentavg    0
individualharrassment      0
hatespeechintensity        0
dtype: int64


In [None]:
import pandas as pd
import re
import torch
from transformers import BertTokenizer, BertModel
from bs4 import BeautifulSoup
import string

df = pd.read_csv('updated_dataset_preprocessed.csv')


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model.to(device)

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'@\w+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df['cleaned_text'] = df['text'].apply(clean_text)

def tokenize_text(text):

    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    return encoded

def embed_text(text):
    tokens = tokenize_text(text)
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    return cls_embedding


df['embedded_text'] = df['cleaned_text'].apply(embed_text)

df['embedded_text'] = df['embedded_text'].apply(lambda x: x.tolist())

columns_to_save = [
    'comment_id', 'embedded_text', 'emotionaldistress',
    'provokingviolence', 'individualharrassment', 'hatespeechintensity'
]

processed_df = df[columns_to_save]

processed_df.to_csv('processed_bert_embedded_dataset.csv', index=False)

print(processed_df.head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Using device: cuda


  text = BeautifulSoup(text, "html.parser").get_text()


KeyboardInterrupt: 

In [None]:
import pandas as pd

file_path = 'processed_bert_embedded_dataset.csv'
df = pd.read_csv(file_path)
df_filtered = df[df['hatespeechintensity'] != 0]
df_final = df_filtered.drop('hatespeechintensity', axis=1)
df_final.to_csv('final_embedded_dataset.csv', index=False)
print(df_final.head())

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
data = pd.read_csv('final_embedded_dataset.csv')
data['embedded_text'] = data['embedded_text'].apply(lambda x: np.fromstring(x[1:-1], sep=','))
X = np.array(data['embedded_text'].tolist())

# Define columns and class counts
target_columns = {
    'provokingviolence': 4,
    'individualharrassment': 4,
    'emotionaldistress': 3
}

# One-hot encode each target column and prepare the dataset
encoded_targets = {}
for col, num_classes in target_columns.items():
    encoder = OneHotEncoder(sparse_output=False)
    encoded_targets[col] = encoder.fit_transform(data[col].values.reshape(-1, 1))

# Concatenate one-hot encoded targets into a single array for easy indexing
y = np.hstack([encoded_targets[col] for col in target_columns])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape X to add a time dimension for LSTM (seq_len=1 for static embeddings)
X_train = X_train[:, np.newaxis, :]
X_val = X_val[:, np.newaxis, :]

# Create a custom Dataset class
class MultiOutputDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input': torch.tensor(self.features[idx], dtype=torch.float32),
            'label': torch.tensor(self.labels[idx], dtype=torch.float32)
        }

# Create DataLoaders
train_dataset = MultiOutputDataset(X_train, y_train)
val_dataset = MultiOutputDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define the LSTM model with multiple output heads
class MultiOutputLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dims, num_layers=1):
        super(MultiOutputLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)

        # Separate output layers for each target
        self.output_heads = nn.ModuleDict({
            target: nn.Linear(hidden_dim, output_dim) for target, output_dim in output_dims.items()
        })
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        x = self.dropout(hn[-1])

        # Return output from each output head
        outputs = {target: head(x) for target, head in self.output_heads.items()}
        return outputs

# Instantiate the model with separate output layers for each target
input_dim = X.shape[1]  # Number of features per timestep
hidden_dim = 64         # Number of features in LSTM hidden state
output_dims = {target: num_classes for target, num_classes in target_columns.items()}
model = MultiOutputLSTM(input_dim, hidden_dim, output_dims)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # Suitable for multi-class with logits
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training function for multi-output model
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_data = batch['input'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_data)
            # Compute loss separately for each output head
            losses = [criterion(outputs[target], labels[:, start:end])
                      for target, (start, end) in zip(target_columns.keys(),
                                                      [(0,4), (4,8), (8,11)])]
            loss = sum(losses)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Train the model
train_model(model, train_loader, criterion, optimizer, epochs=10)

# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    predictions, true_labels = {}, {}

    for target in target_columns.keys():
        predictions[target], true_labels[target] = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_data = batch['input'].to(device)
            labels = batch['label'].cpu().numpy()

            outputs = model(input_data)
            for target, pred in outputs.items():
                predictions[target].append(pred.cpu().numpy())
                start, end = (0,4) if target == 'provokingviolence' else ((4,8) if target == 'individualharrassment' else (8,11))
                true_labels[target].append(labels[:, start:end])

    # Concatenate batches
    predictions = {target: np.vstack(preds) for target, preds in predictions.items()}
    true_labels = {target: np.vstack(labels) for target, labels in true_labels.items()}

    return predictions, true_labels

# Evaluate the model
predictions, true_labels = evaluate_model(model, val_loader)

# Apply sigmoid, threshold, and calculate metrics for each output head
for target, num_classes in target_columns.items():
    y_pred_binary = (torch.sigmoid(torch.tensor(predictions[target])) > 0.5).int().numpy()
    y_true_binary = true_labels[target]

    y_pred_labels = np.argmax(y_pred_binary, axis=1)
    y_true_labels = np.argmax(y_true_binary, axis=1)

    print(f"Classification report for {target}:")
    print(classification_report(y_true_labels, y_pred_labels))

    overall_accuracy = accuracy_score(y_true_labels, y_pred_labels)
    print(f"Overall Accuracy for {target}: {overall_accuracy:.4f}\n")


Epoch 1/10, Loss: 1.2389
Epoch 2/10, Loss: 1.1791
Epoch 3/10, Loss: 1.1654
Epoch 4/10, Loss: 1.1570
Epoch 5/10, Loss: 1.1515
Epoch 6/10, Loss: 1.1459
Epoch 7/10, Loss: 1.1429
Epoch 8/10, Loss: 1.1390
Epoch 9/10, Loss: 1.1353
Epoch 10/10, Loss: 1.1325
Classification report for provokingviolence:
              precision    recall  f1-score   support

           0       0.36      0.45      0.40      1975
           1       0.00      0.00      0.00       966
           2       0.65      0.77      0.71      5855
           3       0.80      0.58      0.68      2191

    accuracy                           0.61     10987
   macro avg       0.45      0.45      0.44     10987
weighted avg       0.57      0.61      0.58     10987

Overall Accuracy for provokingviolence: 0.6064

Classification report for individualharrassment:
              precision    recall  f1-score   support

           0       0.01      0.28      0.01        81
           1       0.51      0.39      0.44      2386
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
data = pd.read_csv('final_embedded_dataset.csv')

# Convert the 'embedded_text' column to numpy arrays
data['embedded_text'] = data['embedded_text'].apply(lambda x: np.fromstring(x[1:-1], sep=','))

X = np.array(data['embedded_text'].tolist())

# Define target columns and their respective number of classes
target_columns = {
    'provokingviolence': 4,
    'individualharrassment': 4,
    'emotionaldistress': 3
}

# One-hot encode each target column
encoded_targets = {}
encoders = {}
for col, num_classes in target_columns.items():
    encoder = OneHotEncoder(sparse_output=False, categories='auto')
    encoded = encoder.fit_transform(data[col].values.reshape(-1, 1))
    encoded_targets[col] = encoded
    encoders[col] = encoder  # Save encoder for inverse transformations if needed

# Concatenate one-hot encoded targets into a single array
y = np.hstack([encoded_targets[col] for col in target_columns])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape X to add a time dimension for hierarchical LSTM (word-level)
X_train = X_train[:, np.newaxis, :]
X_val = X_val[:, np.newaxis, :]

# Create a custom Dataset class
class MultiOutputDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input': torch.tensor(self.features[idx], dtype=torch.float32),
            'label': torch.tensor(self.labels[idx], dtype=torch.float32)
        }

# Create DataLoaders
train_dataset = MultiOutputDataset(X_train, y_train)
val_dataset = MultiOutputDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define the Hierarchical BiLSTM model with multiple output heads
class HierarchicalBiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dims, word_num_layers=1, sent_num_layers=1):
        super(HierarchicalBiLSTM, self).__init__()

        self.word_bilstm = nn.LSTM(
            input_dim, hidden_dim, word_num_layers,
            batch_first=True, bidirectional=True
        )

        # Sentence-level BiLSTM to capture sentence-level context
        self.sent_bilstm = nn.LSTM(
            hidden_dim * 2, hidden_dim, sent_num_layers,
            batch_first=True, bidirectional=True
        )

        # Define output heads for each target
        self.output_heads = nn.ModuleDict({
            target: nn.Linear(hidden_dim * 2, output_dim)
            for target, output_dim in output_dims.items()
        })

        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        # x: (batch_size, num_words, word_embedding_dim)
        batch_size, num_words, _ = x.size()

        # Word-level BiLSTM
        word_level_outputs, _ = self.word_bilstm(x)  # (batch_size, num_words, hidden_dim * 2)

        # Get the sentence representation by aggregating word-level outputs
        sentence_embedding = word_level_outputs.mean(dim=1)  # (batch_size, hidden_dim * 2)

        # Sentence-level BiLSTM
        sentence_embedding = sentence_embedding.unsqueeze(1)  # Add a pseudo-sequence length of 1
        sentence_output, (hn, cn) = self.sent_bilstm(sentence_embedding)

        # Aggregate forward and backward hidden states
        sentence_output = sentence_output.squeeze(1)

        # Apply dropout
        x = self.dropout(sentence_output)

        # Compute outputs for each target
        outputs = {target: head(x) for target, head in self.output_heads.items()}
        return outputs

# Instantiate the model
input_dim = X.shape[1]  # Number of features per word embedding
hidden_dim = 64         # Number of features in LSTM hidden state
output_dims = {target: num_classes for target, num_classes in target_columns.items()}
model = HierarchicalBiLSTM(input_dim, hidden_dim, output_dims)

# Define loss functions for each output
criteria = {
    target: nn.CrossEntropyLoss()  # Adjust class weights if necessary
    for target in target_columns
}

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training function
def train_model(model, train_loader, criteria, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_data = batch['input'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_data)

            # Compute loss for each target
            loss = 0
            for idx, target in enumerate(target_columns.keys()):
                start = sum(list(target_columns.values())[:idx])
                end = start + target_columns[target]
                target_labels = torch.argmax(labels[:, start:end], dim=1)
                loss += criteria[target](outputs[target], target_labels)

            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

# Train the model
train_model(model, train_loader, criteria, optimizer, epochs=20)

# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    predictions = {target: [] for target in target_columns}
    true_labels = {target: [] for target in target_columns}

    with torch.no_grad():
        for batch in val_loader:
            input_data = batch['input'].to(device)
            labels = batch['label'].cpu().numpy()

            outputs = model(input_data)
            for idx, target in enumerate(target_columns.keys()):
                preds = outputs[target].cpu().numpy()
                predictions[target].append(preds)

                start = sum(list(target_columns.values())[:idx])
                end = start + target_columns[target]
                true_labels[target].append(labels[:, start:end])

    # Concatenate all batches
    for target in target_columns:
        predictions[target] = np.vstack(predictions[target])
        true_labels[target] = np.vstack(true_labels[target])

    return predictions, true_labels

# Evaluate the model
predictions, true_labels = evaluate_model(model, val_loader)

# Function to compute metrics
def compute_metrics(predictions, true_labels, encoders, target_columns):
    for target, num_classes in target_columns.items():
        y_pred = np.argmax(predictions[target], axis=1)
        y_true = np.argmax(true_labels[target], axis=1)

        encoder = encoders[target]
        target_names = [str(cls) for cls in encoder.categories_[0]]

        print(f"Classification Report for '{target}':")
        print(classification_report(y_true, y_pred, target_names=target_names))

        accuracy = accuracy_score(y_true, y_pred)
        print(f"Overall Accuracy for '{target}': {accuracy:.4f}\n")

# Display the metrics
compute_metrics(predictions, true_labels, encoders, target_columns)


Epoch 1/20, Loss: 2.4809
Epoch 2/20, Loss: 2.3710
Epoch 3/20, Loss: 2.3448
Epoch 4/20, Loss: 2.3248
Epoch 5/20, Loss: 2.3063
Epoch 6/20, Loss: 2.2881
Epoch 7/20, Loss: 2.2692
Epoch 8/20, Loss: 2.2508
Epoch 9/20, Loss: 2.2324
Epoch 10/20, Loss: 2.2095
Epoch 11/20, Loss: 2.1891
Epoch 12/20, Loss: 2.1653
Epoch 13/20, Loss: 2.1408
Epoch 14/20, Loss: 2.1193
Epoch 15/20, Loss: 2.0992
Epoch 16/20, Loss: 2.0808
Epoch 17/20, Loss: 2.0634
Epoch 18/20, Loss: 2.0390
Epoch 19/20, Loss: 2.0159
Epoch 20/20, Loss: 2.0012
Classification Report for 'provokingviolence':
              precision    recall  f1-score   support

           0       0.47      0.37      0.41      1975
           1       0.18      0.03      0.05       966
           2       0.64      0.79      0.71      5855
           3       0.74      0.70      0.72      2191

    accuracy                           0.63     10987
   macro avg       0.51      0.47      0.47     10987
weighted avg       0.59      0.63      0.60     10987

Overall

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd

# Load the dataset
data = pd.read_csv('final_embedded_dataset.csv')
data['embedded_text'] = data['embedded_text'].apply(lambda x: np.fromstring(x[1:-1], sep=','))

X = np.array(data['embedded_text'].tolist())

# Define target columns and their respective number of classes
target_columns = {
    'provokingviolence': 4,
    'individualharrassment': 4,
    'emotionaldistress': 3
}

# One-hot encode each target column
encoded_targets = {}
encoders = {}
for col, num_classes in target_columns.items():
    encoder = OneHotEncoder(sparse_output=False, categories='auto')
    encoded = encoder.fit_transform(data[col].values.reshape(-1, 1))
    encoded_targets[col] = encoded
    encoders[col] = encoder

# Concatenate one-hot encoded targets into a single array
y = np.hstack([encoded_targets[col] for col in target_columns])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape X to add a time dimension for MTM LSTM
X_train = X_train[:, np.newaxis, :]
X_val = X_val[:, np.newaxis, :]

# Create a custom Dataset class
class MultiOutputDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input': torch.tensor(self.features[idx], dtype=torch.float32),
            'label': torch.tensor(self.labels[idx], dtype=torch.float32)
        }

# Create DataLoaders
train_dataset = MultiOutputDataset(X_train, y_train)
val_dataset = MultiOutputDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define the MTM LSTM model
class MTMLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dims, num_layers=1):
        super(MTMLSTM, self).__init__()

        # LSTM layer
        self.lstm = nn.LSTM(
            input_dim, hidden_dim, num_layers,
            batch_first=True, bidirectional=True
        )

        # Define output heads for each target
        self.output_heads = nn.ModuleDict({
            target: nn.Linear(hidden_dim * 2, output_dim)
            for target, output_dim in output_dims.items()
        })

        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        # LSTM output
        lstm_out, _ = self.lstm(x)

        # Apply dropout
        lstm_out = self.dropout(lstm_out)

        # Compute outputs for each target
        outputs = {target: head(lstm_out[:, -1, :]) for target, head in self.output_heads.items()}
        return outputs

# Instantiate the model
input_dim = X.shape[1]  # Number of features per word embedding
hidden_dim = 64         # Number of features in LSTM hidden state
output_dims = {target: num_classes for target, num_classes in target_columns.items()}
model = MTMLSTM(input_dim, hidden_dim, output_dims)

# Define loss functions for each output
criteria = {
    target: nn.CrossEntropyLoss()
    for target in target_columns
}

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training function
def train_model(model, train_loader, criteria, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_data = batch['input'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_data)

            # Compute loss for each target
            loss = 0
            for idx, target in enumerate(target_columns.keys()):
                start = sum(list(target_columns.values())[:idx])
                end = start + target_columns[target]
                target_labels = torch.argmax(labels[:, start:end], dim=1)
                loss += criteria[target](outputs[target], target_labels)

            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

# Train the model
train_model(model, train_loader, criteria, optimizer, epochs=10)

# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    predictions = {target: [] for target in target_columns}
    true_labels = {target: [] for target in target_columns}

    with torch.no_grad():
        for batch in val_loader:
            input_data = batch['input'].to(device)
            labels = batch['label'].cpu().numpy()

            outputs = model(input_data)
            for idx, target in enumerate(target_columns.keys()):
                preds = outputs[target].cpu().numpy()
                predictions[target].append(preds)

                start = sum(list(target_columns.values())[:idx])
                end = start + target_columns[target]
                true_labels[target].append(labels[:, start:end])

    for target in target_columns:
        predictions[target] = np.vstack(predictions[target])
        true_labels[target] = np.vstack(true_labels[target])

    return predictions, true_labels

# Evaluate the model
predictions, true_labels = evaluate_model(model, val_loader)

# Function to compute metrics
def compute_metrics(predictions, true_labels, encoders, target_columns):
    for target, num_classes in target_columns.items():
        y_pred = np.argmax(predictions[target], axis=1)
        y_true = np.argmax(true_labels[target], axis=1)

        encoder = encoders[target]
        target_names = [str(cls) for cls in encoder.categories_[0]]

        print(f"Classification Report for '{target}':")
        print(classification_report(y_true, y_pred, target_names=target_names))

        accuracy = accuracy_score(y_true, y_pred)
        print(f"Overall Accuracy for '{target}': {accuracy:.4f}\n")

# Display the metrics
compute_metrics(predictions, true_labels, encoders, target_columns)


Epoch 1/10, Loss: 2.4877
Epoch 2/10, Loss: 2.3905
Epoch 3/10, Loss: 2.3683
Epoch 4/10, Loss: 2.3497
Epoch 5/10, Loss: 2.3359
Epoch 6/10, Loss: 2.3231
Epoch 7/10, Loss: 2.3154
Epoch 8/10, Loss: 2.3016
Epoch 9/10, Loss: 2.2947
Epoch 10/10, Loss: 2.2848
Classification Report for 'provokingviolence':
              precision    recall  f1-score   support

           0       0.46      0.42      0.44      1975
           1       0.00      0.00      0.00       966
           2       0.65      0.79      0.71      5855
           3       0.75      0.71      0.73      2191

    accuracy                           0.64     10987
   macro avg       0.46      0.48      0.47     10987
weighted avg       0.58      0.64      0.60     10987

Overall Accuracy for 'provokingviolence': 0.6371

Classification Report for 'individualharrassment':
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        81
           1       0.48      0.46      0.47      2386
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights for each target
class_weights = {}
for target, num_classes in target_columns.items():
    y_true = data[target].values
    weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_true), y=y_true)
    class_weights[target] = torch.tensor(weights, dtype=torch.float32).to(device)

# Update loss functions to include class weights
criteria = {
    target: nn.CrossEntropyLoss(weight=class_weights[target])
    for target in target_columns
}


In [4]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
data = pd.read_csv('final_embedded_dataset.csv')

# Convert the 'embedding' column to numpy arrays
data['embedded_text'] = data['embedded_text'].apply(lambda x: np.fromstring(x[1:-1], sep=','))
X = np.array(data['embedded_text'].tolist())

# Define target columns and their respective number of classes
target_columns = {
    'provokingviolence': 4,
    'individualharrassment': 4,
    'emotionaldistress': 3
}

# One-hot encode each target column
encoded_targets = {}
encoders = {}
for col, num_classes in target_columns.items():
    encoder = OneHotEncoder(sparse_output=False)
    encoded = encoder.fit_transform(data[col].values.reshape(-1, 1))
    encoded_targets[col] = encoded
    encoders[col] = encoder

# Concatenate one-hot encoded targets into a single array
y = np.hstack([encoded_targets[col] for col in target_columns])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Custom Dataset class
class MultiOutputDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input': torch.tensor(self.features[idx], dtype=torch.float32),
            'label': torch.tensor(self.labels[idx], dtype=torch.float32)
        }

# Create DataLoaders
train_dataset = MultiOutputDataset(X_train, y_train)
val_dataset = MultiOutputDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define the Multi-Output MLP model
class MultiOutputMLPClassifier(nn.Module):
    def __init__(self, input_dim, output_dims):
        super(MultiOutputMLPClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # First hidden layer
        self.fc2 = nn.Linear(128, 64)          # Second hidden layer
        self.dropout = nn.Dropout(0.3)         # Dropout for regularization

        # Separate output heads for each target
        self.output_heads = nn.ModuleDict({
            target: nn.Linear(64, output_dim)
            for target, output_dim in output_dims.items()
        })

    def forward(self, x):
        x = torch.relu(self.fc1(x))            # Activation function
        x = self.dropout(x)                     # Apply dropout
        x = torch.relu(self.fc2(x))            # Activation function

        # Compute outputs for each target
        outputs = {target: head(x) for target, head in self.output_heads.items()}
        return outputs

# Instantiate the model
input_dim = X.shape[1]  # Number of features
output_dims = {col: num_classes for col, num_classes in target_columns.items()}
model = MultiOutputMLPClassifier(input_dim, output_dims)

# Define loss functions for each output
criteria = {
    target: nn.BCEWithLogitsLoss()
    for target in target_columns
}

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training function
def train_model(model, train_loader, criteria, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_data = batch['input'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            outputs = model(input_data)

            # Compute loss for each target
            loss = 0
            for idx, target in enumerate(target_columns.keys()):
                start = sum(list(target_columns.values())[:idx])
                end = start + target_columns[target]
                target_labels = labels[:, start:end]
                loss += criteria[target](outputs[target], target_labels)

            total_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

# Train the model
train_model(model, train_loader, criteria, optimizer, epochs=10)

# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    predictions = {target: [] for target in target_columns}
    true_labels = {target: [] for target in target_columns}

    with torch.no_grad():
        for batch in val_loader:
            input_data = batch['input'].to(device)
            labels = batch['label'].cpu().numpy()

            outputs = model(input_data)
            for idx, target in enumerate(target_columns.keys()):
                preds = outputs[target].cpu().numpy()
                predictions[target].append(preds)

                start = sum(list(target_columns.values())[:idx])
                end = start + target_columns[target]
                true_labels[target].append(labels[:, start:end])

    for target in target_columns:
        predictions[target] = np.vstack(predictions[target])
        true_labels[target] = np.vstack(true_labels[target])

    return predictions, true_labels

# Evaluate the model
predictions, true_labels = evaluate_model(model, val_loader)

# Compute metrics for each target
def compute_metrics(predictions, true_labels, encoders, target_columns):
    for target, num_classes in target_columns.items():
        y_pred = (torch.sigmoid(torch.tensor(predictions[target])) > 0.5).int().numpy()
        y_true = true_labels[target]

        y_pred_labels = np.argmax(y_pred, axis=1)
        y_true_labels = np.argmax(y_true, axis=1)

        encoder = encoders[target]
        target_names = [str(cls) for cls in encoder.categories_[0]]

        print(f"Classification Report for '{target}':")
        print(classification_report(y_true_labels, y_pred_labels, target_names=target_names))

        accuracy = accuracy_score(y_true_labels, y_pred_labels)
        print(f"Overall Accuracy for '{target}': {accuracy:.4f}\n")

# Display the metrics
compute_metrics(predictions, true_labels, encoders, target_columns)


Epoch 1/10, Loss: 1.2228
Epoch 2/10, Loss: 1.1730
Epoch 3/10, Loss: 1.1642
Epoch 4/10, Loss: 1.1575
Epoch 5/10, Loss: 1.1529
Epoch 6/10, Loss: 1.1470
Epoch 7/10, Loss: 1.1458
Epoch 8/10, Loss: 1.1423
Epoch 9/10, Loss: 1.1394
Epoch 10/10, Loss: 1.1379
Classification Report for 'provokingviolence':
              precision    recall  f1-score   support

           0       0.43      0.39      0.41      1975
           1       0.00      0.00      0.00       966
           2       0.64      0.82      0.72      5855
           3       0.79      0.62      0.70      2191

    accuracy                           0.63     10987
   macro avg       0.47      0.46      0.46     10987
weighted avg       0.58      0.63      0.60     10987

Overall Accuracy for 'provokingviolence': 0.6321

Classification Report for 'individualharrassment':
              precision    recall  f1-score   support

           0       0.01      0.27      0.02        81
           1       0.58      0.19      0.28      2386
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb

# Load the dataset
data = pd.read_csv('final_embedded_dataset.csv')

# Convert the 'embedded_text' column to numpy arrays
data['embedded_text'] = data['embedded_text'].apply(lambda x: np.fromstring(x[1:-1], sep=','))
X = np.array(data['embedded_text'].tolist())

# Define the target columns and initialize label encoders for each
target_columns = ['provokingviolence', 'individualharrassment', 'emotionaldistress']
label_encoders = {col: LabelEncoder() for col in target_columns}

# Encode the labels for each target column
y_encoded = {}
for col in target_columns:
    y_encoded[col] = label_encoders[col].fit_transform(data[col])

# Split data into training and validation sets for each target column
train_test_splits = {}
for col in target_columns:
    X_train, X_val, y_train, y_val = train_test_split(X, y_encoded[col], test_size=0.2, random_state=42)
    train_test_splits[col] = (X_train, X_val, y_train, y_val)

# Function to train and evaluate XGBoost for each target
def train_evaluate_xgboost(target_column):
    X_train, X_val, y_train, y_val = train_test_splits[target_column]

    # Initialize XGBoost classifier with suitable parameters
    model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=len(label_encoders[target_column].classes_),  # Number of classes for the target
        eval_metric='mlogloss',
        use_label_encoder=False,
        max_depth=6,
        learning_rate=0.1,
        n_estimators=100,
        random_state=42
    )

    # Train the model
    model.fit(X_train, y_train)

    # Predict on validation data
    y_pred = model.predict(X_val)

    # Convert predictions and true labels back to original labels
    y_pred_labels = label_encoders[target_column].inverse_transform(y_pred)
    y_val_labels = label_encoders[target_column].inverse_transform(y_val)

    # Print classification report and accuracy
    print(f"Classification Report for '{target_column}':")
    print(classification_report(y_val_labels, y_pred_labels))
    accuracy = accuracy_score(y_val_labels, y_pred_labels)
    print(f"Overall Accuracy for '{target_column}': {accuracy:.4f}\n")

    return model

# Train and evaluate XGBoost model for each target column
models = {}
for col in target_columns:
    print(f"Training and evaluating model for target: {col}")
    models[col] = train_evaluate_xgboost(col)


Training and evaluating model for target: provokingviolence


Parameters: { "use_label_encoder" } are not used.



Classification Report for 'provokingviolence':
              precision    recall  f1-score   support

           0       0.48      0.24      0.32      1975
           1       0.21      0.01      0.03       966
           2       0.63      0.85      0.72      5855
           3       0.75      0.70      0.73      2191

    accuracy                           0.63     10987
   macro avg       0.52      0.45      0.45     10987
weighted avg       0.59      0.63      0.59     10987

Overall Accuracy for 'provokingviolence': 0.6348

Training and evaluating model for target: individualharrassment


Parameters: { "use_label_encoder" } are not used.



Classification Report for 'individualharrassment':
              precision    recall  f1-score   support

           0       0.29      0.02      0.05        81
           1       0.50      0.31      0.38      2386
           2       0.53      0.75      0.62      5430
           3       0.55      0.33      0.41      3090

    accuracy                           0.53     10987
   macro avg       0.47      0.35      0.37     10987
weighted avg       0.53      0.53      0.51     10987

Overall Accuracy for 'individualharrassment': 0.5316

Training and evaluating model for target: emotionaldistress


Parameters: { "use_label_encoder" } are not used.



Classification Report for 'emotionaldistress':
              precision    recall  f1-score   support

           0       0.50      0.03      0.06       100
           1       0.60      0.41      0.49      3151
           2       0.79      0.90      0.84      7736

    accuracy                           0.75     10987
   macro avg       0.63      0.45      0.46     10987
weighted avg       0.73      0.75      0.73     10987

Overall Accuracy for 'emotionaldistress': 0.7494



In [6]:
!pip install keras tensorflow



In [8]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Bidirectional, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Load your dataset
data_file = 'final_embedded_dataset.csv'  # Adjust this to your dataset path
data = pd.read_csv(data_file)

# Assume your dataset has the text and the labels in the following columns
texts = data['embedded_text'].tolist()  # Column with your input text
labels = data[['provokingviolence', 'individualharrassment', 'emotionaldistress']]  # Adjust based on your actual column names

# Tokenization parameters
MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 350
EMBEDDING_DIM = 100

# Tokenization and padding
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

# Convert labels to categorical for each output
Y_provoking = pd.get_dummies(labels['provokingviolence']).values
Y_harassment = pd.get_dummies(labels['individualharrassment']).values
Y_distress = pd.get_dummies(labels['emotionaldistress']).values

# Split data into training and test sets
X_train, X_test, Y_train_provoking, Y_test_provoking = train_test_split(X, Y_provoking, test_size=0.30, random_state=1)
_, _, Y_train_harassment, Y_test_harassment = train_test_split(X, Y_harassment, test_size=0.30, random_state=1)
_, _, Y_train_distress, Y_test_distress = train_test_split(X, Y_distress, test_size=0.30, random_state=1)

# Model architecture with multiple outputs
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input_layer)
x = SpatialDropout1D(0.2)(embedding_layer)
x = Bidirectional(LSTM(200, dropout=0.2, recurrent_dropout=0.2))(x)

# Define separate output layers for each label
output_provoking = Dense(4, activation='softmax', name='provokingviolence')(x)
output_harassment = Dense(4, activation='softmax', name='individualharrassment')(x)
output_distress = Dense(3, activation='softmax', name='emotionaldistress')(x)

# Model architecture with multiple outputs
model = Model(inputs=input_layer, outputs=[output_provoking, output_harassment, output_distress])

# Compile multi-output model with separate metrics for each output
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', 'accuracy', 'accuracy'])  # One metric for each output
print(model.summary())

# Train the model
epochs = 10  # Adjust the number of epochs as needed
batch_size = 64
history = model.fit(
    X_train,
    [Y_train_provoking, Y_train_harassment, Y_train_distress],
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)]
)

# Evaluate the model
test_results = model.evaluate(X_test, [Y_test_provoking, Y_test_harassment, Y_test_distress])
print(f"Evaluation Results: {test_results}")

# Predict and evaluate each output independently
preds_provoking, preds_harassment, preds_distress = model.predict(X_test)

# Convert predictions to binary for each label
preds_provoking_binary = (preds_provoking == preds_provoking.max(axis=1, keepdims=1)).astype(int)
preds_harassment_binary = (preds_harassment == preds_harassment.max(axis=1, keepdims=1)).astype(int)
preds_distress_binary = (preds_distress == preds_distress.max(axis=1, keepdims=1)).astype(int)

# Evaluate classification metrics
print("Classification Report for Provoking Violence:")
print(classification_report(Y_test_provoking.argmax(axis=1), preds_provoking_binary.argmax(axis=1)))

print("Classification Report for Individual Harassment:")
print(classification_report(Y_test_harassment.argmax(axis=1), preds_harassment_binary.argmax(axis=1)))

print("Classification Report for Emotional Distress:")
print(classification_report(Y_test_distress.argmax(axis=1), preds_distress_binary.argmax(axis=1)))




None
Epoch 1/10
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m660s[0m 1s/step - emotionaldistress_accuracy: 0.6967 - individualharrassment_accuracy: 0.5050 - loss: 2.7077 - provokingviolence_accuracy: 0.5885 - val_emotionaldistress_accuracy: 0.7176 - val_individualharrassment_accuracy: 0.5213 - val_loss: 2.4630 - val_provokingviolence_accuracy: 0.6334
Epoch 2/10
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m676s[0m 1s/step - emotionaldistress_accuracy: 0.7126 - individualharrassment_accuracy: 0.5321 - loss: 2.4768 - provokingviolence_accuracy: 0.6247 - val_emotionaldistress_accuracy: 0.7166 - val_individualharrassment_accuracy: 0.5372 - val_loss: 2.4475 - val_provokingviolence_accuracy: 0.6318
Epoch 3/10
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m679s[0m 1s/step - emotionaldistress_accuracy: 0.7183 - individualharrassment_accuracy: 0.5364 - loss: 2.4488 - provokingviolence_accuracy: 0.6325 - val_emotionaldistress_accuracy: 0.7181 - val_indiv

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
import pandas as pd
import numpy as np
from keras.models import Model
from keras.layers import Input, Dense, Dropout
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from keras import backend as K
from sklearn.metrics import classification_report

# Load your dataset
data = pd.read_csv('final_embedded_dataset.csv')  # Your dataset with precomputed embeddings

# Assuming 'embedded_text' contains lists of embeddings as strings
# Convert the string representations of lists to actual lists
X = np.array(data['embedded_text'].apply(lambda x: np.fromstring(x.strip("[]"), sep=',')).tolist())

# Check the shape of X after conversion
print(f"Shape of X after converting: {X.shape}")

# Prepare target variables as one-hot encoded arrays
Y_provoking = pd.get_dummies(labels['provokingviolence']).values
Y_harassment = pd.get_dummies(labels['individualharrassment']).values
Y_distress = pd.get_dummies(labels['emotionaldistress']).values

# Check shapes of Y as well
print(f"Shapes of Y: Provoking: {Y_provoking.shape}, Harassment: {Y_harassment.shape}, Distress: {Y_distress.shape}")

# Split the data
X_train, X_test, Y_train_provoking, Y_test_provoking = train_test_split(X, Y_provoking, test_size=0.3, random_state=42)
_, _, Y_train_harassment, Y_test_harassment = train_test_split(X, Y_harassment, test_size=0.3, random_state=42)
_, _, Y_train_distress, Y_test_distress = train_test_split(X, Y_distress, test_size=0.3, random_state=42)

# Build the multi-task LSTM model
input_layer = Input(shape=(X.shape[1],))  # Adjust input shape based on your embeddings
x = Dense(256, activation='relu')(input_layer)
x = Dropout(0.5)(x)
x = Dense(128, activation='relu')(x)

# Output layers for each task
output_provoking = Dense(4, activation='softmax', name='provokingviolence')(x)
output_harassment = Dense(4, activation='softmax', name='individualharrassment')(x)
output_distress = Dense(3, activation='softmax', name='emotionaldistress')(x)

metrics = ['accuracy'] * 3
# Compile the model
model = Model(inputs=input_layer, outputs=[output_provoking, output_harassment, output_distress])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = metrics)

# Train the model
epochs = 10
batch_size = 64
history = model.fit(
    X_train,
    [Y_train_provoking, Y_train_harassment, Y_train_distress],
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)]
)

# Evaluate the model
test_results = model.evaluate(X_test, [Y_test_provoking, Y_test_harassment, Y_test_distress])
print(f"Evaluation Results: {test_results}")

# Predict and evaluate
preds_provoking, preds_harassment, preds_distress = model.predict(X_test)

# Classification reports
print("Classification Report for Provoking Violence:")
print(classification_report(Y_test_provoking.argmax(axis=1), preds_provoking.argmax(axis=1)))

print("Classification Report for Individual Harassment:")
print(classification_report(Y_test_harassment.argmax(axis=1), preds_harassment.argmax(axis=1)))

print("Classification Report for Emotional Distress:")
print(classification_report(Y_test_distress.argmax(axis=1), preds_distress.argmax(axis=1)))

# Define the squared Euclidean distance function
def squared_euclidean_distance(y_true, y_pred):
    return K.sum(K.square(y_true - y_pred), axis=-1)

# Example usage of squared Euclidean distance
# This should be part of a custom metric if needed
# distance = squared_euclidean_distance(Y_test_provoking, preds_provoking)


Shape of X after converting: (54932, 768)
Shapes of Y: Provoking: (54932, 4), Harassment: (54932, 4), Distress: (54932, 3)
Epoch 1/10
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - emotionaldistress_accuracy: 0.7028 - individualharrassment_accuracy: 0.5016 - loss: 2.7077 - provokingviolence_accuracy: 0.5751 - val_emotionaldistress_accuracy: 0.7376 - val_individualharrassment_accuracy: 0.5411 - val_loss: 2.3961 - val_provokingviolence_accuracy: 0.6186
Epoch 2/10
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - emotionaldistress_accuracy: 0.7420 - individualharrassment_accuracy: 0.5250 - loss: 2.4447 - provokingviolence_accuracy: 0.6216 - val_emotionaldistress_accuracy: 0.7426 - val_individualharrassment_accuracy: 0.5416 - val_loss: 2.3809 - val_provokingviolence_accuracy: 0.6206
Epoch 3/10
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - emotionaldistress_accuracy: 0.7399 - individualharrassment_accu

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
