In [1]:
import numpy as np
import os

# Function to load embeddings from saved files
def load_embeddings(directory):
    """Load and stack the chunked sentence/text embedding ``.npy`` files.

    The directory is scanned for files named
    ``sentence_embeddings_<start>_<end>.npy`` and
    ``text_embeddings_<start>_<end>.npy``. Each family is ordered numerically
    by ``(start, end)`` and the chunks are concatenated along axis 0.

    Args:
        directory: Path of the folder holding the ``.npy`` chunks.

    Returns:
        A tuple ``(all_sentence_embeddings, all_text_embeddings)`` of NumPy
        arrays, each the axis-0 concatenation of its chunks.

    Raises:
        ValueError: If no chunk files are found, if a matching filename does
            not end in two integers, or if the sentence and text chunks do
            not cover exactly the same ``(start, end)`` ranges.
    """
    def extract_range(filename):
        # splitext removes exactly the extension; str.rstrip('.npy') strips
        # any trailing run of the characters '.', 'n', 'p', 'y' instead.
        stem = os.path.splitext(filename)[0]
        start, end = map(int, stem.split('_')[-2:])
        return start, end

    def select_chunks(filenames, prefix):
        # Numeric sort: a plain string sort would put "..._1001_2000" before "..._1_1000".
        return sorted(
            (f for f in filenames if f.startswith(prefix) and f.endswith('.npy')),
            key=extract_range,
        )

    # List the directory once and derive both file families from the same snapshot.
    filenames = os.listdir(directory)
    sentence_embeddings_files = select_chunks(filenames, 'sentence_embeddings_')
    text_embeddings_files = select_chunks(filenames, 'text_embeddings_')

    if not sentence_embeddings_files and not text_embeddings_files:
        raise ValueError(f"No embedding chunk files found in directory: {directory!r}")

    # zip() would silently drop or misalign chunks if the two families differ,
    # leaving sentence rows paired with the wrong text rows.
    sentence_ranges = [extract_range(f) for f in sentence_embeddings_files]
    text_ranges = [extract_range(f) for f in text_embeddings_files]
    if sentence_ranges != text_ranges:
        raise ValueError(
            "Sentence and text embedding chunks do not cover the same ranges: "
            f"sentence={sentence_ranges}, text={text_ranges}"
        )

    sentence_embeddings_list = []
    text_embeddings_list = []

    for sentence_file, text_file in zip(sentence_embeddings_files, text_embeddings_files):
        print(f"Loading sentence embeddings from file: {sentence_file}")
        sentence_embeddings_list.append(np.load(os.path.join(directory, sentence_file)))

        print(f"Loading text embeddings from file: {text_file}")
        text_embeddings_list.append(np.load(os.path.join(directory, text_file)))

    all_sentence_embeddings = np.concatenate(sentence_embeddings_list, axis=0)
    all_text_embeddings = np.concatenate(text_embeddings_list, axis=0)

    return all_sentence_embeddings, all_text_embeddings

# Read every chunk under 'embeddings' and report the stacked array shapes
all_sentence_embeddings, all_text_embeddings = load_embeddings('embeddings')
print("All Sentence Embeddings Shape:", np.shape(all_sentence_embeddings))
print("All Text Embeddings Shape:", np.shape(all_text_embeddings))



Loading sentence embeddings from file: sentence_embeddings_1_1000.npy
Loading text embeddings from file: text_embeddings_1_1000.npy
Loading sentence embeddings from file: sentence_embeddings_1001_2000.npy
Loading text embeddings from file: text_embeddings_1001_2000.npy
Loading sentence embeddings from file: sentence_embeddings_2001_3000.npy
Loading text embeddings from file: text_embeddings_2001_3000.npy
Loading sentence embeddings from file: sentence_embeddings_3001_4000.npy
Loading text embeddings from file: text_embeddings_3001_4000.npy
Loading sentence embeddings from file: sentence_embeddings_4001_5000.npy
Loading text embeddings from file: text_embeddings_4001_5000.npy
Loading sentence embeddings from file: sentence_embeddings_5001_6000.npy
Loading text embeddings from file: text_embeddings_5001_6000.npy
Loading sentence embeddings from file: sentence_embeddings_6001_7000.npy
Loading text embeddings from file: text_embeddings_6001_7000.npy
Loading sentence embeddings from file: s

In [2]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import os

# Read the dataset CSV and keep its 'joke_type' column as the label vector
labels_df = pd.read_csv('Offensive_Humor_detection.csv')
labels = labels_df.loc[:, 'joke_type'].values
print("Labels Shape:", np.shape(labels))

  from .autonotebook import tqdm as notebook_tqdm



Labels Shape: (21860,)


In [3]:
# Read the model input dimensions off axes 1 and 2 of the sentence-embedding array:
# axis 1 -> sentences per sample, axis 2 -> embedding width
max_sentences, embedding_dim = (all_sentence_embeddings.shape[axis] for axis in (1, 2))

# One value per line, sentence count first
print(max_sentences, embedding_dim, sep="\n")

6
768


In [4]:
# Make sure labels and both embedding sets are NumPy arrays.
# np.asarray returns the input object unchanged when it is already an ndarray,
# whereas np.array copies by default, needlessly duplicating the large
# embedding arrays in memory.
labels = np.asarray(labels)
all_sentence_embeddings = np.asarray(all_sentence_embeddings)
all_text_embeddings = np.asarray(all_text_embeddings)

In [5]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate
from sklearn.model_selection import train_test_split

# Hyperparameters of the multi-class classifier
num_sentences = 6  # Replace with your actual number of sentences per sample
embedding_dim = 768  # Dimension of the BERT embeddings
num_classes = 4  # Number of classes for classification


def _dense_tower(tensor, hidden_units, output_units):
    """Apply Dense(relu) -> Dropout(0.2) per hidden width, then a final Dense(relu)."""
    for units in hidden_units:
        tensor = Dense(units, activation='relu')(tensor)
        tensor = Dropout(0.2)(tensor)
    return Dense(output_units, activation='relu')(tensor)


# Sentence input: one (num_sentences, embedding_dim) matrix per sample
input_sentences = Input(shape=(num_sentences, embedding_dim))

# One independent 128 -> 64 -> 20 tower per sentence position
sentence_outputs = [
    _dense_tower(input_sentences[:, position, :], (128, 64), 20)
    for position in range(num_sentences)
]

# Text input: a single embedding_dim vector per sample
input_text = Input(shape=(embedding_dim,))

# Three independent 128 -> 128 -> 60 towers, all fed the same text embedding
text_outputs = [_dense_tower(input_text, (128, 128), 60) for _ in range(3)]

# Merge every branch output into one feature vector
concatenated = Concatenate()(sentence_outputs + text_outputs)

# Shared fully connected layer on top of the merged features
x = Dense(256, activation='relu')(concatenated)
x = Dropout(0.2)(x)

# Classification head, kept as its own Sequential sub-model
sequential_layers = Sequential([
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
])
output = sequential_layers(x)

# Assemble the two-input model; integer class ids are used as targets
model = Model(inputs=[input_sentences, input_text], outputs=output)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Show the layer-by-layer summary
model.summary()


In [6]:

# Expected dataset dimensions, used only by the shape checks below
num_samples = 21860
num_sentences = 6
embedding_dim = 768

# Features and targets under the names used for training
X_sentence_embeddings = all_sentence_embeddings  # expected (num_samples, num_sentences, embedding_dim)
X_text_embeddings = all_text_embeddings  # expected (num_samples, embedding_dim)
y_labels = labels  # expected (num_samples,)

# Fail fast if the loaded arrays do not have the expected shapes
assert X_sentence_embeddings.shape == (num_samples, num_sentences, embedding_dim)
assert X_text_embeddings.shape == (num_samples, embedding_dim)
assert y_labels.shape[0] == num_samples

# 80/20 train/validation split with a fixed seed; all three arrays are split together
(
    X_train_sentences, X_val_sentences,
    X_train_text, X_val_text,
    y_train, y_val,
) = train_test_split(
    X_sentence_embeddings,
    X_text_embeddings,
    y_labels,
    random_state=42,
    test_size=0.2,
)

# Fit for 40 epochs in mini-batches of 32, validating after every epoch
history = model.fit(
    x=[X_train_sentences, X_train_text],
    y=y_train,
    batch_size=32,
    epochs=40,
    verbose=1,
    validation_data=([X_val_sentences, X_val_text], y_val),
)


Epoch 1/40




[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.6030 - loss: 0.8096 - val_accuracy: 0.7543 - val_loss: 0.5642
Epoch 2/40
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.7536 - loss: 0.5822 - val_accuracy: 0.7665 - val_loss: 0.5433
Epoch 3/40
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7680 - loss: 0.5424 - val_accuracy: 0.7523 - val_loss: 0.5779
Epoch 4/40
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7779 - loss: 0.5212 - val_accuracy: 0.7585 - val_loss: 0.5438
Epoch 5/40
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7839 - loss: 0.5136 - val_accuracy: 0.7617 - val_loss: 0.5863
Epoch 6/40
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7894 - loss: 0.4974 - val_accuracy: 0.7697 - val_loss: 0.5448
Epoch 7/40
[1m547/547[0m [32m━━━━━━━

In [7]:

import warnings

# Label ids the two-stage scheme relies on
NON_HUMOR_LABEL = 4    # samples with this label are treated as non-humor
CLEAN_HUMOR_LABEL = 0  # humor samples with this label are treated as clean humor

# Boolean mask of humor samples, computed once and reused for every array below
is_humor = labels != NON_HUMOR_LABEL

# Prepare binary labels for Stage 1: Humor (1) vs Non-Humor (0)
labels_binary = np.where(is_humor, 1, 0)

# Separate humor samples for Stage 2 classification (offensive vs classic humor)
humor_labels = labels[is_humor]  # Exclude non-humor samples
humor_sentence_embeddings = all_sentence_embeddings[is_humor]
humor_text_embeddings = all_text_embeddings[is_humor]

# Create binary labels for Stage 2 (0 for clean humor, 1 for offensive humor)
humor_labels_binary = np.where(humor_labels == CLEAN_HUMOR_LABEL, 0, 1)

# A binary target containing a single class makes any classifier score 100%
# trivially, so surface that instead of letting training "succeed" silently.
if np.unique(labels_binary).size < 2:
    warnings.warn(
        f"Stage 1 target has a single class: label {NON_HUMOR_LABEL} (non-humor) "
        f"{'never' if is_humor.all() else 'always'} occurs in `labels`. "
        "Check NON_HUMOR_LABEL against the values in the 'joke_type' column."
    )
if np.unique(humor_labels_binary).size < 2:
    warnings.warn(
        "Stage 2 target has a single class: the humor subset does not contain "
        "both clean and offensive samples. Check CLEAN_HUMOR_LABEL."
    )


In [8]:

from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate
from tensorflow.keras.models import Model

# Stage 1: Define model for humor vs non-humor classification
input_sentences = Input(shape=(num_sentences, embedding_dim))
input_text = Input(shape=(embedding_dim,))

# One independent Dense projection per sentence position
sentence_layers = [Dense(128, activation='relu')(input_sentences[:, i, :]) for i in range(num_sentences)]

# Three independent Dense projections of the text embedding.
# A comprehension builds a fresh layer on every iteration; `[tensor] * 3`
# would repeat the *same* tensor three times, so the concatenation would
# contain one branch duplicated rather than three parallel branches.
text_layers = [Dense(128, activation='relu')(input_text) for _ in range(3)]

# Concatenate all branch outputs into one feature vector
combined = Concatenate()(sentence_layers + text_layers)

# Fully connected layers
x = Dense(256, activation='relu')(combined)
x = Dropout(0.2)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
# Single sigmoid unit for the binary target
output_stage1 = Dense(1, activation='sigmoid')(x)

# Model for Stage 1
stage1_model = Model(inputs=[input_sentences, input_text], outputs=output_stage1)
stage1_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
stage1_model.summary()


In [9]:

from sklearn.model_selection import train_test_split

# Stage 1 data: 80/20 train/validation split of the full dataset, fixed seed
(
    X_train_sen, X_val_sen,
    X_train_text, X_val_text,
    y_train_bin, y_val_bin,
) = train_test_split(
    all_sentence_embeddings,
    all_text_embeddings,
    labels_binary,
    random_state=42,
    test_size=0.2,
)

# Fit the Stage 1 classifier for 20 epochs, validating after every epoch
history_stage1 = stage1_model.fit(
    x=[X_train_sen, X_train_text],
    y=y_train_bin,
    batch_size=32,
    epochs=20,
    verbose=1,
    validation_data=([X_val_sen, X_val_text], y_val_bin),
)


Epoch 1/20




[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9898 - loss: 0.0132 - val_accuracy: 1.0000 - val_loss: 5.1990e-14
Epoch 2/20
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.0000 - loss: 4.4131e-12 - val_accuracy: 1.0000 - val_loss: 5.1982e-14
Epoch 3/20
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.0000 - loss: 9.3813e-13 - val_accuracy: 1.0000 - val_loss: 5.1982e-14
Epoch 4/20
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.0000 - loss: 3.1058e-13 - val_accuracy: 1.0000 - val_loss: 5.1982e-14
Epoch 5/20
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.0000 - loss: 3.3978e-11 - val_accuracy: 1.0000 - val_loss: 5.1964e-14
Epoch 6/20
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 1.0000 - loss: 1.2075e-12 - val_accuracy: 1.0000 - val_loss: 5.1964

In [10]:

# Stage 2: Define model for offensive vs classic humor classification
input_sentences_stage2 = Input(shape=(num_sentences, embedding_dim))
input_text_stage2 = Input(shape=(embedding_dim,))

# One independent Dense projection per sentence position
sentence_layers_stage2 = [Dense(128, activation='relu')(input_sentences_stage2[:, i, :]) for i in range(num_sentences)]

# Three independent Dense projections of the text embedding.
# A comprehension builds a fresh layer on every iteration; `[tensor] * 3`
# would repeat the *same* tensor three times, so the concatenation would
# contain one branch duplicated rather than three parallel branches.
text_layers_stage2 = [Dense(128, activation='relu')(input_text_stage2) for _ in range(3)]

# Concatenate all branch outputs into one feature vector
combined_stage2 = Concatenate()(sentence_layers_stage2 + text_layers_stage2)

# Fully connected layers
x_stage2 = Dense(256, activation='relu')(combined_stage2)
x_stage2 = Dropout(0.2)(x_stage2)
x_stage2 = Dense(128, activation='relu')(x_stage2)
x_stage2 = Dropout(0.2)(x_stage2)
# Single sigmoid unit for the binary target
output_stage2 = Dense(1, activation='sigmoid')(x_stage2)

# Model for Stage 2
stage2_model = Model(inputs=[input_sentences_stage2, input_text_stage2], outputs=output_stage2)
stage2_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
stage2_model.summary()


In [11]:

# Stage 2 data: 80/20 train/validation split of the humor-only subset, fixed seed
(
    X_train_sen_humor, X_val_sen_humor,
    X_train_text_humor, X_val_text_humor,
    y_train_humor, y_val_humor,
) = train_test_split(
    humor_sentence_embeddings,
    humor_text_embeddings,
    humor_labels_binary,
    random_state=42,
    test_size=0.2,
)

# Fit the Stage 2 classifier for 40 epochs, validating after every epoch
history_stage2 = stage2_model.fit(
    x=[X_train_sen_humor, X_train_text_humor],
    y=y_train_humor,
    batch_size=32,
    epochs=40,
    verbose=1,
    validation_data=([X_val_sen_humor, X_val_text_humor], y_val_humor),
)


Epoch 1/40




[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8242 - loss: 0.3660 - val_accuracy: 0.8687 - val_loss: 0.2906
Epoch 2/40
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8787 - loss: 0.2771 - val_accuracy: 0.8744 - val_loss: 0.2925
Epoch 3/40
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8881 - loss: 0.2530 - val_accuracy: 0.8731 - val_loss: 0.2792
Epoch 4/40
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8966 - loss: 0.2334 - val_accuracy: 0.8721 - val_loss: 0.2889
Epoch 5/40
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9055 - loss: 0.2126 - val_accuracy: 0.8760 - val_loss: 0.2932
Epoch 6/40
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9193 - loss: 0.1917 - val_accuracy: 0.8673 - val_loss: 0.3144
Epoch 7/40
[1m547/547[0m [32m━━━━━━━

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Predict on the validation set
y_pred_probs = stage2_model.predict([X_val_sen_humor, X_val_text_humor])

# Step 2: Convert probabilities to binary labels (threshold 0.5).
# The model ends in a single-unit Dense layer, so predictions come back with a
# trailing unit axis; ravel() flattens them to match the 1-D label vector.
y_pred_binary = (y_pred_probs > 0.5).astype(int).ravel()

# Both class ids are passed explicitly so the matrix is always 2x2 and the
# report always has two rows. Without `labels`, classification_report raises
# when only one class appears in the data while two target_names are given.
class_ids = [0, 1]

# Step 3: Compute the confusion matrix
conf_matrix = confusion_matrix(y_val_humor, y_pred_binary, labels=class_ids)

# Step 4: Compute metrics for each class
report = classification_report(
    y_val_humor,
    y_pred_binary,
    labels=class_ids,
    target_names=["Class 0", "Class 1"],
)

# Print results
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(report)