In [1]:
import numpy as np
import os

# Function to load embeddings from saved files
def load_embeddings(directory):
    # Extract numeric ranges and sort filenames accordingly
    def extract_range(filename):
        start, end = map(int, filename.rstrip('.npy').split('_')[-2:])
        return start, end
    
    # Get sorted filenames
    sentence_embeddings_files = sorted([f for f in os.listdir(directory) if f.startswith('sentence_embeddings_') and f.endswith('.npy')],
                                       key=extract_range)
    text_embeddings_files = sorted([f for f in os.listdir(directory) if f.startswith('text_embeddings_') and f.endswith('.npy')],
                                   key=extract_range)
    
    sentence_embeddings_list = []
    text_embeddings_list = []
    
    for sentence_file, text_file in zip(sentence_embeddings_files, text_embeddings_files):
        print(f"Loading sentence embeddings from file: {sentence_file}")
        sentence_embeddings = np.load(os.path.join(directory, sentence_file))
        
        print(f"Loading text embeddings from file: {text_file}")
        text_embeddings = np.load(os.path.join(directory, text_file))
        
        sentence_embeddings_list.append(sentence_embeddings)
        text_embeddings_list.append(text_embeddings)
    
    all_sentence_embeddings = np.concatenate(sentence_embeddings_list, axis=0)
    all_text_embeddings = np.concatenate(text_embeddings_list, axis=0)
    
    return all_sentence_embeddings, all_text_embeddings

# Load embeddings for training
all_sentence_embeddings, all_text_embeddings = load_embeddings('bert_embeddings')
print("All Sentence Embeddings Shape:", all_sentence_embeddings.shape)
print("All Text Embeddings Shape:", all_text_embeddings.shape)



Loading sentence embeddings from file: sentence_embeddings_1_1000.npy
Loading text embeddings from file: text_embeddings_1_1000.npy
Loading sentence embeddings from file: sentence_embeddings_1001_2000.npy
Loading text embeddings from file: text_embeddings_1001_2000.npy
Loading sentence embeddings from file: sentence_embeddings_2001_3000.npy
Loading text embeddings from file: text_embeddings_2001_3000.npy
Loading sentence embeddings from file: sentence_embeddings_3001_4000.npy
Loading text embeddings from file: text_embeddings_3001_4000.npy
Loading sentence embeddings from file: sentence_embeddings_4001_5000.npy
Loading text embeddings from file: text_embeddings_4001_5000.npy
Loading sentence embeddings from file: sentence_embeddings_5001_5742.npy
Loading text embeddings from file: text_embeddings_5001_5742.npy
All Sentence Embeddings Shape: (5742, 6, 768)
All Text Embeddings Shape: (5742, 768)


In [2]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import os

# Load labels from CSV file
labels_df = pd.read_csv('processed_ruddit_data.csv')
labels = labels_df['class'].values
print("Labels Shape:", labels.shape)

  from .autonotebook import tqdm as notebook_tqdm



Labels Shape: (5742,)


In [3]:
# Define the neural network architecture
max_sentences = all_sentence_embeddings.shape[1]  # Number of sentences per sample
embedding_dim = all_sentence_embeddings.shape[2]  # Embedding dimension

print(max_sentences)
print(embedding_dim)

6
768


In [4]:
# Convert labels to numpy array
labels = np.array(labels)
all_sentence_embeddings = np.array(all_sentence_embeddings)
all_text_embeddings = np.array(all_text_embeddings)

In [5]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate
from sklearn.model_selection import train_test_split

# Define the necessary parameters
num_sentences = 6  # Replace with your actual number of sentences per sample
embedding_dim = 768  # Dimension of the BERT embeddings
num_classes = 4  # Number of classes for classification

# Input shape for sentence embeddings
input_sentences = Input(shape=(num_sentences, embedding_dim))

# Define parallel lines for sentence embeddings
sentence_outputs = []
for _ in range(num_sentences):
    x = Dense(128, activation='relu')(input_sentences[:, _, :])
    x = Dropout(0.2)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.2)(x)
    sentence_output = Dense(20, activation='relu')(x)
    sentence_outputs.append(sentence_output)

# Input shape for text embeddings
input_text = Input(shape=(embedding_dim,))

# Define parallel lines for text embeddings
text_outputs = []
for _ in range(3):
    x = Dense(128, activation='relu')(input_text)
    x = Dropout(0.2)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    text_output = Dense(60, activation='relu')(x)
    text_outputs.append(text_output)

# Concatenate the outputs of sentence and text branches
concatenated = Concatenate()(sentence_outputs + text_outputs)

# Define a fully connected layer
x = Dense(256, activation='relu')(concatenated)
x = Dropout(0.2)(x)

# Define the sequential layers
sequential_layers = Sequential([
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

# Connect the fully connected layer with the sequential layers
output = sequential_layers(x)

# Build the model
model = Model(inputs=[input_sentences, input_text], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()


In [6]:

num_samples = 5742
num_sentences = 6 
embedding_dim = 768  

X_sentence_embeddings = all_sentence_embeddings  # This should be of shape (num_samples, num_sentences, embedding_dim)
X_text_embeddings = all_text_embeddings  # This should be of shape (num_samples, embedding_dim)
y_labels = labels  # This should be of shape (num_samples,)

# Ensure your data is the correct shape
assert X_sentence_embeddings.shape == (num_samples, num_sentences, embedding_dim)
assert X_text_embeddings.shape == (num_samples, embedding_dim)
assert y_labels.shape[0] == num_samples

# Split data into training and validation sets
X_train_sentences, X_val_sentences, X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_sentence_embeddings, X_text_embeddings, y_labels, test_size=0.2, random_state=42
)

# Train the model
history = model.fit(
    [X_train_sentences, X_train_text],
    y_train,
    validation_data=([X_val_sentences, X_val_text], y_val),
    epochs=40,  # Set the number of epochs
    batch_size=32,  # Set the batch size
    verbose=1  # Print progress during training
)


Epoch 1/40




[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.7661 - loss: 0.7105 - val_accuracy: 0.8303 - val_loss: 0.4797
Epoch 2/40
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8194 - loss: 0.5079 - val_accuracy: 0.8494 - val_loss: 0.4383
Epoch 3/40
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8384 - loss: 0.4234 - val_accuracy: 0.8547 - val_loss: 0.3940
Epoch 4/40
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8471 - loss: 0.4135 - val_accuracy: 0.8503 - val_loss: 0.4080
Epoch 5/40
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8557 - loss: 0.3803 - val_accuracy: 0.8573 - val_loss: 0.3963
Epoch 6/40
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8626 - loss: 0.3644 - val_accuracy: 0.8555 - val_loss: 0.4034
Epoch 7/40
[1m144/144[0m [32m━━━━━━━

In [None]:

# Prepare binary labels for Stage 1: neutral and non-neutral
labels_binary = np.where(labels == 0, 0, 1)

# Separate humor samples for Stage 2 classification 
humor_labels = labels[labels != 0]  # Exclude neeutral samples
humor_sentence_embeddings = all_sentence_embeddings[labels != 0]
humor_text_embeddings = all_text_embeddings[labels != 0]

# Create binary labels for Stage 2 (0 for hate, 1 for support)
humor_labels_binary = np.where(humor_labels == 1, 0, 1)


In [None]:

from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate
from tensorflow.keras.models import Model

# Stage 1: 
input_sentences = Input(shape=(num_sentences, embedding_dim))
input_text = Input(shape=(embedding_dim,))

# Parallel layers for sentence and text embeddings
sentence_layers = [Dense(128, activation='relu')(input_sentences[:, i, :]) for i in range(num_sentences)]
text_layers = [Dense(128, activation='relu')(input_text)] * 3  # Repeating for consistency

# Concatenate layers
combined = Concatenate()(sentence_layers + text_layers)

# Fully connected layers
x = Dense(256, activation='relu')(combined)
x = Dropout(0.2)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
output_stage1 = Dense(1, activation='sigmoid')(x)

# Model for Stage 1
stage1_model = Model(inputs=[input_sentences, input_text], outputs=output_stage1)
stage1_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
stage1_model.summary()


In [8]:

from sklearn.model_selection import train_test_split

# Split data for Stage 1
X_train_sen, X_val_sen, X_train_text, X_val_text, y_train_bin, y_val_bin = train_test_split(
    all_sentence_embeddings, all_text_embeddings, labels_binary, test_size=0.2, random_state=42
)

# Train Stage 1 model
history_stage1 = stage1_model.fit(
    [X_train_sen, X_train_text],
    y_train_bin,
    validation_data=([X_val_sen, X_val_text], y_val_bin),
    epochs=20,
    batch_size=32,
    verbose=1
)


Epoch 1/20




[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.8009 - loss: 0.4613 - val_accuracy: 0.8338 - val_loss: 0.3914
Epoch 2/20
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8322 - loss: 0.3991 - val_accuracy: 0.8407 - val_loss: 0.3784
Epoch 3/20
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8506 - loss: 0.3594 - val_accuracy: 0.8425 - val_loss: 0.3803
Epoch 4/20
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8619 - loss: 0.3294 - val_accuracy: 0.8451 - val_loss: 0.3926
Epoch 5/20
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8771 - loss: 0.3164 - val_accuracy: 0.8433 - val_loss: 0.3878
Epoch 6/20
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8960 - loss: 0.2584 - val_accuracy: 0.8529 - val_loss: 0.3984
Epoch 7/20
[1m144/144[0m [32m━━━━━

In [None]:

# Stage 2:
input_sentences_stage2 = Input(shape=(num_sentences, embedding_dim))
input_text_stage2 = Input(shape=(embedding_dim,))

# Layers for Stage 2
sentence_layers_stage2 = [Dense(128, activation='relu')(input_sentences_stage2[:, i, :]) for i in range(num_sentences)]
text_layers_stage2 = [Dense(128, activation='relu')(input_text_stage2)] * 3

# Concatenate layers
combined_stage2 = Concatenate()(sentence_layers_stage2 + text_layers_stage2)

# Fully connected layers
x_stage2 = Dense(256, activation='relu')(combined_stage2)
x_stage2 = Dropout(0.2)(x_stage2)
x_stage2 = Dense(128, activation='relu')(x_stage2)
x_stage2 = Dropout(0.2)(x_stage2)
output_stage2 = Dense(1, activation='sigmoid')(x_stage2)

# Model for Stage 2
stage2_model = Model(inputs=[input_sentences_stage2, input_text_stage2], outputs=output_stage2)
stage2_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
stage2_model.summary()


In [10]:

# Split data for Stage 2
X_train_sen_humor, X_val_sen_humor, X_train_text_humor, X_val_text_humor, y_train_humor, y_val_humor = train_test_split(
    humor_sentence_embeddings, humor_text_embeddings, humor_labels_binary, test_size=0.2, random_state=42
)

# Train Stage 2 model
history_stage2 = stage2_model.fit(
    [X_train_sen_humor, X_train_text_humor],
    y_train_humor,
    validation_data=([X_val_sen_humor, X_val_text_humor], y_val_humor),
    epochs=20,
    batch_size=32,
    verbose=1
)


Epoch 1/20




[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.6316 - loss: 0.6252 - val_accuracy: 0.8532 - val_loss: 0.3653
Epoch 2/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9141 - loss: 0.2386 - val_accuracy: 0.9450 - val_loss: 0.1413
Epoch 3/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9284 - loss: 0.1771 - val_accuracy: 0.9450 - val_loss: 0.1395
Epoch 4/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9643 - loss: 0.0986 - val_accuracy: 0.9450 - val_loss: 0.1489
Epoch 5/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9706 - loss: 0.0823 - val_accuracy: 0.9404 - val_loss: 0.1794
Epoch 6/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9671 - loss: 0.0995 - val_accuracy: 0.9037 - val_loss: 0.2543
Epoch 7/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Predict on the validation set
y_pred_probs = stage2_model.predict([X_val_sen_humor, X_val_text_humor])

# Step 2: Convert probabilities to binary labels
y_pred_binary = (y_pred_probs > 0.5).astype(int)

# Step 3: Compute the confusion matrix
conf_matrix = confusion_matrix(y_val_humor, y_pred_binary)

# Step 4: Compute metrics for each class
report = classification_report(y_val_humor, y_pred_binary, target_names=["Class 0", "Class 1"])

# Print results
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(report)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step
Confusion Matrix:
[[ 96   2]
 [  9 111]]

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.91      0.98      0.95        98
     Class 1       0.98      0.93      0.95       120

    accuracy                           0.95       218
   macro avg       0.95      0.95      0.95       218
weighted avg       0.95      0.95      0.95       218

