In [None]:
# Mount Google Drive at /content/drive; the dataset CSV and the saved model
# artifacts used by later cells are read from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from textblob import TextBlob
import spacy
import pandas as pd
import numpy as np
from transformers import create_optimizer

In [None]:
# Load dataset
data_path = "/content/drive/MyDrive/balanced_reddit_dataset.csv"
df = pd.read_csv(data_path)

# Reduce dataset size for quicker processing: keep only the rows at positions
# [start, 2 * start), i.e. the second twentieth of the file.
# `.copy()` detaches the slice from the full frame so the column assignments
# below write to an independent DataFrame instead of a view of the original
# (which makes pandas emit SettingWithCopyWarning).
start = len(df) // 20
df = df.iloc[start:2 * start].copy()

# One model input per row: the reply followed by the comment it answers.
# astype(str) stringifies every value, so missing comments become the text 'nan'.
df['text'] = 'comment: ' + df['comment'].astype(str) + '   parent comment: ' + df['parent_comment'].astype(str)
df['label'] = df['label'].astype(int)
# Preprocess text
def preprocess_text(text):
    """Clean one raw text string.

    Removes URL-like tokens, @mentions and '#' characters (the word after the
    '#' is kept), then lowercases the result and trims surrounding whitespace.
    """
    # (pattern, flags) pairs applied in order; each match is replaced by ''.
    removals = (
        (r"http\S+|www\S+|https\S+", re.MULTILINE),  # URLs
        (r"@\w+", 0),                                # mentions
        (r"#", 0),                                   # hash signs
    )
    cleaned = text
    for pattern, flags in removals:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    return cleaned.lower().strip()

# Clean every combined comment/parent string in place.
df['text'] = df['text'].apply(preprocess_text)

# Model inputs are the cleaned texts; targets are the integer labels.
X_new, y_new = df['text'], df['label']

# Stratified 80/20 train/test split with a fixed seed so the partition is
# reproducible and both classes keep their proportions in each part.
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_new,
    y_new,
    test_size=0.2,
    random_state=42,
    stratify=y_new,
)

In [None]:
import pickle
from tensorflow.keras.models import load_model

In [None]:
# Drive directory holding previously saved artifacts (presumably from an
# earlier BiLSTM training run — confirm).
folder = '/content/drive/MyDrive/drive-download-20241211T071912Z-001'  # Update with your path

# Saved NumPy arrays: embedding matrix, padded inputs and their labels.
embedding_matrix = np.load(f'{folder}/embedding_matrix.npy')
X_train_pad = np.load(f'{folder}/X_train_pad.npy')
X_test_pad = np.load(f'{folder}/X_test_pad.npy')
y_train = np.load(f'{folder}/y_train.npy')
y_test = np.load(f'{folder}/y_test.npy')

# Saved Keras BiLSTM model.
bilstm_model = load_model(f'{folder}/bilstm_model.h5')

# Tokenizer that belongs to the BiLSTM model.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open(f'{folder}/bilstm_tokenizer.pkl', 'rb') as tokenizer_file:
    bilstm_tokenizer = pickle.load(tokenizer_file)

In [None]:
# Pretrained BERT tokenizer plus a BERT encoder with a 2-label classification head.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize and encode data (truncated to at most 128 tokens, padded, returned as TF tensors).
# NOTE(review): X_train / X_test are not assigned anywhere in the visible notebook
# (only X_train_new / X_test_new and the padded arrays X_train_pad / X_test_pad are),
# so this cell raises NameError on a fresh kernel. They presumably hold the raw texts
# whose labels are y_train / y_test — load or rebuild them before this cell and confirm
# the row order matches X_train_pad / y_train.
train_encodings = bert_tokenizer(list(X_train), truncation=True, padding=True, max_length=128, return_tensors='tf')
test_encodings = bert_tokenizer(list(X_test), truncation=True, padding=True, max_length=128, return_tensors='tf')


In [None]:
# Labels as tensors so they can be zipped with the encodings below.
train_labels = tf.convert_to_tensor(y_train)
test_labels = tf.convert_to_tensor(y_test)

# Only these two tokenizer outputs are fed to the model.
model_input_keys = ("input_ids", "attention_mask")
train_inputs = {key: train_encodings[key] for key in model_input_keys}
test_inputs = {key: test_encodings[key] for key in model_input_keys}

# Batches of 32 in the original row order. The training set is left
# unshuffled: a later cell stacks predict(train_dataset) row-wise with
# X_train_pad and fits against y_train, which relies on this order.
train_dataset = tf.data.Dataset.from_tensor_slices((train_inputs, train_labels)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((test_inputs, test_labels)).batch(32)

# Optimizer and learning-rate schedule sized for the whole run
# (batches per epoch * epochs), starting at 3e-5 with no warm-up.
num_epochs = 3
num_train_steps = len(train_dataset) * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=3e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
)

# No `loss=` is passed to compile(); only the optimizer and an accuracy metric.
bert_model.compile(optimizer=optimizer, metrics=['accuracy'])

# Fine-tune, evaluating on the test batches after every epoch.
history = bert_model.fit(train_dataset, validation_data=test_dataset, epochs=num_epochs)



In [None]:
#Feature Extraction with SpaCy and TextBlob
nlp = spacy.load('en_core_web_sm')

def extract_features(text):
    """Return ``[polarity, pos_1, pos_2, ...]`` for one text.

    The first element is TextBlob's sentiment polarity; the remaining elements
    are the spaCy coarse POS tag of every token, in order. The list therefore
    mixes a number with strings and its length varies with the token count.
    """
    features = [TextBlob(text).sentiment.polarity]
    features.extend(token.pos_ for token in nlp(text))
    return features

# Run the spaCy/TextBlob feature extractor over every training and test text.
# NOTE(review): X_train / X_test are not assigned anywhere in the visible notebook,
# so these lines raise NameError on a fresh kernel.
# NOTE(review): X_train_features / X_test_features are not read by any later visible
# cell; if they are truly unused, this per-row spaCy pass can be dropped.
X_train_features = X_train.apply(extract_features)
X_test_features = X_test.apply(extract_features)

# Concatenate all features
# (disabled experiment, kept for reference)
#bert_embeddings = bert_model.predict(test_dataset)[0]

In [None]:

# Save the fine-tuned BERT model to the `bert_model` sub-directory of `folder` on Drive.
bert_model.save_pretrained(f'{folder}/bert_model')

# Save the BERT tokenizer next to it, in `bert_model_tokenizer`.
bert_tokenizer.save_pretrained(f'{folder}/bert_model_tokenizer')


In [None]:
# Score both splits with the fine-tuned BERT classifier.
train_outputs = bert_model.predict(train_dataset)
test_outputs = bert_model.predict(test_dataset)

# Despite the variable names these are the classifier's logits (the model was
# built with num_labels=2), not hidden-state embeddings.
bert_embeddings_train, bert_embeddings_test = train_outputs.logits, test_outputs.logits

# Final feature matrix per split: the padded arrays loaded from disk with the
# BERT logits appended as extra columns.
X_train_final = np.hstack((X_train_pad, bert_embeddings_train))
X_test_final = np.hstack((X_test_pad, bert_embeddings_test))




In [None]:
# Fit a 100-tree random forest (fixed seed) on the stacked training features;
# fit() returns the estimator itself, so the call can be chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_final, y_train)

# Predict the held-out rows and print per-class precision / recall / F1.
y_pred = rf_model.predict(X_test_final)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.74      0.75      0.74      5052
           1       0.75      0.73      0.74      5057

    accuracy                           0.74     10109
   macro avg       0.74      0.74      0.74     10109
weighted avg       0.74      0.74      0.74     10109



In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the random forest on the test rows, as a percentage with two decimals.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print("Accuracy of the model: ")
print(f"{accuracy_pct}%")

Accuracy of the model: 
74.13%


In [None]:
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification, create_optimizer

In [None]:
# Pretrained RoBERTa tokenizer and encoder with a 2-label classification head.
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = TFRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Tokenize and encode data for RoBERTa with the same settings for both splits.
# NOTE(review): X_train / X_test are not assigned anywhere in the visible
# notebook; this cell raises NameError on a fresh kernel.
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=128, return_tensors='tf')
train_encodings = roberta_tokenizer(list(X_train), **tokenizer_kwargs)
test_encodings = roberta_tokenizer(list(X_test), **tokenizer_kwargs)

train_labels = tf.convert_to_tensor(y_train)
test_labels = tf.convert_to_tensor(y_test)

# Batches of 32 in the original row order. This rebinds train_dataset /
# test_dataset, which previously held the BERT-tokenized data.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(input_ids=train_encodings["input_ids"], attention_mask=train_encodings["attention_mask"]),
    train_labels,
)).batch(32)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(input_ids=test_encodings["input_ids"], attention_mask=test_encodings["attention_mask"]),
    test_labels,
)).batch(32)

# Fresh optimizer and schedule for a 3-epoch run (num_batches * num_epochs steps).
num_train_steps = 3 * len(train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=3e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [None]:
# Compile with the optimizer created in the previous cell and an accuracy metric.
# NOTE(review): no `loss=` is passed; presumably the Hugging Face model falls back
# to its built-in loss — confirm for the installed transformers version.
roberta_model.compile(optimizer=optimizer, metrics=['accuracy'])

# Train RoBERTa model for 3 epochs, evaluating on test_dataset after each epoch.
# This rebinds `history`, which previously held the BERT training history.
history = roberta_model.fit(train_dataset, validation_data=test_dataset, epochs=3)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Pass each split through the fine-tuned RoBERTa classifier exactly once.
# The datasets are batched without shuffling, so row i of the outputs
# corresponds to row i of the labels and of the padded arrays.
train_outputs = roberta_model.predict(train_dataset)
test_outputs = roberta_model.predict(test_dataset)

# Extract logits from RoBERTa model (classifier scores, not hidden-state embeddings).
roberta_embeddings_train = train_outputs.logits
roberta_embeddings_test = test_outputs.logits

# `roberta_embeddings` used to be produced by a separate predict(test_dataset)[0]
# call, i.e. a second full inference pass over the test set yielding the same
# logits. The name is kept for any cell that still reads it, but it now reuses
# the result computed above.
roberta_embeddings = roberta_embeddings_test

# Final feature matrix per split: padded arrays with the RoBERTa logits
# appended as extra columns.
X_train_final = np.hstack([X_train_pad, roberta_embeddings_train])
X_test_final = np.hstack([X_test_pad, roberta_embeddings_test])




In [None]:
# Fit a 100-tree random forest (fixed seed) on the RoBERTa-stacked features
# and predict the held-out rows.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_final, y_train)
y_pred = rf_model.predict(X_test_final)

# Evaluate the model. The trailing accuracy label is printed here; the value
# itself is printed by the following cell.
print(
    "Classification Report:",
    classification_report(y_test, y_pred),
    "Accuracy of the model: ",
    sep="\n",
)

In [None]:
from sklearn.metrics import accuracy_score
# Test accuracy as a percentage rounded to two decimals.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print(f"{accuracy_pct}%")

75.09%


In [None]:
# Save the fine-tuned RoBERTa model to the `roberta_model` sub-directory of `folder`.
roberta_model.save_pretrained(f'{folder}/roberta_model')

# Save the RoBERTa tokenizer next to it, in `roberta_model_tokenizer`.
roberta_tokenizer.save_pretrained(f'{folder}/roberta_model_tokenizer')

In [None]:
# Locations of the saved artifacts on Drive.
folder = '/content/drive/MyDrive/drive-download-20241211T071912Z-001'  # Update with your path
roberta_model_path = f'{folder}/roberta_model'
roberta_tokenizer_path = f'{folder}/roberta_model_tokenizer'
bilstm_path = f'{folder}/bilstm_model.h5'
bilstm_tokenizer_path = f'{folder}/bilstm_tokenizer.pkl'
embedding_matrix_path = f'{folder}/embedding_matrix.npy'

# Restore the fine-tuned RoBERTa classifier and its tokenizer from disk.
roberta_model = TFRobertaForSequenceClassification.from_pretrained(roberta_model_path)
roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_tokenizer_path)

# Restore the BiLSTM model and the tokenizer that belongs to it.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open(bilstm_tokenizer_path, 'rb') as tokenizer_file:
    bilstm_tokenizer = pickle.load(tokenizer_file)
bilstm_model = load_model(bilstm_path)

# Restore the embedding matrix.
embedding_matrix = np.load(embedding_matrix_path)


In [None]:
from transformers import create_optimizer

# Encode the new train/test texts with the reloaded RoBERTa tokenizer.
encode_kwargs = dict(truncation=True, padding=True, max_length=128, return_tensors='tf')
train_encodings = roberta_tokenizer(list(X_train_new), **encode_kwargs)
test_encodings = roberta_tokenizer(list(X_test_new), **encode_kwargs)

train_labels = tf.convert_to_tensor(y_train_new)
test_labels = tf.convert_to_tensor(y_test_new)

# Batches of 32 in the original row order (no shuffling), so later
# predict() outputs stay aligned with y_train_new / y_test_new.
train_features = {
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
}
test_features = {
    "input_ids": test_encodings["input_ids"],
    "attention_mask": test_encodings["attention_mask"],
}
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels)).batch(32)

# New optimizer and schedule sized for 3 epochs (num_batches * num_epochs).
num_train_steps = 3 * len(train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=3e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
)

# Compile the model (no explicit loss is passed).
roberta_model.compile(optimizer=optimizer, metrics=['accuracy'])

# Fine-tune the loaded RoBERTa model on the new split.
# NOTE(review): the reloaded model was already trained on an earlier split; if
# that split overlaps X_test_new, the validation numbers here are optimistic —
# confirm the two data slices are disjoint.
history = roberta_model.fit(train_dataset, validation_data=test_dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Save the RoBERTa model fine-tuned on the new split to `roberta_model_2` under `folder`.
roberta_model.save_pretrained(f'{folder}/roberta_model_2')

# Save the RoBERTa tokenizer next to it, in `roberta_model_tokenizer_2`.
roberta_tokenizer.save_pretrained(f'{folder}/roberta_model_tokenizer_2')

In [None]:
# Classifier logits of the fine-tuned RoBERTa for the train and test batches,
# computed in that order.
train_roberta_outputs, test_roberta_outputs = (
    roberta_model.predict(split).logits for split in (train_dataset, test_dataset)
)



In [None]:
# Convert the new texts to token-id sequences with the BiLSTM tokenizer and
# pad/truncate them to 128 ids (padding added at the end).
# NOTE: this rebinds X_train_pad / X_test_pad, replacing the arrays loaded
# from disk earlier; cells that pair those arrays with y_train / y_test must
# not be re-run after this one without reloading them.
X_train_pad = pad_sequences(
    bilstm_tokenizer.texts_to_sequences(X_train_new), maxlen=128, padding='post'
)
X_test_pad = pad_sequences(
    bilstm_tokenizer.texts_to_sequences(X_test_new), maxlen=128, padding='post'
)

# Recompile the loaded BiLSTM with a fresh Adam optimizer and binary
# cross-entropy, then continue training on the new split for 3 epochs.
optimizer = Adam(learning_rate=0.001)
bilstm_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
bilstm_model.fit(
    X_train_pad,
    y_train_new,
    validation_data=(X_test_pad, y_test_new),
    epochs=3,
)

# BiLSTM predictions for both splits (the model's outputs, used later as
# stacking features).
train_bilstm_outputs = bilstm_model.predict(X_train_pad)
test_bilstm_outputs = bilstm_model.predict(X_test_pad)

Epoch 1/3
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - accuracy: 0.5822 - loss: 0.6675 - val_accuracy: 0.6560 - val_loss: 0.6216
Epoch 2/3
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - accuracy: 0.6578 - loss: 0.6183 - val_accuracy: 0.6606 - val_loss: 0.6059
Epoch 3/3
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 14ms/step - accuracy: 0.6769 - loss: 0.5973 - val_accuracy: 0.6734 - val_loss: 0.5966
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [None]:
# Stack the RoBERTa logits and the BiLSTM predictions side by side so every
# row carries both models' outputs.
X_train_final = np.hstack((train_roberta_outputs, train_bilstm_outputs))
X_test_final = np.hstack((test_roberta_outputs, test_bilstm_outputs))

# Fit a 100-tree random forest (fixed seed) on the stacked outputs;
# fit() returns the estimator itself.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_final, y_train_new)

# Predict the held-out rows and print per-class precision / recall / F1.
y_pred = rf_model.predict(X_test_final)
report = classification_report(y_test_new, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.75      0.76      0.75      5062
           1       0.76      0.74      0.75      5047

    accuracy                           0.75     10109
   macro avg       0.75      0.75      0.75     10109
weighted avg       0.75      0.75      0.75     10109



In [None]:
import pickle  # Save the BiLSTM model


# Persist the re-trained BiLSTM under a new file name so the original
# bilstm_model.h5 is left untouched.
bilstm_model.save(f'{folder}/bilstm_model2.h5')

# Persist the BiLSTM tokenizer with pickle alongside the model.
with open(f'{folder}/bilstm_tokenizer2.pkl', 'wb') as tokenizer_file:
    pickle.dump(bilstm_tokenizer, tokenizer_file)


In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the stacked random forest on the new test split, as a
# percentage rounded to two decimals.
accuracy_pct = round(accuracy_score(y_test_new, y_pred) * 100, 2)
print("Accuracy of the model: ")
print(f"{accuracy_pct}%")

Accuracy of the model: 
75.26%
