In [11]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import random
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, silhouette_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import keras_tuner
from keras.utils.vis_utils import plot_model
from transformers import XLNetTokenizer, TFXLNetForSequenceClassification, XLNetConfig, TFXLNetModel
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import XLNetForSequenceClassification, pipeline, TextClassificationPipeline
from transformers import AdamW, get_scheduler
import lime
from lime.lime_text import LimeTextExplainer
import shap
import sys
sys.path.append("/home/alexxgo21/workspace/thesis/modules")
from preprocess_raw_html import preprocess_raw_html


pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None) 


# Set random seeds for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
tf.random.set_seed(seed)



In [12]:
# load the dataset
df = pd.read_parquet("../dataset/philosophy-qna-with-gpt35answer_v1.parquet")
# get the columns of interest
df = df[["view_count","answer_count","is_accepted","answer_score","answer_creation_date","answers","question_score","question_creation_date","link","question","title","gpt35_0125_ans"]]

In [13]:
# function to get two random index if the answer array contains more than 2 answers
def select_random_index(arr,max_ans=2):
    if len(arr) > max_ans:
        return random.sample(range(len(arr)), max_ans)
    return None

# function to update the affected columns in the dataframe based on the selected random index
def update_affected_columns(row):
    if row["temp_index"] is not None:
        row["answers"] = np.array(row["answers"])[row["temp_index"]]
        row["is_accepted"] = np.array(row["is_accepted"])[row["temp_index"]]
        row["answer_score"] = np.array(row["answer_score"])[row["temp_index"]]
        row["answer_creation_date"] = np.array(row["answer_creation_date"])[row["temp_index"]]
    return row

# function to get the gpt answer from the json response
def get_gpt_answer_from_json_response(text):
    return re.sub(r"\n", " ", text["choices"][0]["message"]["content"])

def preprocess_text(text):
    # convert to lower case
    text = text.lower()
    # remove special characters and numbers
    text = re.sub("[^0-9a-zA-Z]+", " ", text)
    text = re.sub("\d+", " ", text)
    # remove stopwords
    text = " ".join([word for word in text.split() if word not in stopwords.words("english")])
    return text

In [14]:
# limit the number of answers per question to 2
df["temp_index"] = df["answers"].apply(lambda x: select_random_index(x))
# update affected columns
df = df.apply(lambda x: update_affected_columns(x), axis=1)
# drop the temp_index column
df = df.drop(columns=["temp_index"])

df_ans_exploded = df[["answers","question","title","gpt35_0125_ans"]]
# explode the answers column
df_ans_exploded = df_ans_exploded.explode("answers").reset_index(drop=True)

# preprocess the raw html
df_ans_exploded["answers"] = df_ans_exploded["answers"].apply(preprocess_raw_html)
# get the gpt answer from the json response
df_ans_exploded["gpt_response"] = df_ans_exploded["gpt35_0125_ans"].apply(get_gpt_answer_from_json_response)

In [18]:
human_response = list(df_ans_exploded["answers"].drop_duplicates())
human_response = [preprocess_text(text) for text in human_response]
gpt_response = list(df_ans_exploded["gpt_response"].drop_duplicates())
gpt_response = [preprocess_text(text) for text in gpt_response]

# 1 - human-generated, 0 - gpt-generated
Y = [1 for _ in range(len(human_response))] + [0 for _ in range(len(gpt_response))] 
X = human_response + gpt_response 

In [19]:
X = np.array(X)
Y = np.array(Y)

# train, test, and val split with StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3)
for train_index, test_index in sss.split(X,Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5)
for val_index, test_index in sss.split(X_test,Y_test):
    X_val, X_test = X_test[val_index], X_test[test_index]
    Y_val, Y_test = Y_test[val_index], Y_test[test_index]

In [20]:
X_train[0]

'important concept statistical experiments issue confounding variable researchers making common mistake assuming correlation implies causation whereas many examples explicitly demonstrate true host methods meant avoid kind mistake like control pairing subjects etc know particularly formal formulation error called false cause fallacy'

In [10]:
import tensorflow as tf

# 1. Check if TensorFlow sees any GPUs
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

# 2. List GPU devices with details
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    print("Name:", gpu.name, "Type:", gpu.device_type)
    
# 3. Get GPU device details
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        print(e)

# 4. Test GPU with simple operation
with tf.device('/GPU:0'):
    a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    c = tf.matmul(a, b)
    print("Matrix multiplication result:", c)
    print("Runs on GPU:", c.device.endswith('GPU:0'))

Num GPUs Available: 1
Name: /physical_device:GPU:0 Type: GPU
Physical devices cannot be modified after being initialized
Matrix multiplication result: tf.Tensor(
[[22. 28.]
 [49. 64.]], shape=(2, 2), dtype=float32)
Runs on GPU: True


In [11]:
def preprocess_dataset(batch:list, labels:list, batch_size:int, tokenizer:XLNetTokenizer, max_len:int=128):
    # tokenize the batch
    tokenized_batch = tokenizer(
        batch, 
        truncation=True, 
        padding="max_length", 
        max_length=128, 
        return_tensors="tf",
        return_attention_mask=True,
        return_token_type_ids=False,
        add_special_tokens=True
    )
    # one-hot encode the labels
    labels = tf.one_hot(labels, depth=2)
    # create a dataset
    dataset = tf.data.Dataset.from_tensor_slices((
            {
                'input_ids': tokenized_batch['input_ids'],
                'attention_mask': tokenized_batch['attention_mask'],
            },
            labels
        ))
    
    return dataset.shuffle(100).batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [12]:
batch_size = 16
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
train_data = preprocess_dataset(list(X_train), list(Y_train), batch_size, tokenizer)
val_data = preprocess_dataset(list(X_val), list(Y_val), batch_size, tokenizer)
test_data = preprocess_dataset(list(X_test), list(Y_test), batch_size, tokenizer)

In [None]:
xlnet = TFXLNetModel.from_pretrained("xlnet-base-cased")

word_inputs = tf.keras.Input(shape=(128,), name='word_inputs', dtype='int32')
xlnet_encodings = xlnet(word_inputs)[0]

# Collect last step from last hidden state (CLS)
doc_encoding = tf.squeeze(xlnet_encodings[:, -1:, :], axis=1)

doc_encoding = tf.keras.layers.Dropout(.1)(doc_encoding)

outputs = tf.keras.layers.Dense(2, activation='softmax', name='outputs')(doc_encoding)

model = tf.keras.Model(inputs=[word_inputs], outputs=[outputs])

In [None]:
class XLNetClassifier(tf.keras.Model):
    def __init__(self, num_classes=2, dropout_rate=0.1):
        super(XLNetClassifier, self).__init__()
        
        # self.config = XLNetConfig.from_pretrained("xlnet-base-cased", num_labels=num_classes)
        xlnet = TFXLNetModel.from_pretrained("xlnet-base-cased")
        # for layer in self.xlnet.layers:
        #     layer.trainable = False
        word_inputs = tf.keras.Input(shape=(128,), name='word_inputs', dtype='int32')
        xlnet_encodings = xlnet(word_inputs)[0]

        # Collect last step from last hidden state (CLS)
        doc_encoding = tf.squeeze(xlnet_encodings[:, -1:, :], axis=1)
    
        doc_encoding = tf.keras.layers.Dropout(.1)(doc_encoding)
     
        outputs = tf.keras.layers.Dense(2, activation='softmax', name='outputs')(doc_encoding)

        model = tf.keras.Model(inputs=[word_inputs], outputs=[outputs])
    

    def call(self, inputs):
        xlnet_output = self.xlnet(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )[0]

        cls_output = xlnet_output[:, -1, :]
        
        return self.classifier(cls_output)

model = XLNetClassifier(num_classes=2)

2025-01-26 21:23:18.044684: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 98304000 exceeds 10% of free system memory.
Some layers from the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetModel: ['lm_loss']
- This IS expected if you are initializing TFXLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLNetModel were initialized from the model checkpoint at xlnet-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetModel for predictions without further training.


In [None]:
class SubclassModelCheckpoint(tf.keras.callbacks.Callback):
    def __init__(self, filepath, monitor='val_loss', mode='min', save_best_only=True):
        super().__init__()
        self.filepath = filepath
        self.monitor = monitor
        self.best = float('inf') if mode == 'min' else float('-inf')
        self.mode = mode
        self.save_best_only = save_best_only
        
    def _is_improvement(self, current):
        if self.mode == 'min':
            return current < self.best
        return current > self.best
    
    def on_epoch_end(self, epoch, logs=None):
        current = logs.get(self.monitor)
        if current is None:
            return
            
        if self.save_best_only:
            if self._is_improvement(current):
                self.best = current
                self.model.save_weights(self.filepath)
                print(f'\nEpoch {epoch+1}: {self.monitor} improved to {current:.4f}, saving model to {self.filepath}')
        else:
            filepath = f"{self.filepath}_epoch_{epoch+1}"
            self.model.save_weights(filepath)
            print(f'\nEpoch {epoch+1}: saving model to {filepath}')

# class WarmupCosineDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
#     def __init__(self, initial_lr, warmup_steps, decay_steps):
#         super().__init__()
#         self.initial_lr = initial_lr
#         self.warmup_steps = warmup_steps
#         self.decay_steps = decay_steps
        
#     def __call__(self, step):
#         # Warmup phase
#         warmup_pct = tf.math.minimum(1.0, step / self.warmup_steps)
#         warmup_lr = self.initial_lr * warmup_pct
        
#         # Cosine decay phase
#         progress = (step - self.warmup_steps) / (self.decay_steps - self.warmup_steps)
#         cosine_decay = 0.5 * (1.0 + tf.math.cos(math.pi * progress))
        
#         # Combine warmup and decay
#         return tf.where(step < self.warmup_steps, warmup_lr, 
#                        self.initial_lr * cosine_decay)

epochs = 100
# total_steps = (len(train_data) // batch_size) * epochs
# warmup_steps = total_steps // 10

# lr_schedule = WarmupCosineDecay(
#     initial_lr=2e-5,
#     warmup_steps=warmup_steps,
#     decay_steps=total_steps
# )

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.AUC(name='auc'),
    ]
)

checkpoint_path = "./checkpoints/best_model_weights"
# checkpoint_callback = SubclassModelCheckpoint(
#     filepath=checkpoint_path,
#     monitor='val_accuracy',
#     mode='max',
#     save_best_only=True
# )

callbacks = [
    # Early stopping with patience
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        mode='min'
    ),
    
    # Model checkpoint
    SubclassModelCheckpoint(
        filepath=checkpoint_path,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True
    ),

    tf.keras.callbacks.TensorBoard(
        log_dir='./logs',
        update_freq='epoch',
        profile_batch=0
    ),
    
    # Reduce LR on plateau
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        min_lr=1e-6
    )
]

history = model.fit(
    train_data,
    validation_data=val_data,
    epochs=epochs,
    callbacks=callbacks,
)

# Load best weights after training
# model.load_weights(checkpoint_path)

2025-01-26 21:23:38.590868: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 98304000 exceeds 10% of free system memory.


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f4d98ec1d90>

In [17]:
model.evaluate(test_data)



[0.24209408462047577,
 0.9158316850662231,
 0.9158316850662231,
 0.9158316850662231,
 0.9709819555282593]