In [None]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import plotly.figure_factory as ff
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib_venn import venn3,venn2,venn2_circles
import seaborn as sns
import string
import skimpy
from ydata_profiling import ProfileReport
import re
import nltk
import missingno as msno
import datetime
import time
from tqdm import trange
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import random

import sys
sys.path.append("/home/alexxgo21/workspace/Thesis/scripts")

from preprocess_raw_html import preprocess_raw_html

import warnings
warnings.filterwarnings("ignore")
# nltk.download('omw-1.4', quiet=True)

pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None) 

### Load the dataset

In [20]:
df = pd.read_parquet("../dataset/philosophy-qna-with-gpt35answer_v1.parquet")

In [None]:
sum([len(x) for x in df["answers"]])

In [None]:
df.info()

In [23]:
# profile = ProfileReport(df,explorative=True)
# profile.to_file("profile.html")

### Check for missing values

In [None]:
msno.matrix(df,labels=True)

### Get the related columns for further analysis

In [25]:
df = df[["view_count","answer_count","is_accepted","answer_score","answer_creation_date","answers","question_score","question_creation_date","link","question","title","gpt35_0125_ans"]]

In [None]:
df.head()

In [27]:
def word_count_raw(arr):
    """Return the number of whitespace-separated words of every string in ``arr``.

    Args:
        arr: iterable of strings (one entry per answer).

    Returns:
        list[int]: word counts, in the same order as ``arr``.
    """
    counts = []
    for answer in arr:
        counts.append(len(answer.split()))
    return counts

# function to clean raw text
def preprocess_text_with_stopwords_raw(arr):
    """Normalise every string in ``arr`` while keeping stop words.

    Each string is lower-cased and every run of characters outside
    ``[0-9a-zA-Z]`` is collapsed into a single space. No stop-word filtering
    is applied.

    Args:
        arr: iterable of strings (e.g. the answers of one question).

    Returns:
        list[str]: a new list with the normalised strings. The input is left
        untouched, so the function also accepts immutable sequences.
    """
    return [re.sub("[^0-9a-zA-Z]+", " ", text.lower()) for text in arr]

# ds["length"] = ds["answer"].str.len()
# Per-question list of answer word counts, computed in three explicit stages:
# run the project helper preprocess_raw_html on every answer, normalise the
# text, then count whitespace-separated tokens.
cleaned_answers = df["answers"].apply(
    lambda answers: [preprocess_raw_html(answer) for answer in answers]
)
normalised_answers = cleaned_answers.apply(preprocess_text_with_stopwords_raw)
df["word_count"] = normalised_answers.apply(word_count_raw)
# ds["mean_sentence_len"] = ds["answer"].map(lambda ans: np.mean([len(s) for s in tokenize.sent_tokenize(ans)]))

In [28]:
df["question"] = df["question"].apply(preprocess_raw_html)

In [None]:
# Normalise the question text and count its words.
# The helpers `preprocess_text_with_stopwords` and `word_count` are defined
# further down in this notebook, so calling them here raises NameError on a
# fresh kernel. The pandas string methods below perform the same steps
# (lower-case, collapse non-alphanumeric runs to one space, count tokens)
# without that dependency.
df["question"] = (
    df["question"]
    .str.lower()
    .str.replace("[^0-9a-zA-Z]+", " ", regex=True)
)
df["question_word_count"] = df["question"].str.split().str.len()

In [None]:
df = df.sort_values(by="question_word_count",ascending=True)

In [None]:
df["gpt_ans_unpack"] = df["gpt35_0125_ans"].apply(lambda x: re.sub(r"\n", " ", x["choices"][0]["message"]["content"]))

In [None]:
df[df["question"].str.contains("water")]

##### Questions date range

In [None]:
# Number of questions per calendar year.
# `question_creation_date` holds epoch seconds. pd.to_datetime converts the
# whole column in one vectorised call and, unlike datetime.fromtimestamp,
# does not depend on the local time zone of the machine running the notebook.
questions_date = pd.to_datetime(df["question_creation_date"], unit="s")
year = Counter(questions_date.dt.year)

ax = sns.barplot(x=list(year.keys()),y=list(year.values()),errwidth=0,color="#3373cc")
# Print the count on top of every bar.
for container in ax.containers:
    ax.bar_label(container)
plt.xlabel("Tahun")
plt.ylabel("Jumlah Pertanyaan")
plt.title("Jumlah Pertanyaan Berdasarkan Tahun")
plt.show()

##### Answer count

In [None]:
# How many questions received a given number of answers.
answer_count_per_question = Counter(df["answer_count"].tolist())
n_answers = list(answer_count_per_question)
n_questions = [answer_count_per_question[key] for key in n_answers]

ax = sns.barplot(x=n_answers, y=n_questions, errwidth=0, color="#3373cc")
# Print the count on top of every bar.
for container in ax.containers:
    ax.bar_label(container)
plt.xlabel("Number of Answer")
plt.ylabel("Count")
plt.title("Number of Answer Per Question")
plt.show()

##### Questions score

In [None]:
# How many questions have a given score.
question_score = Counter(df["question_score"].tolist())
score_values = list(question_score)
score_freqs = [question_score[value] for value in score_values]

plt.figure(figsize=(10,6))
ax = sns.barplot(x=score_values, y=score_freqs, errwidth=0, color="#3373cc")
# Print the count on top of every bar.
for container in ax.containers:
    ax.bar_label(container)

plt.xlabel("Question Score")
plt.ylabel("Count")
plt.title("Score Per Question")
plt.show()

# Exploratory Data Analysis

In [4]:
def limit_number_of_answers(arr,max_ans=2):
    """Return at most ``max_ans`` answers, picked at random without repetition.

    Args:
        arr: sequence of answers belonging to one question.
        max_ans: maximum number of answers to keep.

    Returns:
        ``arr`` itself when it already has ``max_ans`` items or fewer,
        otherwise a new list with ``max_ans`` distinct items of ``arr``.
    """
    if len(arr) > max_ans:
        # random.sample draws WITHOUT replacement. random.choices draws with
        # replacement and could therefore return the same answer twice.
        # list() is needed because random.sample only accepts real sequences.
        return random.sample(list(arr), k=max_ans)
    return arr
    
# Seed the RNG so the same answers are sampled on every run of the notebook.
random.seed(42)
df["answers"] = df["answers"].apply(limit_number_of_answers)
ans = df["answers"].tolist()

In [5]:
ans = [preprocess_raw_html(answer) for sublist in ans for answer in sublist]

In [None]:
len(ans)

In [6]:
questions = df["question"].apply(preprocess_raw_html)

In [7]:
# Extract the text of the first choice of every stored GPT-3.5 response and
# replace its newlines with spaces.
gpt_ans = [
    re.sub(r"\n", " ", response["choices"][0]["message"]["content"])
    for response in df["gpt35_0125_ans"].to_list()
]

In [None]:
# Compare how many human and GPT-3.5 answers are available.
answer_sources = ["Manusia", "GPT-3.5"]
answer_totals = [len(ans), len(gpt_ans)]

plt.figure(figsize=(7,6))
ax = sns.barplot(x=answer_sources, y=answer_totals, errwidth=0)
# Print the count on top of every bar.
for container in ax.containers:
    ax.bar_label(container)
plt.ylabel("Jumlah Jawaban")
plt.title("Jumlah Data Jawaban Manusia dan GPT-3.5")
plt.show()

In [27]:
# Stack human answers, GPT-3.5 answers and questions into one long frame with
# a "text" column and a "label" column naming the source.
# Each label list is sized from its own text collection; the question labels
# were previously sized from len(gpt_ans).
ds = pd.concat(
    [
        pd.DataFrame({"text": ans, "label": ["Respon manusia"] * len(ans)}),
        pd.DataFrame({"text": gpt_ans, "label": ["Respon model GPT-3.5"] * len(gpt_ans)}),
        pd.DataFrame({"text": questions, "label": ["Pertanyaan"] * len(questions)}),
    ],
    ignore_index=True,
)
# Shuffle the rows; a fixed random_state keeps the order reproducible.
ds = ds.sample(frac=1, random_state=42)

In [None]:
ds.head(3)

In [None]:
len(ds)

### Length and Word Count Distribution

In [28]:
def word_count(ans):
    """Return the number of whitespace-separated words in the string ``ans``."""
    tokens = ans.split()
    return len(tokens)

# function to clean raw text
def preprocess_text_with_stopwords(text):
    """Lower-case ``text`` and collapse non-alphanumeric runs into one space.

    Stop words are kept; no stop-word filtering is applied.

    Args:
        text: the raw string.

    Returns:
        str: the normalised string.
    """
    lowered = text.lower()
    return re.sub("[^0-9a-zA-Z]+", " ", lowered)

# ds["length"] = ds["answer"].str.len()
# Word count of every text after lower-casing and punctuation stripping.
normalised_text = ds["text"].apply(preprocess_text_with_stopwords)
ds["word_count"] = normalised_text.apply(word_count)
# ds["mean_sentence_len"] = ds["answer"].map(lambda ans: np.mean([len(s) for s in tokenize.sent_tokenize(ans)]))

In [None]:
ds[ds["label"] == "Respon manusia"]["word_count"].describe()

In [None]:
ds[ds["label"] == "Respon model GPT-3.5"]["word_count"].describe()

In [None]:
ds[ds["label"] == "Pertanyaan"]["word_count"].describe()

In [None]:
ds = ds.sort_values(by="word_count",ascending=True)

In [29]:
plot_ds = ds
plot_ds = plot_ds.rename(columns={"word_count":"jumlah kata"})

In [None]:
import matplotlib.colors as mcolors

# Get the current seaborn color palette
current_palette = sns.color_palette()

# Convert the second palette color (index 1) to a hex code
color = current_palette[1]
hex_code = mcolors.to_hex(color)

# Show the hex string so it can be reused as an explicit plot color
print(hex_code)

In [None]:
# Overlaid, density-normalised word-count histograms for the two answer sources
# (questions are left out).
answer_labels = ["Respon model GPT-3.5", "Respon manusia"]
answer_rows = plot_ds[plot_ds["label"].isin(answer_labels)]

fig = px.histogram(
    answer_rows,
    x='jumlah kata',
    color='label',
    barmode='overlay',
    histnorm='probability density',
    color_discrete_sequence=["#1f77b4", "#ff7f0e"],
)
# Fixed figure size in pixels.
fig.update_layout(width=800, height=600)
fig.show()
# # Add vertical lines for mean, median, and mode
# for class_name, stats in grouped_df.iterrows():
#     fig.add_vline(x=stats['mean'], line_width=2, line_dash="dash", line_color="red", annotation_text=f"Mean: {stats['mean']:.2f}", annotation_position="top right")
#     fig.add_vline(x=stats['median'], line_width=2, line_dash="dot", line_color="green", annotation_text=f"Median: {stats['median']:.2f}", annotation_position="top left")
#     fig.add_vline(x=stats['mode'], line_width=2, line_dash="solid", line_color="blue", annotation_text=f"Mode: {stats['mode']:.2f}", annotation_position="bottom right")

# # Add range annotations as text
# for class_name, stats in grouped_df.iterrows():
#     fig.add_annotation(x=stats['min'], y=0.05, text=f"Range: {stats['range']:.2f}", showarrow=False)

In [None]:
def visualize(col):
    """Show a box plot of ``ds[col]`` next to one KDE curve per label.

    Reads the notebook-level frame ``ds``. The left panel is a box plot of the
    whole column; the right panel draws one density curve for every distinct
    value of ``ds["label"]``.

    Args:
        col: name of a numeric column of ``ds``.
    """
    print()
    plt.figure(figsize=(10,6))
    plt.subplot(1,2,1)
    sns.boxplot(y=ds[col])
    plt.ylabel(col, labelpad=12.5)

    plt.subplot(1,2,2)
    # Draw one curve per label that is actually present and attach the label
    # to its own curve. Hard-coded label strings silently plot nothing when
    # the labels are named differently, and a legend built from
    # ds["label"].unique() is not guaranteed to follow the plotting order.
    for label in ds["label"].unique():
        sns.kdeplot(ds.loc[ds["label"] == label, col], label=label)
    plt.legend()
    plt.xlabel('')
    plt.ylabel('')

    plt.show()

In [None]:
for col in ds.columns[2:]:
    visualize(col)

In [None]:
# function to clean raw text
def preprocess_text(text):
    """Lower-case ``text``, strip non-alphanumerics and drop English stop words.

    Args:
        text: the raw string.

    Returns:
        str: the remaining words joined by single spaces.
    """
    text = text.lower()
    text = re.sub("[^0-9a-zA-Z]+", " ", text)
    # Fetch the stop-word list once per call and keep it in a set: calling
    # stopwords.words() inside the comprehension re-evaluated it for every
    # single word and tested membership against a list.
    stop_words = set(stopwords.words("english"))
    return " ".join(word for word in text.split() if word not in stop_words)

def preprocess_text_remove_nums(text):
    """Like a stop-word-removing text cleaner that additionally removes digits.

    The text is lower-cased, runs of non-alphanumeric characters and runs of
    digits are replaced by a space, and English stop words are dropped.

    Args:
        text: the raw string.

    Returns:
        str: the remaining words joined by single spaces.
    """
    text = text.lower()
    text = re.sub("[^0-9a-zA-Z]+", " ", text)
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    text = re.sub(r"\d+", " ", text)
    # Fetch the stop-word list once per call and keep it in a set instead of
    # re-evaluating stopwords.words() for every word.
    stop_words = set(stopwords.words("english"))
    return " ".join(word for word in text.split() if word not in stop_words)

In [None]:
# Store, for every row, the list of cleaned (stop-word-free) words of its answer.
# NOTE(review): the `ds` frame assembled earlier in this notebook has a "text"
# column, not "answer" — confirm which frame this cell is meant to run on.
ds["answer_word_list"] = ds["answer"].apply(lambda x:preprocess_text(x).split())

In [None]:
# Concatenate the word lists of all rows into one corpus per source: rows
# labelled "human-generated" go to human_corpus, every other row to gpt_corpus.
human_corpus, gpt_corpus = [], []
for row in trange(ds.shape[0], ncols=150, nrows=10, colour='green', smoothing=0.8):
    target_corpus = human_corpus if ds["label"][row] == "human-generated" else gpt_corpus
    target_corpus.extend(ds["answer_word_list"][row])

In [None]:
mostCommon_human = Counter(human_corpus).most_common(10)
mostCommon_gpt = Counter(gpt_corpus).most_common(10)

### Most Frequent Word

In [None]:
# Horizontal bar chart of the ten most common words of the human answers ...
words = [word for word, _ in mostCommon_human]
freq = [count for _, count in mostCommon_human]

sns.barplot(x=freq, y=words)
plt.title("Top 10 Most Frequently Occuring Words in Human-Generated Answers")
plt.show()

# ... and the same chart for the GPT answers.
words = [word for word, _ in mostCommon_gpt]
freq = [count for _, count in mostCommon_gpt]

sns.barplot(x=freq, y=words)
plt.title("Top 10 Most Frequently Occuring Words in GPT-Generated Answers")
plt.show()

### Unique Words

In [None]:
human_set = set(human_corpus)
gpt_set = set(gpt_corpus)

venn2([gpt_set,human_set],('gpt','human'))
venn2_circles(subsets=[gpt_set,human_set],linewidth=1,color='k')
plt.show()

In [None]:
len(set(human_corpus+gpt_corpus))

### Bigrams

In [None]:
# Count every bigram of the cleaned human answers and plot the ten most frequent.
cv = CountVectorizer(ngram_range=(2,2))
human_answers = ds.loc[ds["label"]=="human-generated", "answer"]
bigrams = cv.fit_transform(human_answers.apply(preprocess_text))
# Column sums of the document-term matrix = corpus-wide frequency per bigram.
count_values = np.asarray(bigrams.sum(axis=0))[0]
ranked = sorted(
    ((count_values[column], ngram) for ngram, column in cv.vocabulary_.items()),
    reverse=True,
)
ngram_freq = pd.DataFrame(ranked, columns=["frequency", "n_gram"])
top_ngrams = ngram_freq.head(10)
sns.barplot(x=top_ngrams["frequency"], y=top_ngrams["n_gram"])
plt.title("Top 10 Most Frequently Occuring Bigrams Human Generated Answers")
plt.show()

In [None]:
# Count every bigram of the cleaned GPT-3.5 answers and plot the ten most frequent.
cv = CountVectorizer(ngram_range=(2,2))
gpt_answers = ds.loc[ds["label"]=="gpt35-generated", "answer"]
bigrams = cv.fit_transform(gpt_answers.apply(preprocess_text))
# Column sums of the document-term matrix = corpus-wide frequency per bigram.
count_values = np.asarray(bigrams.sum(axis=0))[0]
ranked = sorted(
    ((count_values[column], ngram) for ngram, column in cv.vocabulary_.items()),
    reverse=True,
)
ngram_freq = pd.DataFrame(ranked, columns=["frequency", "n_gram"])
top_ngrams = ngram_freq.head(10)
sns.barplot(x=top_ngrams["frequency"], y=top_ngrams["n_gram"])
plt.title("Top 10 Most Frequently Occuring Bigrams GPT-35 Generated Answer")
plt.show()

### Trigrams

In [None]:
# Count every trigram of the cleaned human answers and plot the ten most frequent.
cv = CountVectorizer(ngram_range=(3,3))
human_answers = ds.loc[ds["label"]=="human-generated", "answer"]
trigrams = cv.fit_transform(human_answers.apply(preprocess_text))
# Column sums of the document-term matrix = corpus-wide frequency per trigram.
count_values = np.asarray(trigrams.sum(axis=0))[0]
ranked = sorted(
    ((count_values[column], ngram) for ngram, column in cv.vocabulary_.items()),
    reverse=True,
)
ngram_freq = pd.DataFrame(ranked, columns=["frequency", "n_gram"])
top_ngrams = ngram_freq.head(10)
sns.barplot(x=top_ngrams["frequency"], y=top_ngrams["n_gram"])
plt.title("Top 10 Most Frequently Occuring Trigrams on Human Generated Answer")
plt.show()

In [None]:
# Count every trigram of the cleaned GPT-3.5 answers and plot the ten most frequent.
cv = CountVectorizer(ngram_range=(3,3))
gpt_answers = ds.loc[ds["label"]=="gpt35-generated", "answer"]
trigrams = cv.fit_transform(gpt_answers.apply(preprocess_text))
# Column sums of the document-term matrix = corpus-wide frequency per trigram.
count_values = np.asarray(trigrams.sum(axis=0))[0]
ranked = sorted(
    ((count_values[column], ngram) for ngram, column in cv.vocabulary_.items()),
    reverse=True,
)
ngram_freq = pd.DataFrame(ranked, columns=["frequency", "n_gram"])
top_ngrams = ngram_freq.head(10)
sns.barplot(x=top_ngrams["frequency"], y=top_ngrams["n_gram"])
plt.title("Top 10 Most Frequently Occuring Trigrams on GPT-35 Generated Answer")
plt.show()

### Modeling

### Traditional ML 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import random
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import LabelEncoder

In [None]:
# Cleaned answer texts and their labels as plain numpy arrays.
X = ds["answer"].apply(preprocess_text).values
Y = ds["label"].values

# One stratified 80/20 train/test split (n_splits=1 yields a single pair of
# index arrays).
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(sss.split(X, Y))
X_train, X_test = X[train_index], X[test_index]
Y_train, Y_test = Y[train_index], Y[test_index]

# TF-IDF features over 1- to 5-grams, capped at 5000 features; the vocabulary
# is learned on the training split only.
vectorizer = TfidfVectorizer(
    strip_accents='ascii',
    stop_words='english',
    max_features=5000,
    ngram_range=(1,5),
)
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)

# Integer-encode the labels, fitting the encoder on the training labels.
label_encoder = LabelEncoder()
Y_train_encoded = label_encoder.fit_transform(Y_train)
Y_test_encoded = label_encoder.transform(Y_test)

### Support Vector Machine

In [None]:
# RBF-kernel SVM using the hyperparameters recorded below from an earlier grid
# search; probability=True enables predict_proba, which the LIME cell needs.
svc = SVC(probability=True,C=10,gamma=0.1,kernel="rbf")

# Hyperparameter grid (kept for reference, the search itself is disabled)
# param_grid = {"C": [0.1, 1, 10], "kernel": ["rbf","sigmoid","polynomial"], "gamma":[0.1,1,10]}
# Best Parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
# Accuracy: 0.9548872180451128

# Grid search
# grid_search = GridSearchCV(svc, param_grid, cv=5, verbose=2)

# Fit on the TF-IDF training features and predict the held-out test split
model = svc.fit(X_train_tf, Y_train_encoded)
# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_
Y_pred = model.predict(X_test_tf)

accuracy = accuracy_score(Y_test_encoded, Y_pred)

# print("Best Parameters:", best_params)
# Report accuracy, per-class precision/recall/F1 and the confusion matrix
print("Accuracy:", accuracy) 
print(classification_report(Y_test_encoded,Y_pred))
print(confusion_matrix(Y_test_encoded,Y_pred))


In [None]:
c = make_pipeline(vectorizer, model)

ls_X_test= list(X_test)

class_names = {0: "gpt35-generated", 1:"human-generated"}

In [None]:
import lime
from lime.lime_text import LimeTextExplainer

# Create the LIME explainer.
# LimeTextExplainer expects the class names as a list ordered by class index.
# `class_names` is a dict keyed by the encoded label, and iterating a dict
# yields its integer keys, so build the ordered list of names explicitly.
LIME_explainer = LimeTextExplainer(
    class_names=[class_names[key] for key in sorted(class_names)]
)

# Explain one fixed test example (position 100 of the test split)
idx = 100
# explain the chosen prediction
LIME_exp = LIME_explainer.explain_instance(ls_X_test[idx], c.predict_proba)
# print results
print('Philosophy answer:', ls_X_test[idx])
print('Probability human-generated =', c.predict_proba([ls_X_test[idx]]).round(3)[0,1])
print('True class: %s' % class_names.get(list(Y_test_encoded)[idx]))

In [None]:
# show the explainability results with highlighted text
LIME_exp.save_to_file('SVC.html')

### Random Forest

In [None]:
# Random forest tuned with a 5-fold grid search.
# The forest needs numeric features, so it is trained and evaluated on the
# TF-IDF matrices (X_train_tf / X_test_tf) rather than on the raw text arrays
# X_train / X_test. A fixed random_state keeps the result reproducible.
rfc = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train_tf, Y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test_tf)

accuracy = accuracy_score(Y_test, Y_pred)

# Report the winning parameters and the test-set metrics
print("Best Parameters:", best_params)
print("Accuracy:", accuracy) 
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

### Gradient Boosting

In [None]:
# Gradient boosting tuned with a 5-fold grid search.
# The model needs numeric features, so it is trained and evaluated on the
# TF-IDF matrices (X_train_tf / X_test_tf) rather than on the raw text arrays
# X_train / X_test. A fixed random_state keeps the result reproducible.
gb = GradientBoostingClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
}

grid_search = GridSearchCV(gb, param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train_tf, Y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test_tf)

accuracy = accuracy_score(Y_test, Y_pred)

# Report the winning parameters and the test-set metrics
print("Best Parameters:", best_params)
print("Accuracy:", accuracy) 
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

### Deep Learning

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import keras_tuner
from keras.utils.vis_utils import plot_model

In [None]:
# Enable on-demand GPU memory allocation when a GPU is present.
# Look the device list up first: indexing [0] on an empty list raises
# IndexError, which the RuntimeError handler would not catch.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print("GPU is available.")
    except RuntimeError as err:
        # The GPU exists but the setting could not be applied.
        print("GPU is available, but memory growth could not be enabled:", err)
else:
    print("GPU is not available.")

print(gpus)

In [None]:
# Cleaned answer texts and labels as numpy arrays.
X = np.array(ds["answer"].apply(preprocess_text).values)
Y = np.array(ds["label"].values)

# First split: 70 % training data, 30 % held out.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, test_index = next(sss.split(X, Y))
X_train, X_test = X[train_index], X[test_index]
Y_train, Y_test = Y[train_index], Y[test_index]

# Second split: halve the held-out part into validation and test sets.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_index, test_index = next(sss.split(X_test, Y_test))
X_val, X_test = X_test[val_index], X_test[test_index]
Y_val, Y_test = Y_test[val_index], Y_test[test_index]

# Integer-encode the labels (encoder fitted on the training labels) and turn
# each label vector into a column vector of shape (n_samples, 1).
label_encoder = LabelEncoder()
label_encoder.fit(Y_train)

Y_train = label_encoder.transform(Y_train).reshape(-1, 1)
Y_val = label_encoder.transform(Y_val).reshape(-1, 1)
Y_test = label_encoder.transform(Y_test).reshape(-1, 1)

In [None]:
# Tokenisation settings shared by all Keras models below.
vocab_size = 10000      # number of most frequent words kept by the tokenizer
embedding_dim = 16      # size of the learned word vectors
max_length = 500        # every sequence is padded / truncated to this length
trunc_type = "post"     # drop tokens from the END of over-long sequences
oov_tok = "<OOV>"       # placeholder for words outside the vocabulary

# Fit the vocabulary on the training texts only.
tokenizer = Tokenizer(num_words=vocab_size,oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

sequences = tokenizer.texts_to_sequences(X_train)
validation_sequences = tokenizer.texts_to_sequences(X_val)  
testing_sequences = tokenizer.texts_to_sequences(X_test)

# Apply the SAME truncation side to all three splits. Without the explicit
# argument pad_sequences truncates from the front, so validation/test texts
# would keep their last 500 tokens while training texts keep their first 500.
training_set = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
validation_set = pad_sequences(validation_sequences, maxlen=max_length, truncating=trunc_type)
testing_set = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

In [None]:
callback = keras.callbacks.EarlyStopping(monitor="val_loss",patience=10)

In [None]:
# Baseline: embedding layer followed by a small fully connected network with
# a single sigmoid output for binary classification.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)
# Train for up to 100 epochs with the `callback` defined in the previous cell.
history = model.fit(
    training_set,
    Y_train,
    epochs=100,
    validation_data=(validation_set, Y_val),
    batch_size=64,
    callbacks=[callback],
)

In [None]:
plot_model(model,show_shapes=True, show_layer_names=True)

In [None]:
# Per-epoch metrics recorded by model.fit.
recorded = history.history
acc, val_acc = recorded['accuracy'], recorded['val_accuracy']
loss, val_loss = recorded['loss'], recorded['val_loss']

# One x position per trained epoch.
epochs = range(len(acc))

# Training vs. validation accuracy
plt.plot(epochs, acc, 'r', label="Training Accuracy")
plt.plot(epochs, val_acc, 'b', label="Validation Accuracy")
plt.title('Training and validation accuracy')
plt.legend()
plt.show()
print("")

# Training vs. validation loss
plt.plot(epochs, loss, 'r', label="Training Loss")
plt.plot(epochs, val_loss, 'b', label="Validation Loss")
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
# Sigmoid outputs for the test set, turned into hard 0/1 predictions with a
# 0.5 decision threshold.
test_probabilities = model.predict(testing_set)
Y_pred = (test_probabilities > 0.5).astype(int)

# Per-class precision / recall / F1, followed by the confusion matrix.
report = classification_report(Y_test, Y_pred, target_names=label_encoder.classes_)
print(report)
print(confusion_matrix(Y_test,Y_pred))

### Hyperparameter Tuning with keras_tuner

In [None]:
def build_model(hp):
    """Build and compile a tunable binary text classifier for keras_tuner.

    The network is: embedding -> one feature extractor (Conv1D + pooling,
    stacked LSTM, or stacked bidirectional LSTM) -> one or two dense layers
    with optional dropout -> a single sigmoid unit.

    Every hyperparameter has its own name. keras_tuner returns the already
    stored value when a name is requested a second time, so reusing
    "layer_type", "num_layers" and "units" for different decisions meant the
    later requests never got a value of their own.

    Reads the notebook-level ``vocab_size``, ``embedding_dim`` and
    ``max_length``.

    Args:
        hp: the keras_tuner ``HyperParameters`` object of the current trial.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length))

    layer_type = hp.Choice("layer_type", ["conv1d", "lstm", "bilstm"])
    if layer_type == "conv1d":
        model.add(tf.keras.layers.Conv1D(filters=hp.Int("filters",min_value=32,max_value=128,step=32),kernel_size=5,activation="relu"))
        # The convolution output is still a sequence; reduce it to one vector
        # per sample before the dense head.
        pooling_type = hp.Choice("pooling_type", ["flatten", "global_avg_pooling", "max_pooling"])
        if pooling_type == "flatten":
            model.add(tf.keras.layers.Flatten())
        elif pooling_type == "global_avg_pooling":
            model.add(tf.keras.layers.GlobalAveragePooling1D())
        else:
            model.add(tf.keras.layers.MaxPooling1D(pool_size=4))
            # MaxPooling1D only shortens the time axis, so flatten afterwards.
            model.add(tf.keras.layers.Flatten())
    else:
        rnn_layers = hp.Int("rnn_layers", min_value=1, max_value=2, step=1)
        rnn_units = hp.Int("rnn_units", min_value=32, max_value=64, step=32)
        for i in range(rnn_layers):
            # Every recurrent layer except the last must return the full
            # sequence so the next recurrent layer can consume it.
            lstm = tf.keras.layers.LSTM(rnn_units, return_sequences=i < rnn_layers - 1)
            if layer_type == "bilstm":
                model.add(tf.keras.layers.Bidirectional(lstm))
            else:
                model.add(lstm)

    dense_layers = hp.Int("dense_layers", min_value=1, max_value=2, step=1)
    dense_units = hp.Int("dense_units", min_value=32, max_value=128, step=32)
    for _ in range(dense_layers):
        model.add(tf.keras.layers.Dense(dense_units, activation="relu"))
        if hp.Boolean('use_dropout'):
            model.add(tf.keras.layers.Dropout(hp.Float('dropout_rate', min_value=0.1, max_value=0.2, step=0.1)))

    model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(),optimizer=tf.keras.optimizers.Adam(learning_rate=hp.Choice('learning_rate',[0.001, 0.005, 0.01])), metrics=["accuracy"])

    return model


In [None]:
# Hyperband tuner over build_model, optimising validation accuracy; its state
# is kept under hyperparameter_tuning/<project_name>.
tuner = keras_tuner.Hyperband(build_model,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='hyperparameter_tuning',
                     project_name='Sequence Model Hyperparameter Tuning')

# The search itself is disabled here.
# tuner.search(training_set, Y_train, epochs=100, validation_data=(validation_set, Y_val), batch_size=32, callbacks=[callback])

# Get the optimal hyperparameters
# NOTE(review): with the search commented out this presumably relies on trial
# results already stored in the tuner directory — confirm they exist, otherwise
# indexing [0] fails on an empty result list.
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

In [None]:
tuner.search_space_summary()  
# tuner.results_summary()

In [None]:
# Build a model from the best hyperparameters and draw its architecture.
# NOTE(review): the fit call below is commented out, so unless `hypermodel` is
# trained elsewhere it is used with freshly initialised weights — confirm
# before interpreting the predictions of the next cell.
hypermodel = tuner.hypermodel.build(best_hps)
plot_model(hypermodel,show_shapes=True, show_layer_names=True)
# hypermodel.fit(training_set, Y_train, epochs=100, validation_data=(validation_set, Y_val), batch_size=64, callbacks=[callback])

In [None]:
# Predict the sigmoid outputs of the tuned model for the testing set
Y_pred = hypermodel.predict(testing_set)
# Turn the outputs into hard 0/1 predictions with a 0.5 decision threshold
Y_pred = (Y_pred > 0.5).astype(int)  # Assuming a binary classification with a threshold of 0.5

# Calculate the classification report (per-class precision / recall / F1)
report = classification_report(Y_test, Y_pred, target_names=label_encoder.classes_)
print(report)
# Rows are true classes, columns are predicted classes
print(confusion_matrix(Y_test,Y_pred))

### Transformer Model with XLNet

In [None]:
from transformers import XLNetTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import XLNetForSequenceClassification, pipeline, TextClassificationPipeline
from transformers import AdamW, get_scheduler

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the device name when a CUDA device exists; get_device_name(0)
# raises on a machine without one.
torch.cuda.get_device_name(0) if n_gpu > 0 else "CPU only (no CUDA device found)"

In [None]:
ds.info()

In [None]:
# Clean the answer text and binarise the label: 1 for "human-generated",
# 0 for every other value.
# NOTE(review): both assignments overwrite their input column, so running this
# cell a second time maps every (already numeric) label to 0.
ds["answer"] = ds["answer"].apply(preprocess_text)
ds["label"] = ds["label"].apply(lambda x: 1 if x=="human-generated" else 0)

In [None]:
# Append XLNet's separator and classification markers to every answer.
# XLNet's special tokens are written "<sep>" and "<cls>"; the BERT-style
# "[SEP] [CLS]" strings are not special tokens for this tokenizer and would
# be split into ordinary word pieces.
# NOTE(review): sequences are later cut to MAX_LEN from the end, which removes
# these trailing markers from long answers — confirm this is acceptable.
sentences = ds.answer.values
sentences = [sentence + " <sep> <cls>" for sentence in sentences]
label = ds.label.values

In [None]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet/xlnet-base-cased')
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

In [None]:
# Fixed sequence length fed to XLNet.
MAX_LEN = 128

# Map tokens to vocabulary ids, then pad / truncate at the end of each
# sequence so that every row has exactly MAX_LEN ids.
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
)

# Attention masks: 1.0 where the id is greater than 0, 0.0 elsewhere
# (the padding value written by pad_sequences is 0).
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

In [None]:
# Make sure everything is a numpy array so it can be indexed with index arrays.
input_ids = np.array(input_ids)
label = np.array(label)
attention_masks = np.array(attention_masks)

# First split: 70 % training data, 30 % held out.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, test_index = next(sss.split(input_ids, label))
train_inputs, validation_inputs = input_ids[train_index], input_ids[test_index]
train_labels, validation_labels = label[train_index], label[test_index]
train_masks, validation_masks = attention_masks[train_index], attention_masks[test_index]

# Second split: halve the held-out part into validation and test sets.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
validation_index, test_index = next(sss.split(validation_inputs, validation_labels))
validation_inputs, test_inputs = validation_inputs[validation_index], validation_inputs[test_index]
validation_labels, test_labels = validation_labels[validation_index], validation_labels[test_index]
validation_masks, test_masks = validation_masks[validation_index], validation_masks[test_index]


In [None]:
# Convert every split into torch tensors, the datatype the model consumes.
train_inputs, validation_inputs, test_inputs = (
    torch.tensor(split) for split in (train_inputs, validation_inputs, test_inputs)
)
train_labels, validation_labels, test_labels = (
    torch.tensor(split) for split in (train_labels, validation_labels, test_labels)
)
train_masks, validation_masks, test_masks = (
    torch.tensor(split) for split in (train_masks, validation_masks, test_masks)
)

batch_size = 32

# Wrap each split in a DataLoader that yields (input_ids, mask, labels)
# batches. Training batches are drawn in random order; validation and test
# batches keep their original order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


In [None]:
# Load XLNetForSequenceClassification: the pretrained XLNet model with a
# classification head for two labels.
model = XLNetForSequenceClassification.from_pretrained("xlnet/xlnet-base-cased", num_labels=2)
# Move the model to the device selected earlier. A hard-coded .cuda() fails on
# machines without a GPU, while the batches are already moved with .to(device).
model.to(device)

In [None]:
# Split the parameters into two groups: regular weights get weight decay,
# biases and layer-normalisation parameters do not.
param_optimizer = list(model.named_parameters())
# 'gamma' / 'beta' are kept from the original list.
# NOTE(review): Hugging Face models presumably name their LayerNorm weights
# "...layer_norm.weight" / "...LayerNorm.weight" — confirm against
# model.named_parameters().
no_decay = ['bias', 'gamma', 'beta', 'layer_norm.weight', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    # The optimizer reads the per-group option 'weight_decay'. The key
    # 'weight_decay_rate' is not read, so no group-specific decay was applied.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# The optimizer holds the learning rate and epsilon used by the training loop
optimizer = AdamW(optimizer_grouped_parameters,
                     lr=2e-5,eps=1e-6)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D numpy array of class scores, one row per example.
        labels: numpy array with the true class index of every example.

    Returns:
        Accuracy as a float between 0 and 1.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [None]:
# Per-step training loss, kept for plotting
train_loss_set = []

# Number of passes over the training data
epochs = 4

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  # ---- Training phase ----
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  
  # Tracking variables: summed loss, examples seen and steps taken this epoch
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Move every tensor of the batch to the selected device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader (ids, attention mask, labels)
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass; labels are passed in, and the first output is used as the
    # loss and the second as the logits
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs[0]
    logits = outputs[1]
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  # Mean loss per training step of this epoch
  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
  # ---- Validation phase ----

  # Put model in evaluation mode to evaluate accuracy on the validation set
  model.eval()

  # Tracking variables 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Move every tensor of the batch to the selected device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass without labels; the first output is used as the logits
      output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      logits = output[0]
    
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Accuracy of this batch
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  # Mean of the per-batch accuracies (batches are weighted equally)
  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

In [None]:
# Prediction on test set: collect the logits and true labels of every batch

# Put model in evaluation mode
model.eval()

# Tracking variables: one array of logits / labels per batch
predictions , true_labels = [], []

# Predict 
for batch in test_dataloader:
  # Move every tensor of the batch to the selected device
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader (ids, attention mask, labels)
  b_input_ids, b_input_mask, b_labels = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up prediction
  with torch.no_grad():
    # Forward pass without labels; the first output is used as the logits
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Flatten the per-batch logits into one list of rows and take the
# highest-scoring class of every row as the prediction.
flat_predictions = []
for batch_logits in predictions:
    flat_predictions.extend(batch_logits)
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Flatten the per-batch label arrays the same way.
flat_true_labels = []
for batch_labels in true_labels:
    flat_true_labels.extend(batch_labels)

# Calculate metrics
accuracy = accuracy_score(flat_true_labels, flat_predictions)
precision = precision_score(flat_true_labels, flat_predictions)
recall = recall_score(flat_true_labels, flat_predictions)
f1 = f1_score(flat_true_labels, flat_predictions)
conf_matrix = confusion_matrix(flat_true_labels, flat_predictions)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value}")
print(f"Confusion Matrix:\n{conf_matrix}")

In [None]:
torch.save(model, "xlnet_model_v1 (MAX_LEN=128,batch_size=32,epochs=4).pth")

##### Run Test with Trained XLNet Model

In [None]:
# Prediction on test set

# Load the fully pickled model saved above.
# SECURITY: torch.load un-pickles arbitrary Python objects — only load
# checkpoint files you created yourself.
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved from a GPU can also be loaded on a CPU-only machine.
# NOTE(review): recent PyTorch releases presumably default to
# weights_only=True, which rejects fully pickled models — confirm against the
# installed version and pass weights_only=False if required.
base_model = torch.load("./xlnet_model_v1 (MAX_LEN=128,batch_size=32,epochs=4).pth", map_location=device)
# The batches are moved to `device` below, so the model must live there too.
base_model.to(device)

# Put model in evaluation mode
base_model.eval()

# Tracking variables: one array of logits / labels per batch
predictions , true_labels = [], []

# Predict with the reloaded model
for batch in test_dataloader:
  # Move every tensor of the batch to the selected device
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader (ids, attention mask, labels)
  b_input_ids, b_input_mask, b_labels = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up prediction
  with torch.no_grad():
    # Forward pass without labels; the first output is used as the logits
    outputs = base_model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

# Flatten the per-batch logits into one list of rows and take the
# highest-scoring class of every row as the prediction.
flat_predictions = []
for batch_logits in predictions:
    flat_predictions.extend(batch_logits)
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Flatten the per-batch label arrays the same way.
flat_true_labels = []
for batch_labels in true_labels:
    flat_true_labels.extend(batch_labels)

# Calculate metrics
accuracy = accuracy_score(flat_true_labels, flat_predictions)
precision = precision_score(flat_true_labels, flat_predictions)
recall = recall_score(flat_true_labels, flat_predictions)
f1 = f1_score(flat_true_labels, flat_predictions)
conf_matrix = confusion_matrix(flat_true_labels, flat_predictions)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value}")
print(f"Confusion Matrix:\n{conf_matrix}")


In [None]:
# import shap

# classifier = TextClassificationPipeline(model=base_model,tokenizer=tokenizer,return_all_scores=True)
# explainer = shap.Explainer(classifier)
# shap_values = explainer([sentences[1]])
# shap.plots.text(shap_values[:,:,0])