In [None]:
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1" 
import sys
import tensorflow as tf
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import random
from tensorflow.keras import backend as K
import gc

# Print the CUDA/cuDNN versions recorded in TensorFlow's build info and the
# GPU devices TensorFlow can currently see.
info = tf.sysconfig.get_build_info()
cuda_version = info.get("cuda_version")
cudnn_version = info.get("cudnn_version")

print("Built against CUDA:", cuda_version)
print("Built against cuDNN:", cudnn_version)
print("GPUs found: ", tf.config.list_physical_devices("GPU"))

# Request deterministic op implementations; seeds are set in the next cell.
tf.config.experimental.enable_op_determinism()


In [None]:
# One seed shared by every random number generator used in this notebook.
GLOBAL_SEED = 42

# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes -- confirm whether
# hash determinism of this kernel matters for the experiment.
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)

# Seed the Python, NumPy and TensorFlow generators (independent of each other).
random.seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
tf.random.set_seed(GLOBAL_SEED)


In [None]:
# Log the interpreter and TensorFlow versions alongside the results.
python_version = sys.version.split()[0]
print("Python:", python_version)
print("TF:", tf.__version__)

In [None]:
# Load the Amazon reviews CSV (path is relative to the notebook's directory).
file_path = '../Experiment/Datasets/Generated Fake Amazon Reviews Dataset.csv'
df = pd.read_csv(file_path)

# Quick look at the first rows and the available column names.
print(df.head())
print(list(df.columns))

In [None]:
    # NOTE(review): this whole cell is indented one level. IPython appears to
    # tolerate that (it strips a cell's leading indentation), but a plain-script
    # export would raise IndentationError -- consider dedenting the cell.

    # Stratified 80/20 split of the Amazon reviews on the 'label' column.
    df_amazon_phase1_data, df_amazon_test_data = train_test_split(
        df,
        test_size=0.2,
        random_state=GLOBAL_SEED,
        stratify=df['label'],
    )

    # Training portion: review text, raw string labels, and the text as a list
    # (the tokenizer cell consumes the list form).
    X_train_amazon_text = df_amazon_phase1_data['text_']
    y_train_amazon_raw = df_amazon_phase1_data['label']
    texts_train_amazon = X_train_amazon_text.tolist()

    # Held-out portion, used only for evaluation.
    X_amazon_test_raw = df_amazon_test_data['text_']
    y_amazon_test_raw = df_amazon_test_data['label']

In [None]:
# Load the Yelp reviews CSV and split it 80/20, stratified on 'LABEL'.
yelp_file_path = '../Experiment/Datasets/Mixed Yelp Dataset.csv'
df_yelp = pd.read_csv(yelp_file_path)

df_yelp_phase2_data, df_yelp_test_data = train_test_split(
    df_yelp,
    test_size=0.2,
    random_state=GLOBAL_SEED,
    stratify=df_yelp['LABEL'],
)

# Yelp labels are numeric (-1 / 1); rename them to the 'CG' / 'OR' strings so
# both datasets share one label vocabulary.
yelp_label_names = {-1: 'CG', 1: 'OR'}

# Training portion.
X_train_yelp_text = df_yelp_phase2_data['REVIEW_TEXT']
y_train_yelp_raw = df_yelp_phase2_data['LABEL'].replace(yelp_label_names)
texts_train_yelp = X_train_yelp_text.tolist()

# Held-out portion, used only for evaluation.
X_yelp_test_raw = df_yelp_test_data['REVIEW_TEXT']
y_yelp_test_raw = df_yelp_test_data['LABEL'].replace(yelp_label_names)

In [None]:
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification

# Checkpoint shared by the tokenizer and the classifier.
MODEL_NAME = 'FacebookAI/roberta-base'
# Two output classes (the label map further down uses 'CG' and 'OR').
num_labels = 2

tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
model = TFRobertaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)

# Print the layer/parameter overview of the freshly loaded model.
model.summary()

In [None]:
# String label -> integer class id used by the classifier.
label_map = {'CG': 0, 'OR': 1}


def _encode_labels(raw_labels, mapping, split_name):
    """Convert a Series of string labels into an int64 NumPy array.

    ``Series.map`` silently turns any label missing from ``mapping`` into NaN,
    which yields a float array that only fails (or trains on garbage) much
    later. This helper fails immediately instead.

    Args:
        raw_labels: pandas Series of raw labels.
        mapping: dict from raw label to integer class id.
        split_name: human-readable name of the split, used in the error text.

    Returns:
        1-D ``numpy.ndarray`` of dtype int64 with one class id per row.

    Raises:
        ValueError: if any label (including a missing value) is not a key of
            ``mapping``.
    """
    encoded = raw_labels.map(mapping)
    unmapped_mask = encoded.isna()
    if unmapped_mask.any():
        unknown = sorted(raw_labels[unmapped_mask].astype(str).unique())
        raise ValueError(
            f"{split_name}: {int(unmapped_mask.sum())} label(s) not in label map: {unknown}"
        )
    return encoded.astype(np.int64).values


y_train_amazon_int = _encode_labels(y_train_amazon_raw, label_map, "Amazon train")
y_amazon_test_int = _encode_labels(y_amazon_test_raw, label_map, "Amazon test")

y_train_yelp_int = _encode_labels(y_train_yelp_raw, label_map, "Yelp train")
y_yelp_test_int = _encode_labels(y_yelp_test_raw, label_map, "Yelp test")

In [None]:
# Upper bound on tokens per review; longer inputs are truncated.
max_length = 256

# Tokenize the Amazon training texts in one call and return TensorFlow tensors.
train_encodings_amazon = tokenizer(
    texts_train_amazon,
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors="tf",
)
# Disabled: tokenization of the Yelp training texts (kept for reference).
# train_encodings_yelp = tokenizer(texts_train_yelp, truncation=True, padding=True, max_length=max_length, return_tensors="tf") # 

In [None]:
# Hyperparameter grid search

# learning_rates = [1e-5, 3e-5, 5e-5]
# epochs_options = [1, 2, 3]
# batch_size_options = [8, 16, 32] 

# all_run_results = [] 
# results_csv_path = './roberta_grid_search_results_simplified.csv' 


# if os.path.exists(results_csv_path):
#     print(f"Appending to existing results file: {results_csv_path}")

# for current_batch_size in batch_size_options:
#     for current_epochs in epochs_options:
#         for current_lr in learning_rates:
#             print(f"\n--- Training: BS={current_batch_size}, Epochs={current_epochs}, LR={current_lr} ---")
            
#             K.clear_session() 
#             gc.collect()

#             train_dataset = tf.data.Dataset.from_tensor_slices((
#                 dict(train_encodings), y_train_int
#             )).shuffle(buffer_size=len(texts_train), seed=GLOBAL_SEED).batch(current_batch_size).prefetch(tf.data.AUTOTUNE)
            
#             val_dataset = tf.data.Dataset.from_tensor_slices((
#                 dict(val_encodings), y_val_int
#             )).batch(current_batch_size).prefetch(tf.data.AUTOTUNE)

#             model = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
#             optimizer = tf.keras.optimizers.AdamW(learning_rate=current_lr)
#             loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#             metrics_list = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')] 
#             model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics_list)
            
#             history = model.fit(
#                 train_dataset,
#                 validation_data=val_dataset,
#                 epochs=current_epochs,
#                 verbose=1 
#             )
            
#             final_train_accuracy = history.history['accuracy'][-1] if 'accuracy' in history.history and history.history['accuracy'] else None
#             final_val_accuracy = history.history['val_accuracy'][-1] if 'val_accuracy' in history.history and history.history['val_accuracy'] else None
            
#             run_result = {
#                 'batch_size': current_batch_size, 
#                 'epochs': current_epochs, 
#                 'learning_rate': current_lr,
#                 'train_accuracy': final_train_accuracy,
#                 'val_accuracy': final_val_accuracy,
#             }
#             all_run_results.append(run_result)
            
#             df_current_run = pd.DataFrame([run_result])
#             if not os.path.exists(results_csv_path) or os.path.getsize(results_csv_path) == 0:
#                 df_current_run.to_csv(results_csv_path, index=False, header=True)
#             else:
#                 df_current_run.to_csv(results_csv_path, index=False, header=False, mode='a')
#             print(f"  Results appended to {results_csv_path}")

#             del model, optimizer, history 
#             del train_dataset, val_dataset

In [None]:
# Hyperparameters used for the final training run.
BEST_LR = 5e-5
BEST_EPOCHS = 3
BEST_BATCH_SIZE = 16

# Pair the tokenized Amazon reviews with their integer labels, then shuffle
# over the full training set (seeded), batch and prefetch.
train_dataset_amazon = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings_amazon), y_train_amazon_int)
)
train_dataset_amazon = (
    train_dataset_amazon
    .shuffle(len(texts_train_amazon), seed=GLOBAL_SEED)
    .batch(BEST_BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Reset Keras' global state and run a garbage collection before loading a
# fresh copy of the pretrained model.
K.clear_session()
gc.collect()

model = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
optimizer = tf.keras.optimizers.AdamW(learning_rate=BEST_LR)
# The model outputs raw logits, hence from_logits=True.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

model.fit(train_dataset_amazon, epochs=BEST_EPOCHS)

In [None]:
# Evaluate the trained model on the held-out Amazon and Yelp splits together.

# Concatenate texts and integer labels (Amazon first, then Yelp).
X_combined_test_list = X_amazon_test_raw.tolist()
y_combined_test_int_list = y_amazon_test_int.tolist()

X_combined_test_list.extend(X_yelp_test_raw.tolist())
y_combined_test_int_list.extend(y_yelp_test_int.tolist())

y_combined_test_int = np.array(y_combined_test_int_list)

combined_test_encodings = tokenizer(X_combined_test_list, truncation=True, padding=True, max_length=max_length, return_tensors="tf")

combined_test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(combined_test_encodings),
    y_combined_test_int
)).batch(BEST_BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Predicted class = index of the largest logit.
combined_test_predictions_logits = model.predict(combined_test_dataset)
predicted_logits_combined = combined_test_predictions_logits.logits 
y_pred_combined_int = np.argmax(predicted_logits_combined, axis=1)

# Display names for class ids 0 and 1, in that order.
target_names_combined = ['CG', 'OR']

# Per-class metrics. labels=[0, 1] pins both the order and the length of the
# returned arrays, so each array always has exactly two entries.
accuracy_combined = accuracy_score(y_combined_test_int, y_pred_combined_int)
precision_combined = precision_score(y_combined_test_int, y_pred_combined_int, average=None, labels=[0, 1], zero_division=0)
recall_combined = recall_score(y_combined_test_int, y_pred_combined_int, average=None, labels=[0, 1], zero_division=0)
f1_combined = f1_score(y_combined_test_int, y_pred_combined_int, average=None, labels=[0, 1], zero_division=0)

# The former `len(...) == 2` check and its fallback branch were removed: with
# labels=[0, 1] and average=None the fallback could never run.
print(f"\nTikslumas: {accuracy_combined:.4f}")
print(f"Preciziškumas ({target_names_combined[0]}): {precision_combined[0]:.4f}")
print(f"Preciziškumas ({target_names_combined[1]}):    {precision_combined[1]:.4f}")
print(f"Atkūrimas ({target_names_combined[0]}):    {recall_combined[0]:.4f}")
print(f"Atkūrimas ({target_names_combined[1]}):       {recall_combined[1]:.4f}")
print(f"F1-Rezultatas ({target_names_combined[0]}):  {f1_combined[0]:.4f}")
print(f"F1-Rezultatas ({target_names_combined[1]}):     {f1_combined[1]:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_combined = confusion_matrix(y_combined_test_int, y_pred_combined_int, labels=[0, 1])
plt.figure(figsize=(6, 5))
sns.heatmap(cm_combined, annot=True, fmt='d', cmap='Blues', 
            xticklabels=target_names_combined, yticklabels=target_names_combined)
plt.title('Painiavos Matrica - Kombinuotas Testų Rinkinys')
plt.xlabel('Nuspėtos žymės')
plt.ylabel('Tikros žymės')
plt.show()

from sklearn.metrics import classification_report
# labels=[0, 1] keeps the report aligned with target_names even when only one
# class is present in the labels/predictions (it would otherwise raise), and
# matches the other metric calls above.
report_combined = classification_report(
    y_combined_test_int,
    y_pred_combined_int,
    labels=[0, 1],
    target_names=target_names_combined,
    zero_division=0,
)
print(report_combined)

In [None]:
# Evaluate the trained model on a separate Google Play reviews CSV.
new_dataset_path = '../Experiment/Datasets/Google Play App Store For Testing.csv' 
df_new_test = pd.read_csv(new_dataset_path)

text_column_new = 'review' 
label_column_new = 'label'  

X_new_test_text = df_new_test[text_column_new].tolist()
y_new_test_original = df_new_test[label_column_new]

# 'fake' / 'genuine' are mapped to the same class ids the model was trained on
# (0 and 1).
new_label_map = {'fake': 0, 'genuine': 1} 
y_new_test_mapped = y_new_test_original.map(new_label_map)
# Series.map turns unknown labels into NaN; fail here rather than after the
# expensive prediction step.
if y_new_test_mapped.isna().any():
    unknown_labels = sorted(
        y_new_test_original[y_new_test_mapped.isna()].astype(str).unique()
    )
    raise ValueError(f"Labels not present in new_label_map: {unknown_labels}")
y_new_test_int = y_new_test_mapped.astype(np.int64).values

new_test_encodings = tokenizer(X_new_test_text, truncation=True, padding=True, max_length=max_length, return_tensors="tf")

new_tf_test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(new_test_encodings),
    y_new_test_int
)).batch(BEST_BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Predicted class = index of the largest logit.
new_test_predictions_logits = model.predict(new_tf_test_dataset)
new_predicted_logits = new_test_predictions_logits.logits
y_pred_new_test_int = np.argmax(new_predicted_logits, axis=1)

# Display names for class ids 0 and 1, in that order.
target_names_new_test = ['fake', 'genuine'] 

# Per-class metrics; labels=[0, 1] pins the order and length of the arrays.
accuracy_new_test = accuracy_score(y_new_test_int, y_pred_new_test_int)
precision_new_test = precision_score(y_new_test_int, y_pred_new_test_int, average=None, labels=[0, 1], zero_division=0)
recall_new_test = recall_score(y_new_test_int, y_pred_new_test_int, average=None, labels=[0, 1], zero_division=0)
f1_new_test = f1_score(y_new_test_int, y_pred_new_test_int, average=None, labels=[0, 1], zero_division=0)

print(f"\nTikslumas: {accuracy_new_test:.4f}")
print(f"Preciziškumas ({target_names_new_test[0]}): {precision_new_test[0]:.4f}")
print(f"Preciziškumas ({target_names_new_test[1]}):    {precision_new_test[1]:.4f}")
print(f"Atkūrimas ({target_names_new_test[0]}):    {recall_new_test[0]:.4f}")
print(f"Atkūrimas ({target_names_new_test[1]}):       {recall_new_test[1]:.4f}")
print(f"F1-statistikos reikšmė ({target_names_new_test[0]}):  {f1_new_test[0]:.4f}")
print(f"F1-statistikos reikšmė ({target_names_new_test[1]}):     {f1_new_test[1]:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_new_test = confusion_matrix(y_new_test_int, y_pred_new_test_int, labels=[0, 1])
plt.figure(figsize=(6, 5))
sns.heatmap(cm_new_test, annot=True, fmt='d', cmap='Blues', 
            xticklabels=target_names_new_test, yticklabels=target_names_new_test)
plt.title('Painiavos Matrica')
plt.xlabel('Nuspėtos žymės')
plt.ylabel('Tikros žymės')
plt.show()

from sklearn.metrics import classification_report 
# labels=[0, 1] keeps the report aligned with target_names even when only one
# class is present in the labels/predictions (it would otherwise raise), and
# matches the other metric calls above.
report_new_test = classification_report(
    y_new_test_int,
    y_pred_new_test_int,
    labels=[0, 1],
    target_names=target_names_new_test,
    zero_division=0,
)
print(report_new_test)

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D 
import numpy as np

# Coordinates of the 3 x 3 x 3 hyperparameter grid, one entry per combination:
# batch size varies slowest, learning rate fastest.
batch_sizes    = np.repeat([8, 16, 32], 9)
epochs         = np.tile(np.repeat([1, 2, 3], 3), 3)
learning_rates = np.tile([1e-05, 3e-05, 5e-05], 9)

fig = plt.figure(figsize=(8, 6))
ax  = fig.add_subplot(111, projection='3d')

# One marker per grid point.
ax.scatter(
    batch_sizes, epochs, learning_rates,
    marker='o', s=50, color='orange', edgecolors='k',
)

ax.set_xlabel('Partijos dydis', labelpad=10)
ax.set_ylabel('Epochos',      labelpad=10)

# Ticks only at the values that were actually part of the grid.
ax.set_xticks([8, 16, 32])
ax.set_yticks([1, 2, 3])
ax.set_zticks([1e-05, 3e-05, 5e-05])
ax.set_zticklabels(['1e-05', '3e-05', '5e-05'])
ax.tick_params(axis='z', pad=8)

# Leave a strip on the right of the figure, then write the z-axis caption
# there as figure-level text instead of using an axis label.
plt.tight_layout(rect=(0, 0, 0.88, 1))

fig.text(
    0.88, 0.5, 'Mokymosi sparta',
    va='center', ha='center', rotation='vertical', fontsize=10,
)

plt.show()