In [None]:
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1" 
import sys
import tensorflow as tf
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import random
from tensorflow.keras import backend as K
import gc
from sklearn.metrics import classification_report
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification

# Print the CUDA/cuDNN versions reported by this TensorFlow build and the GPUs it can see,
# so the run's environment is recorded in the cell output.
visible_gpus = tf.config.list_physical_devices("GPU")
info = tf.sysconfig.get_build_info()

print("Built against CUDA:", info.get("cuda_version"))
print("Built against cuDNN:", info.get("cudnn_version"))
print("GPUs found: ", visible_gpus)

# Switch TensorFlow into op-determinism mode for the rest of the session.
tf.config.experimental.enable_op_determinism()

In [None]:
# One seed shared by every random number generator used in this notebook.
GLOBAL_SEED = 42

# Seed Python's, NumPy's and TensorFlow's generators with the same value.
random.seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
tf.random.set_seed(GLOBAL_SEED)

# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting it here
# presumably only reaches child processes - confirm whether it has to be exported
# before the kernel is launched to have any effect on this process.
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)

# Record the interpreter and TensorFlow versions next to the results.
python_version = sys.version.split()[0]
print("Python:", python_version)
print("TF:", tf.__version__)

In [None]:
# Load the Amazon reviews CSV and print its shape, first rows and column names.
file_path = '../Experiment/Datasets/Generated Fake Amazon Reviews Dataset.csv'
df = pd.read_csv(file_path)

print("Amazon dataset shape:", df.shape)
print(df.head())
print(df.columns.tolist())

# 80/20 split stratified on 'label'; the fixed seed makes the split repeatable.
df_amazon_phase1_data, df_amazon_test_data = train_test_split(
    df,
    stratify=df['label'],
    test_size=0.2,
    random_state=GLOBAL_SEED,
)

# Training portion: review text ('text_') and raw label ('label').
X_train_amazon_text, y_train_amazon_raw = (
    df_amazon_phase1_data['text_'],
    df_amazon_phase1_data['label'],
)

# Held-out portion: the same two columns.
X_amazon_test_raw, y_amazon_test_raw = (
    df_amazon_test_data['text_'],
    df_amazon_test_data['label'],
)

# Plain Python list of the training texts (this is what gets tokenized later).
texts_train_amazon = X_train_amazon_text.tolist()

In [None]:
# Load the Yelp reviews CSV and print its shape, first rows and column names.
yelp_file_path = '../Experiment/Datasets/Mixed Yelp Dataset.csv'
df_yelp = pd.read_csv(yelp_file_path)

print("Yelp dataset shape:", df_yelp.shape)
print(df_yelp.head())
print(df_yelp.columns.tolist())

# 80/20 split stratified on 'LABEL'; the fixed seed makes the split repeatable.
df_yelp_phase2_data, df_yelp_test_data = train_test_split(
    df_yelp,
    stratify=df_yelp['LABEL'],
    test_size=0.2,
    random_state=GLOBAL_SEED,
)

# LABEL values -1 / 1 are renamed to the 'CG' / 'OR' names that label_map understands.
_yelp_label_names = {-1: 'CG', 1: 'OR'}

# Training portion.
X_train_yelp_text = df_yelp_phase2_data['REVIEW_TEXT']
y_train_yelp_raw = df_yelp_phase2_data['LABEL'].replace(_yelp_label_names)

# Held-out portion.
X_yelp_test_raw = df_yelp_test_data['REVIEW_TEXT']
y_yelp_test_raw = df_yelp_test_data['LABEL'].replace(_yelp_label_names)

# Plain Python list of the training texts (this is what gets tokenized later).
texts_train_yelp = X_train_yelp_text.tolist()

In [None]:
# Pretrained checkpoint used for both the tokenizer and every model in this notebook.
MODEL_NAME = 'FacebookAI/roberta-base'

tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
num_labels = 2

# Class name -> integer id used as the training target.
label_map = {'CG': 0, 'OR': 1}


def _encode_labels(raw_labels, mapping, split_name):
    """Convert a Series of class names to an array of integer ids.

    Args:
        raw_labels: pandas Series holding the class names of one data split.
        mapping: dict from class name to integer id.
        split_name: human-readable name of the split, used only in the error message.

    Returns:
        The array produced by ``raw_labels.map(mapping).values``.

    Raises:
        ValueError: if any entry of ``raw_labels`` is missing from ``mapping``.
            ``Series.map`` would otherwise turn those entries into NaN and the
            resulting float labels would silently flow into training/evaluation.
    """
    encoded = raw_labels.map(mapping)
    unmapped = raw_labels[encoded.isna()]
    if not unmapped.empty:
        examples = sorted({repr(value) for value in unmapped.unique()[:5]})
        raise ValueError(
            f"{split_name}: {len(unmapped)} label(s) are not in {sorted(mapping)}; "
            f"examples: {examples}"
        )
    return encoded.values


y_train_amazon_int = _encode_labels(y_train_amazon_raw, label_map, "Amazon train")
y_amazon_test_int = _encode_labels(y_amazon_test_raw, label_map, "Amazon test")

y_train_yelp_int = _encode_labels(y_train_yelp_raw, label_map, "Yelp train")
y_yelp_test_int = _encode_labels(y_yelp_test_raw, label_map, "Yelp test")

# Upper bound on tokens per review; longer inputs are truncated by the tokenizer.
max_length = 512

# Tokenizer options shared by all three training sets.
_tokenizer_kwargs = dict(truncation=True, padding=True, max_length=max_length, return_tensors="tf")

train_encodings_amazon = tokenizer(texts_train_amazon, **_tokenizer_kwargs)
train_encodings_yelp = tokenizer(texts_train_yelp, **_tokenizer_kwargs)

# Mixed training set: Amazon examples followed by Yelp examples, labels in the same order.
texts_train_mixed = [*texts_train_amazon, *texts_train_yelp]
y_train_mixed_int = np.concatenate((y_train_amazon_int, y_train_yelp_int))
train_encodings_mixed = tokenizer(texts_train_mixed, **_tokenizer_kwargs)


# Training hyperparameters for each experiment: learning rate, epochs, batch size.

# Model trained on Amazon + Yelp together.
MIXED_LR, MIXED_EPOCHS, MIXED_BATCH_SIZE = 5e-5, 1, 16

# Model trained on Amazon only.
AMAZON_LR, AMAZON_EPOCHS, AMAZON_BATCH_SIZE = 5e-5, 1, 16

# Model trained on Yelp only.
YELP_LR, YELP_EPOCHS, YELP_BATCH_SIZE = 1e-5, 1, 16

In [None]:
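# Hyperparameter grid search (disabled: the whole cell is commented out and kept for reference).
# NOTE(review): it refers to train_encodings, val_encodings, y_train_int, y_val_int and texts_train,
# none of which are defined in the visible cells - define them before re-enabling.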

# learning_rates = [1e-5, 3e-5, 5e-5]
# epochs_options = [1, 2, 3]
# batch_size_options = [8, 16, 32] 

# all_run_results = [] 
# results_csv_path = './roberta_grid_search_results_simplified.csv' 


# if os.path.exists(results_csv_path):
#     print(f"Appending to existing results file: {results_csv_path}")

# for current_batch_size in batch_size_options:
#     for current_epochs in epochs_options:
#         for current_lr in learning_rates:
#             print(f"\n--- Training: BS={current_batch_size}, Epochs={current_epochs}, LR={current_lr} ---")
            
#             K.clear_session() 
#             gc.collect()

#             train_dataset = tf.data.Dataset.from_tensor_slices((
#                 dict(train_encodings), y_train_int
#             )).shuffle(buffer_size=len(texts_train), seed=GLOBAL_SEED).batch(current_batch_size).prefetch(tf.data.AUTOTUNE)
            
#             val_dataset = tf.data.Dataset.from_tensor_slices((
#                 dict(val_encodings), y_val_int
#             )).batch(current_batch_size).prefetch(tf.data.AUTOTUNE)

#             model = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
#             optimizer = tf.keras.optimizers.AdamW(learning_rate=current_lr)
#             loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#             metrics_list = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')] 
#             model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics_list)
            
#             history = model.fit(
#                 train_dataset,
#                 validation_data=val_dataset,
#                 epochs=current_epochs,
#                 verbose=1 
#             )
            
#             final_train_accuracy = history.history['accuracy'][-1] if 'accuracy' in history.history and history.history['accuracy'] else None
#             final_val_accuracy = history.history['val_accuracy'][-1] if 'val_accuracy' in history.history and history.history['val_accuracy'] else None
            
#             run_result = {
#                 'batch_size': current_batch_size, 
#                 'epochs': current_epochs, 
#                 'learning_rate': current_lr,
#                 'train_accuracy': final_train_accuracy,
#                 'val_accuracy': final_val_accuracy,
#             }
#             all_run_results.append(run_result)
            
#             df_current_run = pd.DataFrame([run_result])
#             if not os.path.exists(results_csv_path) or os.path.getsize(results_csv_path) == 0:
#                 df_current_run.to_csv(results_csv_path, index=False, header=True)
#             else:
#                 df_current_run.to_csv(results_csv_path, index=False, header=False, mode='a')
#             print(f"  Results appended to {results_csv_path}")

#             del model, optimizer, history 
#             del train_dataset, val_dataset

In [None]:
# Mixed (Amazon + Yelp) training pipeline: shuffle over the whole set, batch, prefetch.
train_dataset_mixed = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings_mixed), y_train_mixed_int))
    .shuffle(len(texts_train_mixed), seed=GLOBAL_SEED)
    .batch(MIXED_BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Reset Keras' global state and run the garbage collector before building a new model.
K.clear_session()
gc.collect()

# Fresh pretrained checkpoint with a 2-class classification head.
model_mixed = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
optimizer = tf.keras.optimizers.AdamW(learning_rate=MIXED_LR)
model_mixed.compile(
    optimizer=optimizer,
    # The loss is told it receives raw logits, so no softmax is applied beforehand.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

history_mixed = model_mixed.fit(train_dataset_mixed, epochs=MIXED_EPOCHS)

# Test: evaluate the mixed-data model on the Amazon and Yelp held-out sets joined together
# (Amazon examples first, then Yelp examples; texts and labels keep the same order).
X_combined_test_list = X_amazon_test_raw.tolist() + X_yelp_test_raw.tolist()
y_combined_test_int_list = y_amazon_test_int.tolist() + y_yelp_test_int.tolist()

y_combined_test_int = np.array(y_combined_test_int_list)
X_combined_test_raw = pd.Series(X_combined_test_list)

combined_test_encodings = tokenizer(
    X_combined_test_raw.tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors="tf",
)

combined_test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(combined_test_encodings), y_combined_test_int))
    .batch(MIXED_BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# The predicted class is the index of the largest logit in each row.
combined_test_predictions = model_mixed.predict(combined_test_dataset)
combined_predicted_logits = combined_test_predictions.logits
combined_y_pred_int = np.argmax(combined_predicted_logits, axis=1)

target_names_combined = ['CG', 'OR']

# Per-class scores: average=None returns one value per entry of labels=[0, 1].
_per_class_opts = dict(average=None, labels=[0, 1], zero_division=0)
accuracy_combined = accuracy_score(y_combined_test_int, combined_y_pred_int)
precision_combined = precision_score(y_combined_test_int, combined_y_pred_int, **_per_class_opts)
recall_combined = recall_score(y_combined_test_int, combined_y_pred_int, **_per_class_opts)
f1_combined = f1_score(y_combined_test_int, combined_y_pred_int, **_per_class_opts)

print(f"Tikslumas: {accuracy_combined:.4f}")
print(f"Preciziškumas (Netikras/CG): {precision_combined[0]:.4f}, Preciziškumas (Tikras/OR): {precision_combined[1]:.4f}")
print(f"Atkūrimas (Netikras/CG): {recall_combined[0]:.4f}, Atkūrimas (Tikras/OR): {recall_combined[1]:.4f}")
print(f"F1-Statistikos reikšmė (Netikras/CG): {f1_combined[0]:.4f}, F1-Statistikos reikšmė (Tikras/OR): {f1_combined[1]:.4f}")

# One row per class; the overall accuracy is repeated on every row.
print("\nSujungto duomenų rinkinio modelio rezultatų lentelė:")
result_table = pd.DataFrame({
    'Klasė': ['Netikras (CG)', 'Tikras (OR)'],
    'Preciziškumas': list(precision_combined[:2]),
    'Atkūrimas': list(recall_combined[:2]),
    'F1-Statistikos reikšmė': list(f1_combined[:2]),
}).assign(Tikslumas=accuracy_combined)
print(result_table.to_string(index=False))

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_combined = confusion_matrix(y_combined_test_int, combined_y_pred_int, labels=[0, 1])
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm_combined,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names_combined,
    yticklabels=target_names_combined,
)
plt.title('Painiavos Matrica - Modelis ant sujungto testinio duomenų rinkinio')
plt.xlabel('Prognozuojamos Etiketės')
plt.ylabel('Tikrosios Etiketės')
plt.show()

In [None]:
# Amazon-only training pipeline: shuffle over the whole set, batch, prefetch.
train_dataset_amazon = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings_amazon), y_train_amazon_int))
    .shuffle(len(texts_train_amazon), seed=GLOBAL_SEED)
    .batch(AMAZON_BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Reset Keras' global state and run the garbage collector before building a new model.
K.clear_session()
gc.collect()

# Fresh pretrained checkpoint with a 2-class classification head.
model_amazon = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
optimizer = tf.keras.optimizers.AdamW(learning_rate=AMAZON_LR)
model_amazon.compile(
    optimizer=optimizer,
    # The loss is told it receives raw logits, so no softmax is applied beforehand.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

history_amazon = model_amazon.fit(train_dataset_amazon, epochs=AMAZON_EPOCHS)

In [None]:
# Evaluate the Amazon-trained model on the Amazon held-out set.
amazon_test_encodings = tokenizer(X_amazon_test_raw.tolist(), truncation=True, padding=True,
                                 max_length=max_length, return_tensors="tf")

amazon_test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(amazon_test_encodings),
    y_amazon_test_int
)).batch(AMAZON_BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# The predicted class is the index of the largest logit in each row.
amazon_test_predictions = model_amazon.predict(amazon_test_dataset)
amazon_predicted_logits = amazon_test_predictions.logits
amazon_y_pred_int = np.argmax(amazon_predicted_logits, axis=1)

# Per-class scores: average=None returns one value per entry of labels=[0, 1].
accuracy_amazon = accuracy_score(y_amazon_test_int, amazon_y_pred_int)
precision_amazon = precision_score(y_amazon_test_int, amazon_y_pred_int, average=None, labels=[0, 1], zero_division=0)
recall_amazon = recall_score(y_amazon_test_int, amazon_y_pred_int, average=None, labels=[0, 1], zero_division=0)
f1_amazon = f1_score(y_amazon_test_int, amazon_y_pred_int, average=None, labels=[0, 1], zero_division=0)

print(f"Tikslumas: {accuracy_amazon:.4f}")
print(f"Preciziškumas (Netikras/CG): {precision_amazon[0]:.4f}, Preciziškumas (Tikras/OR): {precision_amazon[1]:.4f}")
print(f"Atkūrimas (Netikras/CG): {recall_amazon[0]:.4f}, Atkūrimas (Tikras/OR): {recall_amazon[1]:.4f}")
print(f"F1-Statistikos reikšmė (Netikras/CG): {f1_amazon[0]:.4f}, F1-Statistikos reikšmė (Tikras/OR): {f1_amazon[1]:.4f}")

# One row per class; the overall accuracy is repeated on every row.
print("\nAmazon Modelio Rezultatų Lentelė:")
result_table_amazon = pd.DataFrame({
    'Klasė': ['CG', 'OR'],
    'Preciziškumas': [precision_amazon[0], precision_amazon[1]],
    'Atkūrimas': [recall_amazon[0], recall_amazon[1]],
    'F1-Statistikos reikšmė': [f1_amazon[0], f1_amazon[1]],
})
result_table_amazon['Tikslumas'] = accuracy_amazon
print(result_table_amazon.to_string(index=False))

# Tick labels are defined here rather than borrowed from the mixed-model cell, so this
# experiment can be run without executing that cell first. Order follows ids 0 and 1.
target_names_amazon = ['CG', 'OR']

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_amazon = confusion_matrix(y_amazon_test_int, amazon_y_pred_int, labels=[0, 1])
plt.figure(figsize=(6, 5))
sns.heatmap(cm_amazon, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names_amazon, yticklabels=target_names_amazon)
plt.title('Painiavos Matrica - Amazon Modelis ant amazon testinio duomenų rinkinio')
plt.xlabel('Prognozuojamos Etiketės')
plt.ylabel('Tikrosios Etiketės')
plt.show()

In [None]:
# Yelp-only training pipeline: shuffle over the whole set, batch, prefetch.
train_dataset_yelp = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings_yelp), y_train_yelp_int))
    .shuffle(len(texts_train_yelp), seed=GLOBAL_SEED)
    .batch(YELP_BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Reset Keras' global state and run the garbage collector before building a new model.
K.clear_session()
gc.collect()

# Fresh pretrained checkpoint with a 2-class classification head.
model_yelp = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
optimizer = tf.keras.optimizers.AdamW(learning_rate=YELP_LR)
model_yelp.compile(
    optimizer=optimizer,
    # The loss is told it receives raw logits, so no softmax is applied beforehand.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

history_yelp = model_yelp.fit(train_dataset_yelp, epochs=YELP_EPOCHS)

# Evaluate the Yelp-trained model on the Yelp held-out set.
yelp_test_encodings = tokenizer(X_yelp_test_raw.tolist(), truncation=True, padding=True,
                               max_length=max_length, return_tensors="tf")

yelp_test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(yelp_test_encodings),
    y_yelp_test_int
)).batch(YELP_BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# The predicted class is the index of the largest logit in each row.
yelp_test_predictions = model_yelp.predict(yelp_test_dataset)
yelp_predicted_logits = yelp_test_predictions.logits
yelp_y_pred_int = np.argmax(yelp_predicted_logits, axis=1)

# Per-class scores: average=None returns one value per entry of labels=[0, 1].
accuracy_yelp = accuracy_score(y_yelp_test_int, yelp_y_pred_int)
precision_yelp = precision_score(y_yelp_test_int, yelp_y_pred_int, average=None, labels=[0, 1], zero_division=0)
recall_yelp = recall_score(y_yelp_test_int, yelp_y_pred_int, average=None, labels=[0, 1], zero_division=0)
f1_yelp = f1_score(y_yelp_test_int, yelp_y_pred_int, average=None, labels=[0, 1], zero_division=0)

print(f"Tikslumas: {accuracy_yelp:.4f}")
print(f"Preciziškumas (Netikras/CG): {precision_yelp[0]:.4f}, Preciziškumas (Tikras/OR): {precision_yelp[1]:.4f}")
print(f"Atkūrimas (Netikras/CG): {recall_yelp[0]:.4f}, Atkūrimas (Tikras/OR): {recall_yelp[1]:.4f}")
print(f"F1-Statistikos reikšmė (Netikras/CG): {f1_yelp[0]:.4f}, F1-Statistikos reikšmė (Tikras/OR): {f1_yelp[1]:.4f}")

# Caption added so this table is introduced the same way as the Amazon and mixed tables.
print("\nYelp Modelio Rezultatų Lentelė:")
# One row per class (named after the original Yelp label values); accuracy repeated per row.
result_table_yelp = pd.DataFrame({
    'Klasė': ['-1 (Netikras)', '1 (Tikras)'],
    'Preciziškumas': [precision_yelp[0], precision_yelp[1]],
    'Atkūrimas': [recall_yelp[0], recall_yelp[1]],
    'F1-Statistikos reikšmė': [f1_yelp[0], f1_yelp[1]],
})
result_table_yelp['Tikslumas'] = accuracy_yelp
print(result_table_yelp.to_string(index=False))

# Tick labels are defined here rather than borrowed from the mixed-model cell, so this
# experiment can be run without executing that cell first. Order follows ids 0 and 1.
target_names_yelp = ['CG', 'OR']

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_yelp = confusion_matrix(y_yelp_test_int, yelp_y_pred_int, labels=[0, 1])
plt.figure(figsize=(6, 5))
sns.heatmap(cm_yelp, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names_yelp, yticklabels=target_names_yelp)
plt.title('Painiavos Matrica - Yelp Modelis ant yelp testinio duomenų rinkinio')
plt.xlabel('Prognozuojamos Etiketės')
plt.ylabel('Tikrosios Etiketės')
plt.show()

In [None]:
# Load the Google Play reviews, which are used only to test the already trained models.
new_dataset_path = '../Experiment/Datasets/Google Play App Store For Testing.csv'
df_new_test = pd.read_csv(new_dataset_path)

print(df_new_test.shape)
print(df_new_test.head())

text_column_new = 'review'
label_column_new = 'label'

X_new_test_text = df_new_test[text_column_new]
y_new_test_original = df_new_test[label_column_new]

# 'fake' / 'genuine' get the same integer ids (0 / 1) that label_map gives 'CG' / 'OR'.
new_label_map = {'fake': 0, 'genuine': 1}
_y_new_test_mapped = y_new_test_original.map(new_label_map)

# Series.map turns every label missing from new_label_map into NaN; stop here with a
# clear message instead of carrying NaN labels into prediction and metric computation.
_unmapped_new_labels = y_new_test_original[_y_new_test_mapped.isna()]
if not _unmapped_new_labels.empty:
    _examples = sorted({repr(value) for value in _unmapped_new_labels.unique()[:5]})
    raise ValueError(
        f"{new_dataset_path}: {len(_unmapped_new_labels)} label(s) in column "
        f"{label_column_new!r} are not in {sorted(new_label_map)}; examples: {_examples}"
    )

y_new_test_int = _y_new_test_mapped.values

# Tick labels for the confusion matrices, ordered by integer id.
target_names_new_test = ['fake', 'genuine']

In [None]:
# Cross-domain check: the Amazon-trained model scored on the Google Play reviews.
gplay_test_encodings = tokenizer(
    X_new_test_text.tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors="tf",
)

gplay_test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(gplay_test_encodings), y_new_test_int))
    .batch(AMAZON_BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# The predicted class is the index of the largest logit in each row.
amazon_gplay_predictions = model_amazon.predict(gplay_test_dataset)
amazon_gplay_logits = amazon_gplay_predictions.logits
amazon_gplay_pred_int = np.argmax(amazon_gplay_logits, axis=1)

# Per-class scores: average=None returns one value per entry of labels=[0, 1].
_per_class_opts = dict(average=None, labels=[0, 1], zero_division=0)
accuracy_amazon_gplay = accuracy_score(y_new_test_int, amazon_gplay_pred_int)
precision_amazon_gplay = precision_score(y_new_test_int, amazon_gplay_pred_int, **_per_class_opts)
recall_amazon_gplay = recall_score(y_new_test_int, amazon_gplay_pred_int, **_per_class_opts)
f1_amazon_gplay = f1_score(y_new_test_int, amazon_gplay_pred_int, **_per_class_opts)

print(f"Tikslumas: {accuracy_amazon_gplay:.4f}")
print(f"Preciziškumas (Netikras): {precision_amazon_gplay[0]:.4f}, Preciziškumas (Tikras): {precision_amazon_gplay[1]:.4f}")
print(f"Atkūrimas (Netikras): {recall_amazon_gplay[0]:.4f}, Atkūrimas (Tikras): {recall_amazon_gplay[1]:.4f}")
print(f"F1-Statistikos reikšmė (Netikras): {f1_amazon_gplay[0]:.4f}, F1-Statistikos reikšmė (Tikras): {f1_amazon_gplay[1]:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_amazon_gplay = confusion_matrix(y_new_test_int, amazon_gplay_pred_int, labels=[0, 1])
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm_amazon_gplay,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names_new_test,
    yticklabels=target_names_new_test,
)
plt.title('Painiavos Matrica - Amazon duomenų apmokytas modelis ant Google Play duomenų rinkinio')
plt.xlabel('Prognozuojamos Etiketės')
plt.ylabel('Tikrosios Etiketės')
plt.show()

In [None]:
# Cross-domain check: the Yelp-trained model scored on the Google Play reviews.
# The encodings computed for the Amazon-model check are reused; only the batch size constant differs.
gplay_yelp_test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(gplay_test_encodings), y_new_test_int))
    .batch(YELP_BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# The predicted class is the index of the largest logit in each row.
yelp_gplay_predictions = model_yelp.predict(gplay_yelp_test_dataset)
yelp_gplay_logits = yelp_gplay_predictions.logits
yelp_gplay_pred_int = np.argmax(yelp_gplay_logits, axis=1)

# Per-class scores: average=None returns one value per entry of labels=[0, 1].
_per_class_opts = dict(average=None, labels=[0, 1], zero_division=0)
accuracy_yelp_gplay = accuracy_score(y_new_test_int, yelp_gplay_pred_int)
precision_yelp_gplay = precision_score(y_new_test_int, yelp_gplay_pred_int, **_per_class_opts)
recall_yelp_gplay = recall_score(y_new_test_int, yelp_gplay_pred_int, **_per_class_opts)
f1_yelp_gplay = f1_score(y_new_test_int, yelp_gplay_pred_int, **_per_class_opts)

print(f"Tikslumas: {accuracy_yelp_gplay:.4f}")
print(f"Preciziškumas (Netikras): {precision_yelp_gplay[0]:.4f}, Preciziškumas (Tikras): {precision_yelp_gplay[1]:.4f}")
print(f"Atkūrimas (Netikras): {recall_yelp_gplay[0]:.4f}, Atkūrimas (Tikras): {recall_yelp_gplay[1]:.4f}")
print(f"F1-Statistikos reikšmė (Netikras): {f1_yelp_gplay[0]:.4f}, F1-Statistikos reikšmė (Tikras): {f1_yelp_gplay[1]:.4f}")