In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
df = pd.read_csv("bert_vader.csv")  
features = df[['rating', 'Positive Score', 'Negative Score', 'Compound Score', 
               'Neutral Score', 'Review Length', 'Noun Count', 'Verb Count', 
               'Adjective Count', 'Joy Score', 'Sadness Score', 'BERT Component 1', 
               'BERT Component 2', 'BERT Component 3']].values
labels = df['label'].values
scaler = StandardScaler()
features = scaler.fit_transform(features)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
x_train_tensor = torch.tensor(X_train, dtype=torch.float)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
data = Data(x=x_train_tensor, y=y_train_tensor)


class GraphSAGEModel(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGEModel, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x

model = GraphSAGEModel(in_channels=x_train_tensor.shape[1], hidden_channels=16, out_channels=2)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()


edge_index = torch.arange(data.num_nodes, dtype=torch.long).repeat(2, 1)

model.train()
for epoch in range(100):
    optimizer.zero_grad()
    out = model(data.x, edge_index)
    loss = criterion(out, data.y)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

model.eval()
with torch.no_grad():
    embeddings = model(data.x, edge_index).numpy()

np.save("graphsage_encoded_features.npy", embeddings)
print("GraphSAGE embeddings saved as graphsage_encoded_features.npy")

Epoch 10, Loss: 0.5942
Epoch 20, Loss: 0.5315
Epoch 30, Loss: 0.5176
Epoch 40, Loss: 0.5115
Epoch 50, Loss: 0.5064
Epoch 60, Loss: 0.5042
Epoch 70, Loss: 0.5025
Epoch 80, Loss: 0.5010
Epoch 90, Loss: 0.4996
Epoch 100, Loss: 0.4982
GraphSAGE embeddings saved as graphsage_encoded_features.npy


In [None]:
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']

In [18]:
print("Shape of graphsage_encoded_features.npy (GraphSAGE embeddings):", graphsage_embeddings.shape)

Shape of graphsage_encoded_features.npy (GraphSAGE embeddings): (32000, 2)


In [None]:
import joblib 

model_filename = 'lightgbm_model.joblib'
joblib.dump(lgb_classifier, model_filename)
print(f"Model saved to {model_filename}")

Model saved to lightgbm_model.joblib


In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import classification_report
import random
import sys
import joblib
new_data = pd.read_csv('bert_vader.csv') 
true_labels = new_data['label'].values  
new_gae_embeddings = np.random.rand(len(new_data), 2) 
new_graphsage_embeddings = np.random.rand(len(new_data), 2)  
new_combined_embeddings = np.concatenate((new_gae_embeddings, new_graphsage_embeddings), axis=1)
model_filename = 'lightgbm_model.joblib'
loaded_model = joblib.load(model_filename)
new_predictions = loaded_model.predict(new_combined_embeddings)

# Step 7: Before WOA
print("New Classification Report:\n", classification_report(true_labels, new_predictions))

def fitness_lightgbm(params, X, y):
    num_leaves, max_depth, learning_rate = params
    lgb_classifier = lgb.LGBMClassifier(
        num_leaves=int(num_leaves), 
        max_depth=int(max_depth), 
        learning_rate=learning_rate, 
        random_state=42,
        n_estimators=100
    )
    lgb_classifier.fit(X, y)
    y_pred = lgb_classifier.predict(X)
    return -classification_report(y, y_pred, output_dict=True)['0']['f1-score']  

class Whale:
    def __init__(self, dim, minx, maxx, seed):
        self.rnd = random.Random(seed)
        self.position = [minx[i] + (maxx[i] - minx[i]) * self.rnd.random() for i in range(dim)]
        self.fitness = sys.float_info.max 

def woa(fitness, max_iter, n, dim, minx, maxx, X, y):
    whale_population = [Whale(dim, minx, maxx, i) for i in range(n)]
    Xbest = [0.0 for _ in range(dim)]
    Fbest = sys.float_info.max

    for i in range(n):
        whale_population[i].fitness = fitness(whale_population[i].position, X, y)
        if whale_population[i].fitness < Fbest:
            Fbest = whale_population[i].fitness
            Xbest = whale_population[i].position.copy()

    Iter = 0
    stagnation_count = 0
    early_stopping_threshold = 5  

    while Iter < max_iter:
        for i in range(n):
            A = 2 * random.random() - 1
            C = 2 * random.random()
            D = [C * Xbest[j] - whale_population[i].position[j] for j in range(dim)]
            whale_population[i].position = [whale_population[i].position[j] + A * D[j] for j in range(dim)]
            whale_population[i].position = [max(min(whale_population[i].position[j], maxx[j]), minx[j]) for j in range(dim)]
            whale_population[i].fitness = fitness(whale_population[i].position, X, y)

            if whale_population[i].fitness < Fbest:
                Xbest = whale_population[i].position.copy()
                Fbest = whale_population[i].fitness
                stagnation_count = 0 
            else:
                stagnation_count += 1

        if stagnation_count >= early_stopping_threshold:
            print(f"Early stopping at iteration {Iter} with best fitness {Fbest}")
            break

        Iter += 1

    return Xbest


num_whales = 10  
max_iter = 5 
dim = 3 
minx = [2, 3, 0.01]
maxx = [256, 20, 0.5]


best_params = woa(fitness_lightgbm, max_iter, num_whales, dim, minx, maxx, new_combined_embeddings, true_labels)


print("Best parameters found:")
print(f"Num leaves: {int(best_params[0])}, Max depth: {int(best_params[1])}, Learning rate: {best_params[2]}")


lgb_classifier = lgb.LGBMClassifier(num_leaves=int(best_params[0]), max_depth=int(best_params[1]), learning_rate=best_params[2], random_state=42)
lgb_classifier.fit(new_combined_embeddings, true_labels)

y_pred = lgb_classifier.predict(new_combined_embeddings)
print("Classification Report on New Data:\n", classification_report(true_labels, y_pred))

New Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.44      0.54     27880
           1       0.30      0.55      0.39     12120

    accuracy                           0.48     40000
   macro avg       0.50      0.50      0.47     40000
weighted avg       0.58      0.48      0.49     40000

[LightGBM] [Info] Number of positive: 12120, number of negative: 27880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.303000 -> initscore=-0.833053
[LightGBM] [Info] Start training from score -0.833053
[LightGBM] [Info] Number of positive: 12120, number of negative: 27880
[LightGBM] [Info]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[LightGBM] [Info] Number of positive: 12120, number of negative: 27880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.303000 -> initscore=-0.833053
[LightGBM] [Info] Start training from score -0.833053
[LightGBM] [Info] Number of positive: 12120, number of negative: 27880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 4
[LightGBM] [Info] [b

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[LightGBM] [Info] Number of positive: 12120, number of negative: 27880
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000460 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.303000 -> initscore=-0.833053
[LightGBM] [Info] Start training from score -0.833053
[LightGBM] [Info] Number of positive: 12120, number of negative: 27880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.303000 -> initscore=-0.833053
[Lig

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[LightGBM] [Info] Number of positive: 12120, number of negative: 27880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000135 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.303000 -> initscore=-0.833053
[LightGBM] [Info] Start training from score -0.833053
[LightGBM] [Info] Number of positive: 12120, number of negative: 27880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 4
[LightGBM] [Info] [b

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score
import joblib
new_data = pd.read_csv('bert_vader.csv')
features = new_data.drop(columns=['review', 'rating', 'label']) 
true_labels = new_data['label'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, true_labels, test_size=0.2, random_state=42)
lgb_classifier = lgb.LGBMClassifier(num_leaves=31, max_depth=-1, learning_rate=0.1, random_state=42)
lgb_classifier.fit(X_train, y_train)
y_pred = lgb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Genuine', 'Fake'], output_dict=True)
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1 = report['weighted avg']['f1-score']
num_test_samples = len(y_test)


print(f"LightGBM Model Evaluation:\n"
      f"Tested Samples: {num_test_samples}\n"
      f"Accuracy: {accuracy:.2f}\n"
      f"Precision: {precision:.2f}\n"
      f"Recall: {recall:.2f}\n"
      f"F1 Score: {f1:.2f}\n")


model_filename = 'lightgbm_model_without_woa.joblib'
joblib.dump(lgb_classifier, model_filename)
print(f"Trained model saved to {model_filename}.")

[LightGBM] [Info] Number of positive: 9669, number of negative: 22331
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000446 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2884
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302156 -> initscore=-0.837051
[LightGBM] [Info] Start training from score -0.837051
LightGBM Model Evaluation:
Tested Samples: 8000
Accuracy: 0.76
Precision: 0.75
Recall: 0.76
F1 Score: 0.75

Trained model saved to lightgbm_model_without_woa.joblib.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
data = pd.read_csv('bert_vader.csv')
X = data[['Positive Score', 'Negative Score', 'Neutral Score', 'Compound Score', 
          'Review Length', 'Noun Count', 'Verb Count', 'Adjective Count', 
          'Joy Score', 'Sadness Score', 'BERT Component 1', 'BERT Component 2', 'BERT Component 3']]
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=['Genuine', 'Fake'], output_dict=True)
  
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']
    f1 = report['weighted avg']['f1-score']
    num_test_samples = len(y_test)

    print(f"{model_name} - Tested Samples: {num_test_samples}, Accuracy: {accuracy:.2f}, "
          f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

Logistic Regression - Tested Samples: 12000, Accuracy: 0.72, Precision: 0.70, Recall: 0.72, F1 Score: 0.70
Random Forest - Tested Samples: 12000, Accuracy: 0.74, Precision: 0.73, Recall: 0.74, F1 Score: 0.73
XGBoost - Tested Samples: 12000, Accuracy: 0.75, Precision: 0.73, Recall: 0.75, F1 Score: 0.73
Neural Network - Tested Samples: 12000, Accuracy: 0.75, Precision: 0.74, Recall: 0.75, F1 Score: 0.74
