In [4]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data
from sklearn.metrics.pairwise import cosine_similarity
from torch_geometric.utils import dense_to_sparse

# Load dataset
# NOTE(review): the code below expects a 'review' column, a 'label' column and
# otherwise numeric feature columns — confirm against the CSV.
labeled_data = pd.read_csv('bert_vader.csv')

# Drop text columns or irrelevant fields
labeled_data = labeled_data.drop(columns=['review'])

# Features are every remaining column except the target; the target is 'label'.
X = labeled_data.drop(columns=['label']).values
y = labeled_data['label'].values

# Standardize data
# The scaler is fitted on every row of X (no train/test split happens in this cell).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert to tensors
X_tensor = torch.tensor(X_scaled, dtype=torch.float)
y_tensor = torch.tensor(y, dtype=torch.long)

# Construct similarity graph
# Pairwise cosine similarity between all rows, so the matrix is N x N (dense).
similarity_matrix = cosine_similarity(X_scaled)
threshold = 0.7  # Adjust threshold as required
# Two rows are connected when their similarity is strictly above the threshold.
# NOTE(review): a non-zero row has similarity 1 with itself, so the diagonal
# presumably yields self-loops in edge_index — confirm this is intended.
adjacency_matrix = (similarity_matrix > threshold).astype(int)
# dense_to_sparse returns a tuple; only its first element (the edge index) is kept.
edge_index = dense_to_sparse(torch.tensor(adjacency_matrix, dtype=torch.float))[0]

# Create PyTorch Geometric data object
data = Data(x=X_tensor, edge_index=edge_index, y=y_tensor)

# Persist the whole Data object (not a state dict) for the training cells.
torch.save(data, 'graphsage_processed_data.pt')
print("Graph data saved to 'graphsage_processed_data.pt'.")


Graph data saved to 'graphsage_processed_data.pt'.


In [None]:
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
import torch

# Load preprocessed data
# The file holds a pickled torch_geometric `Data` object (written by the
# preprocessing cell with torch.save), not a plain tensor/state dict, so it
# needs full unpickling. Passing weights_only=False states that explicitly
# instead of relying on torch.load's version-dependent default.
# SECURITY: full unpickling can execute arbitrary code; this is acceptable only
# because the file is produced by this notebook. Never point it at untrusted files.
data = torch.load('graphsage_processed_data.pt', weights_only=False)
print("Loaded graph data from 'graphsage_processed_data.pt'.")


class GraphSAGE(torch.nn.Module):
    """Two stacked SAGEConv layers with a ReLU between them.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of the intermediate representation.
        out_channels: Width of the returned node representation.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return ``conv2(relu(conv1(x)))`` evaluated on the given edges."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        return self.conv2(hidden, edge_index)


# Input width is read from the second dimension of the loaded node-feature tensor.
in_channels = data.x.size(1)
hidden_channels = 64
# Output width equals input width because training compares the output to data.x.
out_channels = in_channels  # For reconstruction

# Initialize model and optimizer
sage_model = GraphSAGE(in_channels, hidden_channels, out_channels)
# NOTE: `optimizer` is a notebook-level name; the GAE cells further down rebind
# it to the GAE model's parameters.
optimizer = torch.optim.Adam(sage_model.parameters(), lr=0.01, weight_decay=5e-4)

# Training loop
def train_sage(model, data, epochs=5, optimizer=None):
    """Train ``model`` to reconstruct ``data.x`` and save its weights.

    Each epoch runs one full-graph forward pass, computes the mean-squared
    error between the model output and ``data.x``, and applies one optimizer
    step. The per-epoch loss is printed.

    Args:
        model: Module called as ``model(data.x, data.edge_index)``; its output
            must have the same shape as ``data.x``.
        data: Object exposing ``x`` and ``edge_index``.
        epochs: Number of optimization steps.
        optimizer: Optimizer bound to ``model``'s parameters. When omitted, an
            Adam optimizer (lr=0.01, weight_decay=5e-4) is created for
            ``model``. Previously the function read a notebook-global
            ``optimizer``, which other cells rebind to a different model's
            parameters, so training could silently update nothing.

    Side effects:
        Writes ``model.state_dict()`` to 'graphsage_model.pt' in the working
        directory.
    """
    if optimizer is None:
        # Always tie the optimizer to the model actually being trained.
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        embeddings = model(data.x, data.edge_index)
        loss = F.mse_loss(embeddings, data.x)  # Reconstruction loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

    # Save trained model
    torch.save(model.state_dict(), 'graphsage_model.pt')
    print("Trained GraphSAGE model saved to 'graphsage_model.pt'.")

# Train the GraphSAGE model on the loaded graph for 5 epochs; the function also
# writes the trained weights to disk.
train_sage(sage_model, data, epochs=5)

  data = torch.load('graphsage_processed_data.pt')


Loaded graph data from 'graphsage_processed_data.pt'.


In [3]:
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
import torch


# The file holds a pickled torch_geometric `Data` object (written by the
# preprocessing cell with torch.save), not a plain tensor/state dict, so it
# needs full unpickling. Passing weights_only=False states that explicitly
# instead of relying on torch.load's version-dependent default.
# SECURITY: full unpickling can execute arbitrary code; this is acceptable only
# because the file is produced by this notebook. Never point it at untrusted files.
data = torch.load('graphsage_processed_data.pt', weights_only=False)
print("Loaded graph data from 'graphsage_processed_data.pt'.")

# Define the GraphSAGE model
class GraphSAGE(torch.nn.Module):
    """Two stacked SAGEConv layers with a ReLU between them.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of the intermediate representation.
        out_channels: Width of the returned node representation.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return ``conv2(relu(conv1(x)))`` evaluated on the given edges."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        return self.conv2(hidden, edge_index)


# Model parameters
# Input width is read from the second dimension of the loaded node-feature tensor.
in_channels = data.x.size(1)
hidden_channels = 8  # Reduced size
# Output width equals input width because training compares the output to data.x.
out_channels = in_channels  # For reconstruction

# Initialize model and optimizer
# Use the GPU when one is available; both the graph and the model are moved to
# the same device, so tensors produced by this model may live on CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)
sage_model = GraphSAGE(in_channels, hidden_channels, out_channels).to(device)
# NOTE: `optimizer` is a notebook-level name; the GAE cells further down rebind
# it to the GAE model's parameters.
optimizer = torch.optim.Adam(sage_model.parameters(), lr=0.01, weight_decay=5e-4)

# Training loop
def train_sage(model, data, epochs=3, optimizer=None):  # Reduced epochs
    """Train ``model`` to reconstruct ``data.x`` and save its weights.

    Each epoch runs one full-graph forward pass, computes the mean-squared
    error between the model output and ``data.x``, and applies one optimizer
    step. The per-epoch loss is printed.

    Args:
        model: Module called as ``model(data.x, data.edge_index)``; its output
            must have the same shape as ``data.x``.
        data: Object exposing ``x`` and ``edge_index`` (on the model's device).
        epochs: Number of optimization steps.
        optimizer: Optimizer bound to ``model``'s parameters. When omitted, an
            Adam optimizer (lr=0.01, weight_decay=5e-4) is created for
            ``model``. Previously the function read a notebook-global
            ``optimizer``, which other cells rebind to a different model's
            parameters, so training could silently update nothing.

    Side effects:
        Writes ``model.state_dict()`` to 'graphsage_model.pt' in the working
        directory.
    """
    if optimizer is None:
        # Always tie the optimizer to the model actually being trained.
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        embeddings = model(data.x, data.edge_index)
        loss = F.mse_loss(embeddings, data.x)  # Reconstruction loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

    # Save trained model
    torch.save(model.state_dict(), 'graphsage_model.pt')
    print("Trained GraphSAGE model saved to 'graphsage_model.pt'.")

# Train the GraphSAGE model on the loaded graph for 3 epochs; the function also
# writes the trained weights to disk.
train_sage(sage_model, data, epochs=3)


  data = torch.load('graphsage_processed_data.pt')


Loaded graph data from 'graphsage_processed_data.pt'.
Epoch 1, Loss: 1.381186842918396
Epoch 2, Loss: 1.2261302471160889
Epoch 3, Loss: 1.115859031677246
Trained GraphSAGE model saved to 'graphsage_model.pt'.


In [53]:
def extract_embeddings(model: torch.nn.Module, data: "Data") -> np.ndarray:
    """Run ``model`` on the graph and return its node embeddings as a NumPy array.

    Args:
        model: Module called as ``model(data.x, data.edge_index)``. It may
            return a single tensor, or a ``(decoded, encoded)`` pair, in which
            case the second element is used.
        data: Object exposing ``x`` and ``edge_index``.

    Returns:
        The embeddings copied to host memory as ``np.ndarray``.

    Notes:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = model(data.x, data.edge_index)

    if isinstance(outputs, tuple):
        # Autoencoder-style models return (decoded, encoded); keep the encoding.
        _, encoded_features = outputs
    else:
        encoded_features = outputs

    # .cpu() is required before .numpy() when the model/data live on a GPU.
    return encoded_features.detach().cpu().numpy()

In [55]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score

# Split the selected features into train and test sets
# NOTE(review): `X_selected` is not assigned in any cell visible here, so this
# cell relies on leftover kernel state — confirm where it is produced. `y` is
# the label array from the GraphSAGE preprocessing cell.
# 80/20 split with a fixed seed; no stratification is requested.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Train LightGBM classifier
# Default hyperparameters apart from the fixed random_state.
lgb_classifier = LGBMClassifier(random_state=42)
lgb_classifier.fit(X_train, y_train)

# Evaluate the classifier
# Predictions are made on the held-out 20% only.
y_pred = lgb_classifier.predict(X_test)

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


[LightGBM] [Info] Number of positive: 9669, number of negative: 22331
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003322 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302156 -> initscore=-0.837051
[LightGBM] [Info] Start training from score -0.837051
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.90      0.82      5549
           1       0.58      0.31      0.40      2451

    accuracy                           0.72      8000
   macro avg       0.67      0.61      0.61      8000
weighted avg       0.70      0.72      0.69      8000

Accuracy: 0.72


In [29]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import joblib

# Assuming 'X_selected' and 'y' are already generated from GraphSAGE embeddings
# NOTE(review): `X_selected` is not assigned in any cell visible here, so this
# cell relies on leftover kernel state — confirm where it is produced.
# Split the selected features into train and test sets
# 80/20 split with a fixed seed; no stratification is requested.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Train LightGBM classifier
# Default hyperparameters apart from the fixed random_state.
lgb_classifier = LGBMClassifier(random_state=42)
lgb_classifier.fit(X_train, y_train)

# Evaluate the classifier
# Both outputs are computed on the held-out 20% only.
y_pred = lgb_classifier.predict(X_test)
y_prob = lgb_classifier.predict_proba(X_test)  # Optional: Prediction probabilities

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Save predictions and model
# The saved arrays cover the test split only, in the row order of X_test.
np.save("graphsage_predictions.npy", y_pred)  # Save predictions
np.save("graphsage_probabilities.npy", y_prob)  # Save probabilities
print("GraphSAGE predictions saved.")

joblib.dump(lgb_classifier, "graphsage_lightgbm.pkl")  # Save model
print("GraphSAGE LightGBM model saved.")


[LightGBM] [Info] Number of positive: 9669, number of negative: 22331
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000237 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302156 -> initscore=-0.837051
[LightGBM] [Info] Start training from score -0.837051
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.90      0.82      5549
           1       0.58      0.31      0.40      2451

    accuracy                           0.72      8000
   macro avg       0.67      0.61      0.61      8000
weighted avg       0.70      0.72      0.69      8000

Accuracy: 0.72
GraphSAGE predictions saved.
GraphSAGE LightGBM model saved.


In [21]:
# GAE DIFFERENT MODEL

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data
from sklearn.metrics.pairwise import cosine_similarity
from torch_geometric.utils import dense_to_sparse
import numpy as np
# Load the labeled and unlabeled feature tables.
# NOTE: this cell rebinds notebook-level names also used by the GraphSAGE
# preprocessing cell (labeled_data, scaler, similarity_matrix, threshold,
# adjacency_matrix, edge_index, data).
labeled_data = pd.read_csv('3k_labeled_extracted.csv')
unlabeled_data = pd.read_csv('6k_unlabeled_extracted.csv')

# Remove the raw text column from both tables.
labeled_data = labeled_data.drop(columns=['text_'])
unlabeled_data = unlabeled_data.drop(columns=['text_'])

# Labeled features are all columns except 'label'; the unlabeled table is used as-is.
# NOTE(review): assumes both tables end up with the same feature columns in the
# same order — confirm against the CSVs.
X_labeled = labeled_data.drop(columns=['label']).values
y_labeled = labeled_data['label'].values
X_unlabeled = unlabeled_data.values


# Fit the scaler on labeled rows only, then apply the same transform to unlabeled rows.
scaler = StandardScaler()
X_labeled_scaled = scaler.fit_transform(X_labeled)
X_unlabeled_scaled = scaler.transform(X_unlabeled)

X_labeled_tensor = torch.tensor(X_labeled_scaled, dtype=torch.float)
y_labeled_tensor = torch.tensor(y_labeled, dtype=torch.long)
X_unlabeled_tensor = torch.tensor(X_unlabeled_scaled, dtype=torch.float)

# Pairwise cosine similarity over labeled rows stacked on top of unlabeled rows.
similarity_matrix = cosine_similarity(np.vstack([X_labeled_scaled, X_unlabeled_scaled]))

threshold = 0.7  # Adjust threshold for graph sparsity
# Two rows are connected when their similarity is strictly above the threshold.
adjacency_matrix = (similarity_matrix > threshold).astype(int)


# dense_to_sparse returns a tuple; only its first element (the edge index) is kept.
edge_index = dense_to_sparse(torch.tensor(adjacency_matrix, dtype=torch.float))[0]


# Node order matches the similarity matrix: labeled rows first, then unlabeled rows.
features = torch.cat([X_labeled_tensor, X_unlabeled_tensor], dim=0)
# The Data object carries no labels; they are kept separately in y_labeled_tensor.
data = Data(x=features, edge_index=edge_index)


# Persist plain tensors (features, edges, labeled targets) in a dict.
torch.save({
    'features': features,
    'edge_index': edge_index,
    'labels': y_labeled_tensor
}, 'processed_graph_data.pt')
print("Processed data saved to 'processed_graph_data.pt'.")



import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GraphAutoencoder(torch.nn.Module):
    """One GCNConv encoder layer followed by one GCNConv decoder layer.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of the encoded representation.
        out_channels: Width of the decoded output.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.encoder = GCNConv(in_channels, hidden_channels)
        self.decoder = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return ``(decoded, encoded)`` for the given node features and edges."""
        # Encoder: ReLU-activated representation of the nodes.
        latent = self.encoder(x, edge_index)
        latent = F.relu(latent)
        # Decoder: map the representation back to the output width.
        reconstruction = self.decoder(latent, edge_index)
        return reconstruction, latent

# Model parameters
# Input width is the second dimension of the stacked feature tensor.
in_channels = features.size(1)
hidden_channels = 64
# Output width equals input width (the decoder output is compared to the input features).
out_channels = in_channels

# Initialize the GAE model
gae_model = GraphAutoencoder(in_channels, hidden_channels, out_channels)
# NOTE: rebinds the notebook-level `optimizer` name also used by the GraphSAGE cells.
optimizer = torch.optim.Adam(gae_model.parameters(), lr=0.01, weight_decay=5e-4)

# Saving the model state
# No training step has run on gae_model in this cell, so this checkpoint holds
# the freshly initialised weights.
torch.save(gae_model.state_dict(), 'graph_autoencoder_model.pt')
print("Model state saved to 'graph_autoencoder_model.pt'.")

Processed data saved to 'processed_graph_data.pt'.
Model state saved to 'graph_autoencoder_model.pt'.


In [23]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GraphAutoencoder(torch.nn.Module):
    """One GCNConv encoder layer followed by one GCNConv decoder layer.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of the encoded representation.
        out_channels: Width of the decoded output.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.encoder = GCNConv(in_channels, hidden_channels)
        self.decoder = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return ``(decoded, encoded)`` for the given node features and edges."""
        latent = self.encoder(x, edge_index)
        latent = F.relu(latent)
        reconstruction = self.decoder(latent, edge_index)
        return reconstruction, latent

# Initialize the model parameters
# `features` is not created in this cell; it must already exist in the kernel
# (it is assigned in the GAE preprocessing cell).
in_channels = features.size(1)
hidden_channels = 64
# Output width equals input width (the decoder output is compared to the input features).
out_channels = in_channels

# Initialize the GAE model and optimizer
# Re-creating the model here discards any weights held by a previous `gae_model`.
gae_model = GraphAutoencoder(in_channels, hidden_channels, out_channels)
# NOTE: rebinds the notebook-level `optimizer` name also used by the GraphSAGE cells.
optimizer = torch.optim.Adam(gae_model.parameters(), lr=0.01, weight_decay=5e-4)

def train_gae(data, epochs=5, model=None, optimizer=None):
    """Train a graph autoencoder to reconstruct ``data.x`` and save its weights.

    Each epoch runs one full-graph forward pass, computes the mean-squared
    error between the decoded output and ``data.x``, and applies one optimizer
    step.

    Args:
        data: Object exposing ``x`` and ``edge_index``.
        epochs: Number of optimization steps.
        model: Module returning ``(decoded, encoded)``. Defaults to the
            notebook-level ``gae_model``.
        optimizer: Optimizer bound to the model's parameters. When omitted, an
            Adam optimizer (lr=0.01, weight_decay=5e-4) is created for the
            model being trained, instead of reading a notebook-global
            ``optimizer`` that other cells rebind to a different model.

    Returns:
        ``(train_losses, val_losses)``: two lists with one float per epoch.
        There is no held-out split, so ``val_losses`` is a placeholder that
        equals ``train_losses`` (the original code recomputed the same loss on
        the same tensors).

    Side effects:
        Writes the model's ``state_dict()`` to
        'graph_autoencoder_model_trained.pt' in the working directory.
    """
    # Resolve the notebook-level model only when the caller did not pass one.
    net = gae_model if model is None else model
    if optimizer is None:
        optimizer = torch.optim.Adam(net.parameters(), lr=0.01, weight_decay=5e-4)

    net.train()

    train_losses = []
    val_losses = []  # Placeholder until a real validation split exists

    for epoch in range(epochs):
        optimizer.zero_grad()
        decoded, _encoded = net(data.x, data.edge_index)

        loss = F.mse_loss(decoded, data.x)
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        train_losses.append(loss_value)
        # Same data, same forward pass: the "validation" value is the train loss.
        val_losses.append(loss_value)

        print(f'Epoch {epoch}, Loss: {loss_value}, Validation Loss: {loss_value}')

    torch.save(net.state_dict(), 'graph_autoencoder_model_trained.pt')
    print("Trained model state saved to 'graph_autoencoder_model_trained.pt'.")

    return train_losses, val_losses


# Train for 5 epochs and keep the per-epoch loss lists.
# `data` is not created in this cell; it is whatever graph object the kernel
# currently holds under that name (expected: the GAE preprocessing graph).
train_losses, val_losses = train_gae(data, epochs=5)  # Adjust epochs as needed

Epoch 0, Loss: 2.3672053813934326, Validation Loss: 2.3672053813934326
Epoch 1, Loss: 1.8817138671875, Validation Loss: 1.8817138671875
Epoch 2, Loss: 1.5160062313079834, Validation Loss: 1.5160062313079834
Epoch 3, Loss: 1.2371913194656372, Validation Loss: 1.2371913194656372
Epoch 4, Loss: 1.033754825592041, Validation Loss: 1.033754825592041
Trained model state saved to 'graph_autoencoder_model_trained.pt'.


In [33]:
import pandas as pd
import numpy as np
import torch
import lightgbm as lgb
from sklearn.metrics import classification_report
import joblib


def extract_embeddings(model: torch.nn.Module, data: "Data") -> np.ndarray:
    """Run ``model`` on the graph and return its node embeddings as a NumPy array.

    Args:
        model: Module called as ``model(data.x, data.edge_index)``. A graph
            autoencoder returns ``(decoded, encoded)`` and the encoded part is
            used; a model returning a single tensor is used as-is (previously a
            bare tensor was silently unpacked along its first dimension).
        data: Object exposing ``x`` and ``edge_index``.

    Returns:
        The embeddings copied to host memory as ``np.ndarray``.

    Notes:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = model(data.x, data.edge_index)

    if isinstance(outputs, tuple):
        _, encoded_features = outputs
    else:
        encoded_features = outputs

    # .cpu() is required before .numpy() when the model/data live on a GPU.
    return encoded_features.detach().cpu().numpy()


def evaluate_classifier(classifier, X_train: np.ndarray, y_train: np.ndarray) -> "tuple[float, np.ndarray]":
    """Score ``classifier`` on the given samples and print a classification report.

    The metrics describe whatever data is passed in. When that is the data the
    classifier was fitted on, they are training scores, not generalisation
    estimates.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        X_train: Feature rows to predict on.
        y_train: True labels for those rows.

    Returns:
        ``(accuracy, y_pred)``: accuracy as a percentage (0-100) and the
        predicted labels. The previous ``-> None`` annotation was wrong.
    """
    y_pred = classifier.predict(X_train)
    # The dict form is only used to read the overall accuracy.
    report = classification_report(y_train, y_pred, output_dict=True)
    accuracy = report['accuracy'] * 100  # Convert to percentage
    print("Classification Report:\n")
    print(classification_report(y_train, y_pred))
    print(f"Accuracy: {accuracy:.2f}%")

    return accuracy, y_pred


def predict_unlabeled_data(classifier, encoded_features: np.ndarray, num_labeled: int) -> np.ndarray:
    """Predict labels for the rows that follow the labeled block.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        encoded_features: Embedding rows; the first ``num_labeled`` rows are
            skipped.
        num_labeled: Number of leading rows to skip.

    Returns:
        Whatever ``classifier.predict`` returns for the remaining rows.
    """
    return classifier.predict(encoded_features[num_labeled:])


def save_predictions(unlabeled_data: pd.DataFrame, predictions: np.ndarray, filename: str) -> None:
    """Attach predictions to the frame and write it to ``filename`` as CSV.

    The caller's DataFrame is modified in place: a 'predicted_label' column is
    added (or overwritten). The CSV is written without the index.
    """
    unlabeled_data['predicted_label'] = predictions
    unlabeled_data.to_csv(path_or_buf=filename, index=False)
    confirmation = f"Unlabeled data with predicted labels saved to '{filename}'."
    print(confirmation)


def train_and_evaluate(classifier, X_train: np.ndarray, y_train: np.ndarray, model_name: str) -> "tuple[float, np.ndarray]":
    """Fit ``classifier`` and score it on the same samples it was fitted on.

    Args:
        classifier: Estimator exposing ``fit`` and ``predict``; fitted in place.
        X_train: Training feature rows.
        y_train: Training labels.
        model_name: Label used in the progress message.

    Returns:
        ``(accuracy, y_pred)`` as produced by ``evaluate_classifier``. The
        previous ``-> None`` annotation was wrong. The score is measured on the
        training data, so it is not an estimate of generalisation.
    """
    classifier.fit(X_train, y_train)
    print(f"\nEvaluating {model_name}...")
    return evaluate_classifier(classifier, X_train, y_train)


def main():
    """Run the GAE -> LightGBM pipeline and persist predictions and models.

    Reads the notebook-level names ``gae_model``, ``data``,
    ``X_labeled_tensor``, ``y_labeled_tensor`` and ``unlabeled_data``. The
    classifier is fitted on the labeled embeddings and the printed report is
    computed on those same rows. ``unlabeled_data`` receives a
    'predicted_label' column as a side effect of saving.
    """
    # Step 1: embeddings for every node; the labeled rows are the leading block.
    embeddings = extract_embeddings(gae_model, data)
    labeled_count = X_labeled_tensor.size(0)
    labeled_embeddings = embeddings[:labeled_count]
    labeled_targets = y_labeled_tensor.numpy()

    # Step 2: fit LightGBM on the labeled embeddings and print its report.
    booster = lgb.LGBMClassifier(random_state=42)
    _accuracy, _train_pred = train_and_evaluate(
        booster, labeled_embeddings, labeled_targets, "LightGBM Classifier"
    )

    # Step 3: label the remaining (unlabeled) rows.
    unlabeled_pred = predict_unlabeled_data(booster, embeddings, labeled_count)

    # Step 4: write the predictions next to the unlabeled features.
    save_predictions(unlabeled_data, unlabeled_pred, "unlabeled_data_predictions_22_nov.csv")

    # Step 5: persist both models.
    torch.save(gae_model.state_dict(), "gae_model_22_nov.pth")
    print("PyTorch model saved as 'gae_model_22_nov.pth'.")

    joblib.dump(booster, "lgb_classifier_22_nov.pkl")
    print("LightGBM classifier saved as 'lgb_classifier_22_nov.pkl'.")

    # Show the first rows, now including the predicted labels.
    print("\nHead of the new predictions:")
    print(unlabeled_data.head())


# Run the pipeline when this code executes as the top-level module.
if __name__ == "__main__":
    main()
# END OF GAE MODEL

[LightGBM] [Info] Number of positive: 1484, number of negative: 1516
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001502 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16320
[LightGBM] [Info] Number of data points in the train set: 3000, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494667 -> initscore=-0.021334
[LightGBM] [Info] Start training from score -0.021334

Evaluating LightGBM Classifier...
Classification Report:

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1516
           1       0.98      0.99      0.99      1484

    accuracy                           0.99      3000
   macro avg       0.99      0.99      0.99      3000
weighted avg       0.99      0.99      0.99      3000

Accuracy: 98.57%
Unlabeled data with predicted labels save

In [35]:
import numpy as np
import torch
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
import pandas as pd


# NOTE(review): this cell reads `lgb_classifier`, `encoded_features` and
# `num_labeled` from the kernel. In the cells visible here `encoded_features`
# only exists as a local inside functions, and `num_labeled` is assigned at
# notebook level only in a later cell — confirm where these come from before
# relying on a fresh Restart & Run All.
joblib.dump(lgb_classifier, "gae_lightgbm_model.pkl")
print("GAE + LightGBM model saved as 'gae_lightgbm_model.pkl'.")
predicted_labels = predict_unlabeled_data(lgb_classifier, encoded_features, num_labeled)
# Rebinds `unlabeled_data`: from here on it is a frame of embedding columns
# named feature_0..feature_{k-1}, no longer the table read from CSV.
unlabeled_data = pd.DataFrame(encoded_features[num_labeled:], columns=[f"feature_{i}" for i in range(encoded_features.shape[1])])  # Create dataframe for unlabeled data
# save_predictions adds a 'predicted_label' column to that frame and writes the CSV.
save_predictions(unlabeled_data, predicted_labels, "unlabeled_data_predictions_gae.csv")

torch.save(gae_model.state_dict(), "gae_model.pth")
print("GAE model saved as 'gae_model.pth'.")

GAE + LightGBM model saved as 'gae_lightgbm_model.pkl'.
Unlabeled data with predicted labels saved to 'unlabeled_data_predictions_gae.csv'.
GAE model saved as 'gae_model.pth'.


In [65]:
def extract_embeddings(model: torch.nn.Module, data: "Data") -> np.ndarray:
    """Extract embeddings from the GraphSAGE or Graph Autoencoder model.

    Args:
        model: Module called as ``model(data.x, data.edge_index)``. It may
            return a single tensor (GraphSAGE) or a ``(decoded, encoded)``
            pair (graph autoencoder), in which case the encoded part is used.
        data: Object exposing ``x`` and ``edge_index``.

    Returns:
        The embeddings copied to host memory as ``np.ndarray``.

    Notes:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = model(data.x, data.edge_index)

    # If the model is a Graph Autoencoder (GAE), unpack the tuple
    if isinstance(outputs, tuple):
        _, encoded_features = outputs  # GAE returns a tuple (decoded, encoded)
    else:
        encoded_features = outputs  # GraphSAGE returns the embeddings directly

    # .cpu() is required before .numpy() when the model/data live on a GPU
    # (the GraphSAGE training cell moves both to `device`).
    return encoded_features.detach().cpu().numpy()


In [67]:
# Both models are evaluated on the same `data` object, whatever graph the
# kernel currently holds under that name.
# NOTE(review): in the cells above, `sage_model` is trained on the graph loaded
# from 'graphsage_processed_data.pt' while `gae_model` is trained on the
# labeled + unlabeled graph — confirm that applying `sage_model` to this
# `data` is intended and that model and data are on the same device.
graphsage_embeddings = extract_embeddings(sage_model, data)  # From GraphSAGE
gae_embeddings = extract_embeddings(gae_model, data)  # From GAE

num_labeled = X_labeled_tensor.size(0)
# Column-wise concatenation: each node's feature row is [GraphSAGE | GAE].
combined_embeddings = np.concatenate([graphsage_embeddings, gae_embeddings], axis=1)

In [91]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Keep only the leading `num_labeled` rows, which pair with the labeled targets.
X_combined = combined_embeddings[:num_labeled] 
y_combined = y_labeled_tensor.numpy() 

# No test_size is given, so train_test_split's default split fraction applies.
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, random_state=42)

# NOTE: rebinds the notebook-level `lgb_classifier` used by earlier cells.
lgb_classifier = lgb.LGBMClassifier(random_state=42)
lgb_classifier.fit(X_train, y_train)

# Report is computed on the held-out split only.
y_pred = lgb_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 1124, number of negative: 1126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032128 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19890
[LightGBM] [Info] Number of data points in the train set: 2250, number of used features: 78
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499556 -> initscore=-0.001778
[LightGBM] [Info] Start training from score -0.001778
              precision    recall  f1-score   support

           0       0.78      0.73      0.75       390
           1       0.73      0.77      0.75       360

    accuracy                           0.75       750
   macro avg       0.75      0.75      0.75       750
weighted avg       0.75      0.75      0.75       750

