In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.5.3


In [12]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score as calculate_f1_score
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load data
edge_index_train = torch.load('/content/drive/MyDrive/PROJECT/edge_index.pt')
edge_index_test = torch.load('/content/drive/MyDrive/PROJECT/edge_index_test.pt')
edge_index_val = torch.load('/content/drive/MyDrive/PROJECT/edge_index_val.pt')

features_file_train = '/content/drive/MyDrive/PROJECT/feature_matrix_train.txt'
X_train = np.loadtxt(features_file_train)
features_file_test = '/content/drive/MyDrive/PROJECT/feature_matrix_test.txt'
X_test = np.loadtxt(features_file_test)
features_file_val = '/content/drive/MyDrive/PROJECT/feature_matrix_val.txt'
X_val = np.loadtxt(features_file_val)

labels_test = pd.read_csv("/content/drive/MyDrive/PROJECT/test_filtered.csv")
labels_train = pd.read_csv("/content/drive/MyDrive/PROJECT/train_filtered.csv")
labels_val = pd.read_csv("/content/drive/MyDrive/PROJECT/val_filtered.csv")

y_train = torch.tensor(labels_train['label'].values, dtype=torch.long).to(device)
y_val = torch.tensor(labels_val['label'].values, dtype=torch.long).to(device)
y_true = torch.tensor(labels_test['label'].values, dtype=torch.long).to(device)

# Preprocess features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)

X_test_scaled = scaler.transform(X_test)
X_test = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

X_val_scaled = scaler.transform(X_val)
X_val = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)

# Define the GraphSAGE model
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=-1)

# Initialize the GraphSAGE model
model = GraphSAGE(in_channels=X_train.shape[1], hidden_channels=128, out_channels=13)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Convert data to appropriate format
edge_index_train = edge_index_train.to(device)
edge_index_val = edge_index_val.to(device)
edge_index_test = edge_index_test.to(device)

# Training
def train(model, optimizer, criterion, X_train, edge_train, y_train, X_val, edge_val, y_val, epochs=500, patience=100):
    best_val_loss = float('inf')
    best_val_acc = 0.0
    current_patience = 0
    train_losses = []
    val_losses = []

    train_f1_scores = []  # Initialize list for training F1 scores
    epochss = []

    for epoch in range(epochs):
        epochss.append(epoch)
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train, edge_train)
        loss = criterion(outputs, y_train)
        train_losses.append(loss.cpu().item())
        loss.backward()
        optimizer.step()

        # Compute training accuracy
        _, predicted_train = torch.max(outputs, 1)
        train_acc = torch.sum(predicted_train == y_train).item() / len(y_train)
        # Calculate training F1 score
        train_f1 = calculate_f1_score(y_train.cpu().numpy(), predicted_train.cpu().numpy(), average='weighted')


        # Save training F1 score
        train_f1_scores.append(train_f1)



        # Validation
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val, edge_val)
            val_loss = criterion(val_outputs, y_val)
            val_losses.append(val_loss)
            # Compute validation accuracy
            _, predicted_val = torch.max(val_outputs, 1)
            val_acc = torch.sum(predicted_val == y_val).item() / len(y_val)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pt')
            current_patience = 0
        else:
            current_patience += 1
            if current_patience >= patience:
                print(f'Early stopping at epoch {epoch}')
                break

        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item()}, Train Acc: {train_acc:.4f},train F1-score:{train_f1:.4f} Val Loss: {val_loss.item()}, Val Acc: {val_acc:.4f}')

    np.savetxt('/content/drive/MyDrive/PROJECT/train_f1_scores_GS.txt',train_f1_scores)
    np.savetxt('/content/drive/MyDrive/PROJECT/train_loss_GS.txt', train_losses)
    np.savetxt('/content/drive/MyDrive/PROJECT/epochs_GS.txt', epochss)






# Convert data to appropriate format
edge_train = edge_index_train.to(device)
edge_val = edge_index_val.to(device)
edge_test = edge_index_test.to(device)

# Train the model
train(model, optimizer, criterion, X_train, edge_train, y_train, X_val, edge_val, y_val, epochs=500, patience=100)

# Load the best model
model.load_state_dict(torch.load('best_model.pt'))

# Testing
model.eval()
with torch.no_grad():
    test_outputs = model(X_test, edge_test)
    test_loss = criterion(test_outputs, y_true)
    _, predicted = torch.max(test_outputs, 1)
    accuracy = torch.sum(predicted == y_true).item() / len(y_true)

print(f'Test Loss: {test_loss.item()}, Test Accuracy: {accuracy}')

from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

# Calculate precision, recall, F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true.cpu().numpy(), predicted.cpu().numpy(), average='weighted')

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}')

with open('/content/drive/MyDrive/PROJECT/f1_scores_GS.txt', 'w') as file:
    file.write(f'{f1_score:.4f}')

print("F1-score saved to file.")




Using device: cpu
Epoch [1/500], Loss: 2.571683645248413, Train Acc: 0.0835,train F1-score:0.1311 Val Loss: 1.9960720539093018, Val Acc: 0.6412
Epoch [2/500], Loss: 1.956561803817749, Train Acc: 0.6018,train F1-score:0.5563 Val Loss: 1.5173274278640747, Val Acc: 0.6613
Epoch [3/500], Loss: 1.5179853439331055, Train Acc: 0.6326,train F1-score:0.5715 Val Loss: 1.1520590782165527, Val Acc: 0.6908
Epoch [4/500], Loss: 1.2453346252441406, Train Acc: 0.6464,train F1-score:0.5632 Val Loss: 0.9770436882972717, Val Acc: 0.7021
Epoch [5/500], Loss: 1.1177735328674316, Train Acc: 0.6563,train F1-score:0.5581 Val Loss: 0.9197558760643005, Val Acc: 0.7041
Epoch [6/500], Loss: 1.0572932958602905, Train Acc: 0.6632,train F1-score:0.5699 Val Loss: 0.892148494720459, Val Acc: 0.7296
Epoch [7/500], Loss: 1.0082656145095825, Train Acc: 0.6751,train F1-score:0.6062 Val Loss: 0.8791957497596741, Val Acc: 0.7229
Epoch [8/500], Loss: 0.9709280729293823, Train Acc: 0.6784,train F1-score:0.6361 Val Loss: 0.870

  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the best GIN model
model.load_state_dict(torch.load('best_model.pt'))

# Get the output of the GIN model for the training, validation, and test sets
model.eval()
with torch.no_grad():
    train_outputs = model(X_train, edge_index_train).cpu().numpy()
    val_outputs = model(X_val, edge_index_val).cpu().numpy()
    test_outputs = model(X_test, edge_index_test).cpu().numpy()

# Train a decision tree classifier
decision_tree = DecisionTreeClassifier(max_depth=8)
decision_tree.fit(train_outputs, y_train.cpu().numpy())

# Predict labels using the decision tree
train_pred = decision_tree.predict(train_outputs)
val_pred = decision_tree.predict(val_outputs)
test_pred = decision_tree.predict(test_outputs)

# Evaluate decision tree performance
train_acc = accuracy_score(y_train.cpu().numpy(), train_pred)
val_acc = accuracy_score(y_val.cpu().numpy(), val_pred)
test_acc = accuracy_score(y_true.cpu().numpy(), test_pred)

print(f'Training Accuracy (Decision Tree): {train_acc}')
print(f'Validation Accuracy (Decision Tree): {val_acc}')
print(f'Test Accuracy (Decision Tree): {test_acc}')
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

# Calculate precision, recall, F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true.cpu().numpy(), test_pred, average='weighted')

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}')

with open('/content/drive/MyDrive/PROJECT/f1_scores_GS_DT.txt', 'w') as file:
    file.write(f'{f1_score:.4f}')

print("F1-score saved to file.")



Training Accuracy (Decision Tree): 0.957126109529392
Validation Accuracy (Decision Tree): 0.8982597054886211
Test Accuracy (Decision Tree): 0.9054959785522788
Precision: 0.9002, Recall: 0.9055, F1-score: 0.9013
F1-score saved to file.


  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the best GIN model
model.load_state_dict(torch.load('best_model.pt'))

# Get the output of the GIN model for the training, validation, and test sets
model.eval()
with torch.no_grad():
    train_outputs = model(X_train, edge_index_train).cpu().numpy()
    val_outputs = model(X_val, edge_index_val).cpu().numpy()
    test_outputs = model(X_test, edge_index_test).cpu().numpy()

# Concatenate the GIN model output with the original feature matrices
X_train_combined = np.concatenate((X_train.cpu().numpy(), train_outputs), axis=1)
X_val_combined = np.concatenate((X_val.cpu().numpy(), val_outputs), axis=1)
X_test_combined = np.concatenate((X_test.cpu().numpy(), test_outputs), axis=1)

# Train a decision tree classifier
decision_tree = DecisionTreeClassifier(max_depth=10)
decision_tree.fit(X_train_combined, y_train.cpu().numpy())

# Predict labels using the decision tree
train_pred = decision_tree.predict(X_train_combined)
val_pred = decision_tree.predict(X_val_combined)
test_pred = decision_tree.predict(X_test_combined)

# Evaluate decision tree performance
train_acc = accuracy_score(y_train.cpu().numpy(), train_pred)
val_acc = accuracy_score(y_val.cpu().numpy(), val_pred)
test_acc = accuracy_score(y_true.cpu().numpy(), test_pred)

print(f'Training Accuracy (Decision Tree): {train_acc}')
print(f'Validation Accuracy (Decision Tree): {val_acc}')
print(f'Test Accuracy (Decision Tree): {test_acc}')

from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

# Calculate precision, recall, F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true.cpu().numpy(), test_pred, average='weighted')

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}')

with open('/content/drive/MyDrive/PROJECT/f1_scores_GS_DT_COMB.txt', 'w') as file:
    file.write(f'{f1_score:.4f}')

print("F1-score saved to file.")


Training Accuracy (Decision Tree): 0.9789817451013231
Validation Accuracy (Decision Tree): 0.9089692101740294
Test Accuracy (Decision Tree): 0.9175603217158177
Precision: 0.9162, Recall: 0.9176, F1-score: 0.9144
F1-score saved to file.


  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the best GIN model
model.load_state_dict(torch.load('best_model.pt'))

# Get the output of the GIN model for the training, validation, and test sets
model.eval()
with torch.no_grad():
    train_outputs = model(X_train, edge_index_train).cpu().numpy()
    val_outputs = model(X_val, edge_index_val).cpu().numpy()
    test_outputs = model(X_test, edge_index_test).cpu().numpy()

# Train an SVM classifier
svm_classifier = SVC(kernel='linear')  # You can specify different kernel functions (e.g., 'linear', 'poly', 'rbf', etc.)
svm_classifier.fit(train_outputs, y_train.cpu().numpy())

# Predict labels using the SVM classifier
train_pred = svm_classifier.predict(train_outputs)
val_pred = svm_classifier.predict(val_outputs)
test_pred = svm_classifier.predict(test_outputs)

# Evaluate SVM performance
train_acc = accuracy_score(y_train.cpu().numpy(), train_pred)
val_acc = accuracy_score(y_val.cpu().numpy(), val_pred)
test_acc = accuracy_score(y_true.cpu().numpy(), test_pred)

print(f'Training Accuracy (SVM): {train_acc}')
print(f'Validation Accuracy (SVM): {val_acc}')
print(f'Test Accuracy (SVM): {test_acc}')

from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

# Calculate precision, recall, F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true.cpu().numpy(),test_pred, average='weighted')

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}')

with open('/content/drive/MyDrive/PROJECT/f1_scores_GS_SVM.txt', 'w') as file:
    file.write(f'{f1_score:.4f}')

print("F1-score saved to file.")


Training Accuracy (SVM): 0.9395411153910568
Validation Accuracy (SVM): 0.8975903614457831
Test Accuracy (SVM): 0.9054959785522788
Precision: 0.9021, Recall: 0.9055, F1-score: 0.9026
F1-score saved to file.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the best GIN model
model.load_state_dict(torch.load('best_model.pt'))

# Get the output of the GIN model for the training, validation, and test sets
model.eval()
with torch.no_grad():
    train_outputs = model(X_train, edge_index_train).cpu().numpy()
    val_outputs = model(X_val, edge_index_val).cpu().numpy()
    test_outputs = model(X_test, edge_index_test).cpu().numpy()

# Combine the output of the GIN model with the original features
X_train_combined = np.concatenate((X_train.cpu().numpy(), train_outputs), axis=1)
X_val_combined = np.concatenate((X_val.cpu().numpy(), val_outputs), axis=1)
X_test_combined = np.concatenate((X_test.cpu().numpy(), test_outputs), axis=1)

# Train an SVM classifier
svm_classifier = SVC(kernel='linear')  # You can specify different kernel functions (e.g., 'linear', 'poly', 'rbf', etc.)
svm_classifier.fit(X_train_combined, y_train.cpu().numpy())

# Predict labels using the SVM classifier
train_pred = svm_classifier.predict(X_train_combined)
val_pred = svm_classifier.predict(X_val_combined)
test_pred = svm_classifier.predict(X_test_combined)

# Evaluate SVM performance
train_acc = accuracy_score(y_train.cpu().numpy(), train_pred)
val_acc = accuracy_score(y_val.cpu().numpy(), val_pred)
test_acc = accuracy_score(y_true.cpu().numpy(), test_pred)

print(f'Training Accuracy (SVM): {train_acc}')
print(f'Validation Accuracy (SVM): {val_acc}')
print(f'Test Accuracy (SVM): {test_acc}')

from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

# Calculate precision, recall, F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true.cpu().numpy(), test_pred, average='weighted')

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}')
with open('/content/drive/MyDrive/PROJECT/f1_scores_GS_SVM_COMB.txt', 'w') as file:
    file.write(f'{f1_score:.4f}')

print("F1-score saved to file.")



Training Accuracy (SVM): 0.9510132306146374
Validation Accuracy (SVM): 0.8995983935742972
Test Accuracy (SVM): 0.9121983914209115
Precision: 0.9080, Recall: 0.9122, F1-score: 0.9091
F1-score saved to file.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score as calculate_f1_score
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load data
edge_index_train = torch.load('/content/drive/MyDrive/PROJECT/edge_index.pt')
edge_index_test = torch.load('/content/drive/MyDrive/PROJECT/edge_index_test.pt')
edge_index_val = torch.load('/content/drive/MyDrive/PROJECT/edge_index_val.pt')

features_file_train = '/content/drive/MyDrive/PROJECT/feature_matrix_train.txt'
X_train = np.loadtxt(features_file_train)
features_file_test = '/content/drive/MyDrive/PROJECT/feature_matrix_test.txt'
X_test = np.loadtxt(features_file_test)
features_file_val = '/content/drive/MyDrive/PROJECT/feature_matrix_val.txt'
X_val = np.loadtxt(features_file_val)

labels_test = pd.read_csv("/content/drive/MyDrive/PROJECT/test_filtered.csv")
labels_train = pd.read_csv("/content/drive/MyDrive/PROJECT/train_filtered.csv")
labels_val = pd.read_csv("/content/drive/MyDrive/PROJECT/val_filtered.csv")

y_train = torch.tensor(labels_train['label'].values, dtype=torch.long).to(device)
y_val = torch.tensor(labels_val['label'].values, dtype=torch.long).to(device)
y_true = torch.tensor(labels_test['label'].values, dtype=torch.long).to(device)

# Preprocess features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)

X_test_scaled = scaler.transform(X_test)
X_test = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

X_val_scaled = scaler.transform(X_val)
X_val = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)

class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.convs = torch.nn.ModuleList([SAGEConv(hidden_channels, hidden_channels) for _ in range(num_layers - 2)])
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        for conv in self.convs:
            x = F.relu(conv(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=-1)
# Initialize the GraphSAGE model
model = GraphSAGE(in_channels=X_train.shape[1], hidden_channels=128, out_channels=13,num_layers=1)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Convert data to appropriate format
edge_index_train = edge_index_train.to(device)
edge_index_val = edge_index_val.to(device)
edge_index_test = edge_index_test.to(device)

# Training
def train(model, optimizer, criterion, X_train, edge_train, y_train, X_val, edge_val, y_val, epochs=500, patience=100):
    best_val_loss = float('inf')
    best_val_acc = 0.0
    current_patience = 0
    train_losses = []
    val_losses = []

    train_f1_scores = []  # Initialize list for training F1 scores
    epochss = []

    for epoch in range(epochs):
        epochss.append(epoch)
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train, edge_train)
        loss = criterion(outputs, y_train)
        train_losses.append(loss.cpu().item())
        loss.backward()
        optimizer.step()

        # Compute training accuracy
        _, predicted_train = torch.max(outputs, 1)
        train_acc = torch.sum(predicted_train == y_train).item() / len(y_train)
        # Calculate training F1 score
        train_f1 = calculate_f1_score(y_train.cpu().numpy(), predicted_train.cpu().numpy(), average='weighted')


        # Save training F1 score
        train_f1_scores.append(train_f1)



        # Validation
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val, edge_val)
            val_loss = criterion(val_outputs, y_val)
            val_losses.append(val_loss)
            # Compute validation accuracy
            _, predicted_val = torch.max(val_outputs, 1)
            val_acc = torch.sum(predicted_val == y_val).item() / len(y_val)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model1.pt')
            current_patience = 0
        else:
            current_patience += 1
            if current_patience >= patience:
                print(f'Early stopping at epoch {epoch}')
                break

        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item()}, Train Acc: {train_acc:.4f},train F1-score:{train_f1:.4f} Val Loss: {val_loss.item()}, Val Acc: {val_acc:.4f}')

    np.savetxt('/content/drive/MyDrive/PROJECT/layer1/train_f1_scores_GS.txt',train_f1_scores)
    np.savetxt('/content/drive/MyDrive/PROJECT/layer1/train_loss_GS.txt', train_losses)
    np.savetxt('/content/drive/MyDrive/PROJECT/layer1/epochs_GS.txt', epochss)






# Convert data to appropriate format
edge_train = edge_index_train.to(device)
edge_val = edge_index_val.to(device)
edge_test = edge_index_test.to(device)

# Train the model
train(model, optimizer, criterion, X_train, edge_train, y_train, X_val, edge_val, y_val, epochs=500, patience=100)

# Load the best model
model.load_state_dict(torch.load('best_model1.pt'))

# Testing
model.eval()
with torch.no_grad():
    test_outputs = model(X_test, edge_test)
    test_loss = criterion(test_outputs, y_true)
    _, predicted = torch.max(test_outputs, 1)
    accuracy = torch.sum(predicted == y_true).item() / len(y_true)

print(f'Test Loss: {test_loss.item()}, Test Accuracy: {accuracy}')

from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

# Calculate precision, recall, F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true.cpu().numpy(), predicted.cpu().numpy(), average='weighted')

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}')

with open('/content/drive/MyDrive/PROJECT/layer1/f1_score_GS.txt', 'w') as file:
    file.write(f'{f1_score:.4f}')

print("F1-score saved to file.")



Using device: cpu
Epoch [1/500], Loss: 2.630472183227539, Train Acc: 0.0605,train F1-score:0.0938 Val Loss: 2.056767702102661, Val Acc: 0.6975
Epoch [2/500], Loss: 2.0232532024383545, Train Acc: 0.6161,train F1-score:0.5896 Val Loss: 1.5753557682037354, Val Acc: 0.6941
Epoch [3/500], Loss: 1.5751813650131226, Train Acc: 0.6563,train F1-score:0.6057 Val Loss: 1.1893659830093384, Val Acc: 0.7021
Epoch [4/500], Loss: 1.257482647895813, Train Acc: 0.6661,train F1-score:0.6030 Val Loss: 0.9811868667602539, Val Acc: 0.7122
Epoch [5/500], Loss: 1.0920981168746948, Train Acc: 0.6717,train F1-score:0.6012 Val Loss: 0.9069438576698303, Val Acc: 0.7209
Epoch [6/500], Loss: 1.0436480045318604, Train Acc: 0.6764,train F1-score:0.6089 Val Loss: 0.878046452999115, Val Acc: 0.7396
Epoch [7/500], Loss: 1.033139944076538, Train Acc: 0.6818,train F1-score:0.6254 Val Loss: 0.8619298338890076, Val Acc: 0.7356
Epoch [8/500], Loss: 0.9932178854942322, Train Acc: 0.6800,train F1-score:0.6381 Val Loss: 0.84481

  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score as calculate_f1_score
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load data
edge_index_train = torch.load('/content/drive/MyDrive/PROJECT/edge_index.pt')
edge_index_test = torch.load('/content/drive/MyDrive/PROJECT/edge_index_test.pt')
edge_index_val = torch.load('/content/drive/MyDrive/PROJECT/edge_index_val.pt')

features_file_train = '/content/drive/MyDrive/PROJECT/feature_matrix_train.txt'
X_train = np.loadtxt(features_file_train)
features_file_test = '/content/drive/MyDrive/PROJECT/feature_matrix_test.txt'
X_test = np.loadtxt(features_file_test)
features_file_val = '/content/drive/MyDrive/PROJECT/feature_matrix_val.txt'
X_val = np.loadtxt(features_file_val)

labels_test = pd.read_csv("/content/drive/MyDrive/PROJECT/test_filtered.csv")
labels_train = pd.read_csv("/content/drive/MyDrive/PROJECT/train_filtered.csv")
labels_val = pd.read_csv("/content/drive/MyDrive/PROJECT/val_filtered.csv")

y_train = torch.tensor(labels_train['label'].values, dtype=torch.long).to(device)
y_val = torch.tensor(labels_val['label'].values, dtype=torch.long).to(device)
y_true = torch.tensor(labels_test['label'].values, dtype=torch.long).to(device)

# Preprocess features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)

X_test_scaled = scaler.transform(X_test)
X_test = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

X_val_scaled = scaler.transform(X_val)
X_val = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)

class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.convs = torch.nn.ModuleList([SAGEConv(hidden_channels, hidden_channels) for _ in range(num_layers - 2)])
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        for conv in self.convs:
            x = F.relu(conv(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=-1)
# Initialize the GraphSAGE model
model = GraphSAGE(in_channels=X_train.shape[1], hidden_channels=128, out_channels=13,num_layers=3)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Convert data to appropriate format
edge_index_train = edge_index_train.to(device)
edge_index_val = edge_index_val.to(device)
edge_index_test = edge_index_test.to(device)

# Training
def train(model, optimizer, criterion, X_train, edge_train, y_train, X_val, edge_val, y_val, epochs=500, patience=100):
    best_val_loss = float('inf')
    best_val_acc = 0.0
    current_patience = 0
    train_losses = []
    val_losses = []

    train_f1_scores = []  # Initialize list for training F1 scores
    epochss = []

    for epoch in range(epochs):
        epochss.append(epoch)
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train, edge_train)
        loss = criterion(outputs, y_train)
        train_losses.append(loss.cpu().item())
        loss.backward()
        optimizer.step()

        # Compute training accuracy
        _, predicted_train = torch.max(outputs, 1)
        train_acc = torch.sum(predicted_train == y_train).item() / len(y_train)
        # Calculate training F1 score
        train_f1 = calculate_f1_score(y_train.cpu().numpy(), predicted_train.cpu().numpy(), average='weighted')


        # Save training F1 score
        train_f1_scores.append(train_f1)



        # Validation
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val, edge_val)
            val_loss = criterion(val_outputs, y_val)
            val_losses.append(val_loss)
            # Compute validation accuracy
            _, predicted_val = torch.max(val_outputs, 1)
            val_acc = torch.sum(predicted_val == y_val).item() / len(y_val)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model3.pt')
            current_patience = 0
        else:
            current_patience += 1
            if current_patience >= patience:
                print(f'Early stopping at epoch {epoch}')
                break

        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item()}, Train Acc: {train_acc:.4f},train F1-score:{train_f1:.4f} Val Loss: {val_loss.item()}, Val Acc: {val_acc:.4f}')

    np.savetxt('/content/drive/MyDrive/PROJECT/layer3/train_f1_scores_GS.txt',train_f1_scores)
    np.savetxt('/content/drive/MyDrive/PROJECT/layer3/train_loss_GS.txt', train_losses)
    np.savetxt('/content/drive/MyDrive/PROJECT/layer3/epochs_GS.txt', epochss)






# Convert data to appropriate format
edge_train = edge_index_train.to(device)
edge_val = edge_index_val.to(device)
edge_test = edge_index_test.to(device)

# Train the model
train(model, optimizer, criterion, X_train, edge_train, y_train, X_val, edge_val, y_val, epochs=500, patience=100)

# Load the best model
model.load_state_dict(torch.load('best_model3.pt'))

# Testing
model.eval()
with torch.no_grad():
    test_outputs = model(X_test, edge_test)
    test_loss = criterion(test_outputs, y_true)
    _, predicted = torch.max(test_outputs, 1)
    accuracy = torch.sum(predicted == y_true).item() / len(y_true)

print(f'Test Loss: {test_loss.item()}, Test Accuracy: {accuracy}')

from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

# Calculate precision, recall, F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true.cpu().numpy(), predicted.cpu().numpy(), average='weighted')

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}')

with open('/content/drive/MyDrive/PROJECT/layer3/f1_score_GS.txt', 'w') as file:
    file.write(f'{f1_score:.4f}')

print("F1-score saved to file.")



Using device: cpu
Epoch [1/500], Loss: 2.62652325630188, Train Acc: 0.0355,train F1-score:0.0289 Val Loss: 2.151719331741333, Val Acc: 0.6700
Epoch [2/500], Loss: 2.122615337371826, Train Acc: 0.6061,train F1-score:0.5161 Val Loss: 1.5809277296066284, Val Acc: 0.6787
Epoch [3/500], Loss: 1.6516495943069458, Train Acc: 0.6429,train F1-score:0.5086 Val Loss: 1.182791829109192, Val Acc: 0.6787
Epoch [4/500], Loss: 1.381618618965149, Train Acc: 0.6439,train F1-score:0.5080 Val Loss: 1.059310793876648, Val Acc: 0.6760
Epoch [5/500], Loss: 1.2540559768676758, Train Acc: 0.6197,train F1-score:0.5310 Val Loss: 1.0099374055862427, Val Acc: 0.6934
Epoch [6/500], Loss: 1.1960561275482178, Train Acc: 0.6014,train F1-score:0.5458 Val Loss: 0.9757529497146606, Val Acc: 0.6948
Epoch [7/500], Loss: 1.1229907274246216, Train Acc: 0.6476,train F1-score:0.5687 Val Loss: 0.9263303279876709, Val Acc: 0.7182
Epoch [8/500], Loss: 1.0631680488586426, Train Acc: 0.6635,train F1-score:0.5872 Val Loss: 0.8725459

  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score as calculate_f1_score
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load data
edge_index_train = torch.load('/content/drive/MyDrive/PROJECT/edge_index.pt')
edge_index_test = torch.load('/content/drive/MyDrive/PROJECT/edge_index_test.pt')
edge_index_val = torch.load('/content/drive/MyDrive/PROJECT/edge_index_val.pt')

features_file_train = '/content/drive/MyDrive/PROJECT/feature_matrix_train.txt'
X_train = np.loadtxt(features_file_train)
features_file_test = '/content/drive/MyDrive/PROJECT/feature_matrix_test.txt'
X_test = np.loadtxt(features_file_test)
features_file_val = '/content/drive/MyDrive/PROJECT/feature_matrix_val.txt'
X_val = np.loadtxt(features_file_val)

labels_test = pd.read_csv("/content/drive/MyDrive/PROJECT/test_filtered.csv")
labels_train = pd.read_csv("/content/drive/MyDrive/PROJECT/train_filtered.csv")
labels_val = pd.read_csv("/content/drive/MyDrive/PROJECT/val_filtered.csv")

y_train = torch.tensor(labels_train['label'].values, dtype=torch.long).to(device)
y_val = torch.tensor(labels_val['label'].values, dtype=torch.long).to(device)
y_true = torch.tensor(labels_test['label'].values, dtype=torch.long).to(device)

# Preprocess features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)

X_test_scaled = scaler.transform(X_test)
X_test = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

X_val_scaled = scaler.transform(X_val)
X_val = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)

class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.convs = torch.nn.ModuleList([SAGEConv(hidden_channels, hidden_channels) for _ in range(num_layers - 2)])
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        for conv in self.convs:
            x = F.relu(conv(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=-1)
# Initialize the GraphSAGE model
model = GraphSAGE(in_channels=X_train.shape[1], hidden_channels=128, out_channels=13,num_layers=4)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Convert data to appropriate format
edge_index_train = edge_index_train.to(device)
edge_index_val = edge_index_val.to(device)
edge_index_test = edge_index_test.to(device)

# Training
def train(model, optimizer, criterion, X_train, edge_train, y_train, X_val, edge_val, y_val, epochs=500, patience=100):
    best_val_loss = float('inf')
    best_val_acc = 0.0
    current_patience = 0
    train_losses = []
    val_losses = []

    train_f1_scores = []  # Initialize list for training F1 scores
    epochss = []

    for epoch in range(epochs):
        epochss.append(epoch)
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train, edge_train)
        loss = criterion(outputs, y_train)
        train_losses.append(loss.cpu().item())
        loss.backward()
        optimizer.step()

        # Compute training accuracy
        _, predicted_train = torch.max(outputs, 1)
        train_acc = torch.sum(predicted_train == y_train).item() / len(y_train)
        # Calculate training F1 score
        train_f1 = calculate_f1_score(y_train.cpu().numpy(), predicted_train.cpu().numpy(), average='weighted')


        # Save training F1 score
        train_f1_scores.append(train_f1)



        # Validation
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val, edge_val)
            val_loss = criterion(val_outputs, y_val)
            val_losses.append(val_loss)
            # Compute validation accuracy
            _, predicted_val = torch.max(val_outputs, 1)
            val_acc = torch.sum(predicted_val == y_val).item() / len(y_val)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model4.pt')
            current_patience = 0
        else:
            current_patience += 1
            if current_patience >= patience:
                print(f'Early stopping at epoch {epoch}')
                break

        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item()}, Train Acc: {train_acc:.4f},train F1-score:{train_f1:.4f} Val Loss: {val_loss.item()}, Val Acc: {val_acc:.4f}')

    np.savetxt('/content/drive/MyDrive/PROJECT/layer4/train_f1_scores_GS.txt',train_f1_scores)
    np.savetxt('/content/drive/MyDrive/PROJECT/layer4/train_loss_GS.txt', train_losses)
    np.savetxt('/content/drive/MyDrive/PROJECT/layer4/epochs_GS.txt', epochss)






# Convert data to appropriate format
edge_train = edge_index_train.to(device)
edge_val = edge_index_val.to(device)
edge_test = edge_index_test.to(device)

# Train the model
train(model, optimizer, criterion, X_train, edge_train, y_train, X_val, edge_val, y_val, epochs=500, patience=100)

# Load the best model
model.load_state_dict(torch.load('best_model4.pt'))

# Testing
model.eval()
with torch.no_grad():
    test_outputs = model(X_test, edge_test)
    test_loss = criterion(test_outputs, y_true)
    _, predicted = torch.max(test_outputs, 1)
    accuracy = torch.sum(predicted == y_true).item() / len(y_true)

print(f'Test Loss: {test_loss.item()}, Test Accuracy: {accuracy}')

from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

# Calculate precision, recall, F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true.cpu().numpy(), predicted.cpu().numpy(), average='weighted')

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}')

with open('/content/drive/MyDrive/PROJECT/layer4/f1_score_GS.txt', 'w') as file:
    file.write(f'{f1_score:.4f}')

print("F1-score saved to file.")



Using device: cpu
Epoch [1/500], Loss: 2.560387134552002, Train Acc: 0.0286,train F1-score:0.0298 Val Loss: 1.93503999710083, Val Acc: 0.6787
Epoch [2/500], Loss: 1.9610605239868164, Train Acc: 0.6425,train F1-score:0.5047 Val Loss: 1.3121299743652344, Val Acc: 0.6780
Epoch [3/500], Loss: 1.5157147645950317, Train Acc: 0.6406,train F1-score:0.5051 Val Loss: 1.1936206817626953, Val Acc: 0.6446
Epoch [4/500], Loss: 1.4234930276870728, Train Acc: 0.5325,train F1-score:0.4844 Val Loss: 1.052481770515442, Val Acc: 0.6760
Epoch [5/500], Loss: 1.1843818426132202, Train Acc: 0.6219,train F1-score:0.5088 Val Loss: 1.0488173961639404, Val Acc: 0.6787
Epoch [6/500], Loss: 1.1609793901443481, Train Acc: 0.6419,train F1-score:0.5064 Val Loss: 1.0263724327087402, Val Acc: 0.6787
Epoch [7/500], Loss: 1.1369439363479614, Train Acc: 0.6430,train F1-score:0.5050 Val Loss: 1.0014164447784424, Val Acc: 0.6787
Epoch [8/500], Loss: 1.0989017486572266, Train Acc: 0.6436,train F1-score:0.5055 Val Loss: 0.9679

  _warn_prf(average, modifier, msg_start, len(result))
