In [1]:
# Mount Google Drive at /content/drive; the later cells read and write files under that mount point.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.5.3


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import TransformerConv
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score as calculate_f1_score
# Check if GPU is available; every tensor created below is placed on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Single place to change where the project files live on the mounted Drive.
DATA_DIR = '/content/drive/MyDrive/PROJECT'

# Load data
# map_location lets tensors that were saved from a GPU session load on a
# CPU-only runtime (and vice versa).
edge_index_train = torch.load(f'{DATA_DIR}/edge_index.pt', map_location=device)
edge_index_test = torch.load(f'{DATA_DIR}/edge_index_test.pt', map_location=device)
edge_index_val = torch.load(f'{DATA_DIR}/edge_index_val.pt', map_location=device)

features_file_train = f'{DATA_DIR}/feature_matrix_train.txt'
X_train = np.loadtxt(features_file_train)
features_file_test = f'{DATA_DIR}/feature_matrix_test.txt'
X_test = np.loadtxt(features_file_test)
features_file_val = f'{DATA_DIR}/feature_matrix_val.txt'
X_val = np.loadtxt(features_file_val)

labels_test = pd.read_csv(f"{DATA_DIR}/test_filtered.csv")
labels_train = pd.read_csv(f"{DATA_DIR}/train_filtered.csv")
labels_val = pd.read_csv(f"{DATA_DIR}/val_filtered.csv")

# Fail here, with a readable message, if a feature matrix and its label file
# are misaligned, instead of deep inside the loss computation later on.
assert len(X_train) == len(labels_train), "train: feature rows and labels differ in length"
assert len(X_val) == len(labels_val), "val: feature rows and labels differ in length"
assert len(X_test) == len(labels_test), "test: feature rows and labels differ in length"

y_train = torch.tensor(labels_train['label'].values, dtype=torch.long).to(device)
y_val = torch.tensor(labels_val['label'].values, dtype=torch.long).to(device)
y_true = torch.tensor(labels_test['label'].values, dtype=torch.long).to(device)

# Preprocess features
# The scaler is fitted on the training split only and then re-used for the
# validation and test splits, so no statistics leak from held-out data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)

X_test_scaled = scaler.transform(X_test)
X_test = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

X_val_scaled = scaler.transform(X_val)
X_val = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)

class TransformerNet(nn.Module):
    """Transformer encoder followed by a ReLU and a linear output layer.

    Args:
        num_features: Size of each input feature vector. It is used directly as
            the encoder's ``d_model`` and must therefore be divisible by
            ``nhead`` (PyTorch raises otherwise).
        hidden_dim: Width of the feed-forward sub-layer in each encoder layer.
        num_layers: Number of stacked encoder layers.
        output_dim: Number of output scores produced for every input row.
        nhead: Number of attention heads. Defaults to 4, the value that was
            previously hard-coded.
    """

    def __init__(self, num_features, hidden_dim, num_layers, output_dim, nhead=4):
        super().__init__()

        encoder_layers = nn.TransformerEncoderLayer(
            d_model=num_features,
            nhead=nhead,
            dim_feedforward=hidden_dim,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        # Maps each encoded row back to one score per output class.
        self.fc = nn.Linear(num_features, output_dim)

    def forward(self, src):
        """Return one ``output_dim``-sized score vector per input row.

        Args:
            src: Float tensor whose last dimension is ``num_features``.
                NOTE(review): per the PyTorch docs a 2-D input
                ``(rows, num_features)`` is handled as a single unbatched
                sequence, so every row attends to every other row -- confirm
                this is the intended behaviour for the data passed in.

        Returns:
            Tensor with the same leading dimensions as ``src`` and a last
            dimension of ``output_dim`` (raw scores, no softmax applied).
        """
        output = self.transformer_encoder(src)
        # Element-wise non-linearity on the encoder output; no indexing or
        # pooling over positions happens here.
        output = F.relu(output)
        output = self.fc(output)
        return output




# Train the model
def train_model(model, optimizer, criterion, X_train, edge_index_train, y_train, X_val, edge_index_val, y_val, epochs=200, patience=10,
                checkpoint_path='best_transformer_model.pt', metrics_dir='/content/drive/MyDrive/PROJECT'):
    """Train ``model`` full-batch with early stopping on the validation loss.

    Args:
        model: Module called as ``model(X)`` and returning per-row class scores.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss taking ``(log-probabilities, targets)``.
        X_train, y_train: Training inputs and integer class targets.
        X_val, y_val: Validation inputs and integer class targets.
        edge_index_train, edge_index_val: Accepted for interface
            compatibility; not used by this function.
        epochs: Maximum number of epochs.
        patience: Number of consecutive epochs without a validation-loss
            improvement after which training stops.
        checkpoint_path: Where the best ``state_dict`` is saved.
        metrics_dir: Directory that receives ``train_f1_scores_GTN.txt``,
            ``train_loss_GTN.txt`` and ``epochs_GTN.txt``.

    Side effects:
        Overwrites ``checkpoint_path`` whenever the validation loss improves
        and rewrites the three metric files after every epoch, including the
        epoch that triggers early stopping.
    """
    best_val_loss = float('inf')
    current_patience = 0
    train_losses = []
    train_f1_scores = []  # weighted F1 on the training set, one entry per epoch
    epochss = []

    for epoch in range(epochs):
        epochss.append(epoch)

        # ---- training step ----
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train)
        # log_softmax is idempotent, so applying it here is correct both for
        # NLLLoss and for CrossEntropyLoss (which applies it again internally).
        loss = criterion(F.log_softmax(outputs, dim=1), y_train)
        loss.backward()
        optimizer.step()
        train_loss = loss.item()
        train_losses.append(train_loss)

        # Training metrics are computed from the pre-step forward pass.
        _, predicted_train = torch.max(outputs, 1)
        train_acc = torch.sum(predicted_train == y_train).item() / len(y_train)
        train_f1 = calculate_f1_score(y_train.cpu().numpy(), predicted_train.cpu().numpy(), average='weighted')
        train_f1_scores.append(train_f1)

        # ---- validation ----
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val)
            val_loss = criterion(F.log_softmax(val_outputs, dim=1), y_val).item()
            _, predicted_val = torch.max(val_outputs, 1)
            val_acc = torch.sum(predicted_val == y_val).item() / len(y_val)

        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {train_loss}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss}, Val Acc: {val_acc:.4f}')

        # Persist before the early-stopping check so the final epoch is not
        # lost; rewriting every epoch keeps the files current if the runtime
        # is interrupted mid-training.
        np.savetxt(f'{metrics_dir}/train_f1_scores_GTN.txt', train_f1_scores)
        np.savetxt(f'{metrics_dir}/train_loss_GTN.txt', train_losses)
        np.savetxt(f'{metrics_dir}/epochs_GTN.txt', epochss)

        # ---- checkpointing / early stopping ----
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            current_patience = 0
        else:
            current_patience += 1
            if current_patience >= patience:
                # 1-based, matching the per-epoch progress line above.
                print(f'Early stopping at epoch {epoch + 1}')
                break


# Testing
def test(model, X_test, edge_index_test, y_test):
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test)
        _, predicted = torch.max(test_outputs, 1)
        test_acc = torch.sum(predicted == y_test).item() / len(y_test)

    return test_acc,predicted

# Seed the RNG before the model is built so the random weight initialisation
# (and any dropout during training) starts from the same state on every run;
# without this the reported metrics change between "Run All" executions.
torch.manual_seed(42)

# Define model, optimizer and loss function
model = TransformerNet(num_features=X_train.shape[1], hidden_dim=8, num_layers=1, output_dim=13).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Train the model (it was already moved to `device` when it was constructed)
train_model(model, optimizer, criterion, X_train, edge_index_train, y_train, X_val, edge_index_val, y_val, epochs=200, patience=10)

# Load the best model; map_location keeps the load working if the checkpoint
# was written on a different device than the one in use now.
model.load_state_dict(torch.load('best_transformer_model.pt', map_location=device))

# Testing
test_acc, predicted = test(model, X_test, edge_index_test, y_true)
print(f'Test Accuracy (Transformer): {test_acc}')


Using device: cpu
Epoch [1/200], Loss: 2.7716312408447266, Train Acc: 0.0424, Val Loss: 1.3207136392593384, Val Acc: 0.6787
Epoch [2/200], Loss: 1.322484016418457, Train Acc: 0.6441, Val Loss: 1.1570779085159302, Val Acc: 0.6787
Epoch [3/200], Loss: 1.15559720993042, Train Acc: 0.6441, Val Loss: 1.1631933450698853, Val Acc: 0.6673
Epoch [4/200], Loss: 1.1007338762283325, Train Acc: 0.6445, Val Loss: 1.120144009590149, Val Acc: 0.6593
Epoch [5/200], Loss: 1.0577592849731445, Train Acc: 0.6438, Val Loss: 1.1382434368133545, Val Acc: 0.5669
Epoch [6/200], Loss: 1.0302821397781372, Train Acc: 0.6464, Val Loss: 1.1064059734344482, Val Acc: 0.6131
Epoch [7/200], Loss: 0.9936683773994446, Train Acc: 0.6534, Val Loss: 1.1053533554077148, Val Acc: 0.5930
Epoch [8/200], Loss: 0.9858220219612122, Train Acc: 0.6592, Val Loss: 1.1106908321380615, Val Acc: 0.6111
Epoch [9/200], Loss: 0.9673988223075867, Train Acc: 0.6506, Val Loss: 1.0799754858016968, Val Acc: 0.6359
Epoch [10/200], Loss: 0.96270710

In [None]:
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

# Calculate weighted precision, recall and F1-score of the transformer's test predictions.
# zero_division=0 scores a class with no predicted (or no true) samples as 0,
# which is the value the default already uses, but without emitting
# UndefinedMetricWarning.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_true.cpu().numpy(), predicted.cpu().numpy(), average='weighted', zero_division=0
)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}')


Precision: 0.4392, Recall: 0.6340, F1-score: 0.5036


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the best transformer checkpoint written during training
model.load_state_dict(torch.load('best_transformer_model.pt', map_location=device))

# Get the transformer's output scores for the training, validation, and test sets
model.eval()
with torch.no_grad():
    train_outputs = model(X_train).cpu().numpy()
    val_outputs = model(X_val).cpu().numpy()
    test_outputs = model(X_test).cpu().numpy()

# Labels as NumPy arrays, converted once for the scikit-learn calls below
y_train_np = y_train.cpu().numpy()
y_val_np = y_val.cpu().numpy()
y_true_np = y_true.cpu().numpy()

# Train a decision tree classifier on the transformer outputs.
# random_state is fixed because scikit-learn permutes features at every split,
# so ties between equally good splits could otherwise be broken differently
# from run to run.
decision_tree = DecisionTreeClassifier(max_depth=2, random_state=0)
decision_tree.fit(train_outputs, y_train_np)

# Predict labels using the decision tree
train_pred = decision_tree.predict(train_outputs)
val_pred = decision_tree.predict(val_outputs)
test_pred = decision_tree.predict(test_outputs)

# Evaluate decision tree performance
train_acc = accuracy_score(y_train_np, train_pred)
val_acc = accuracy_score(y_val_np, val_pred)
test_acc = accuracy_score(y_true_np, test_pred)

print(f'Training Accuracy (Decision Tree): {train_acc}')
print(f'Validation Accuracy (Decision Tree): {val_acc}')
print(f'Test Accuracy (Decision Tree): {test_acc}')

from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, roc_curve, auc

# Calculate weighted precision, recall, F1-score on the test set.
# zero_division=0 yields the same scores as the default for classes with no
# predicted (or no true) samples, without emitting UndefinedMetricWarning.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true_np, test_pred, average='weighted', zero_division=0)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}')


Training Accuracy (Decision Tree): 0.7167978563054764
Validation Accuracy (Decision Tree): 0.5829986613119144
Test Accuracy (Decision Tree): 0.589142091152815
Precision: 0.4278, Recall: 0.5891, F1-score: 0.4945


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Concatenate the transformer model outputs with the (scaled) feature matrices,
# column-wise, so each row carries both its input features and its model scores
X_train_combined = np.concatenate((X_train.cpu().numpy(), train_outputs), axis=1)
X_val_combined = np.concatenate((X_val.cpu().numpy(), val_outputs), axis=1)
X_test_combined = np.concatenate((X_test.cpu().numpy(), test_outputs), axis=1)

# Labels as NumPy arrays, converted once for the scikit-learn calls below
y_train_np = y_train.cpu().numpy()
y_val_np = y_val.cpu().numpy()
y_true_np = y_true.cpu().numpy()

# Train a decision tree classifier on the combined representation.
# random_state is fixed because scikit-learn permutes features at every split,
# so ties between equally good splits could otherwise be broken differently
# from run to run.
decision_tree = DecisionTreeClassifier(max_depth=9, random_state=0)
decision_tree.fit(X_train_combined, y_train_np)

# Predict labels using the decision tree
train_pred = decision_tree.predict(X_train_combined)
val_pred = decision_tree.predict(X_val_combined)
test_pred = decision_tree.predict(X_test_combined)

# Evaluate decision tree performance
train_acc = accuracy_score(y_train_np, train_pred)
val_acc = accuracy_score(y_val_np, val_pred)
test_acc = accuracy_score(y_true_np, test_pred)

print(f'Training Accuracy (Decision Tree): {train_acc}')
print(f'Validation Accuracy (Decision Tree): {val_acc}')
print(f'Test Accuracy (Decision Tree): {test_acc}')

from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, roc_curve, auc

# Calculate weighted precision, recall, F1-score on the test set.
# zero_division=0 yields the same scores as the default for classes with no
# predicted (or no true) samples, without emitting UndefinedMetricWarning.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true_np, test_pred, average='weighted', zero_division=0)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}')


Training Accuracy (Decision Tree): 0.9005191760174175
Validation Accuracy (Decision Tree): 0.7315930388219545
Test Accuracy (Decision Tree): 0.7707774798927614
Precision: 0.7539, Recall: 0.7708, F1-score: 0.7569


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.svm import SVC

In [None]:
# Labels as NumPy arrays, converted once for the scikit-learn calls below
y_train_np = y_train.cpu().numpy()
y_val_np = y_val.cpu().numpy()
y_true_np = y_true.cpu().numpy()

# Train an SVM classifier on the model outputs
svm_classifier = SVC(kernel='linear')  # You can specify different kernel functions (e.g., 'linear', 'poly', 'rbf', etc.)
svm_classifier.fit(train_outputs, y_train_np)

# Predict labels using the SVM classifier
train_pred = svm_classifier.predict(train_outputs)
val_pred = svm_classifier.predict(val_outputs)
test_pred = svm_classifier.predict(test_outputs)

# Evaluate SVM performance
train_acc = accuracy_score(y_train_np, train_pred)
val_acc = accuracy_score(y_val_np, val_pred)
test_acc = accuracy_score(y_true_np, test_pred)

print(f'Training Accuracy (SVM): {train_acc}')
print(f'Validation Accuracy (SVM): {val_acc}')
print(f'Test Accuracy (SVM): {test_acc}')
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, roc_curve, auc

# Calculate weighted precision, recall, F1-score on the test set.
# zero_division=0 yields the same scores as the default for classes with no
# predicted (or no true) samples, without emitting UndefinedMetricWarning.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true_np, test_pred, average='weighted', zero_division=0)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}')

Training Accuracy (SVM): 0.7103500251214202
Validation Accuracy (SVM): 0.44109772423025434
Test Accuracy (SVM): 0.510053619302949
Precision: 0.4960, Recall: 0.5101, F1-score: 0.4891


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Labels as NumPy arrays, converted once for the scikit-learn calls below
y_train_np = y_train.cpu().numpy()
y_val_np = y_val.cpu().numpy()
y_true_np = y_true.cpu().numpy()

# Train an SVM classifier on the combined (features + model outputs) matrices
svm_classifier = SVC(kernel='linear')  # You can specify different kernel functions (e.g., 'linear', 'poly', 'rbf', etc.)
svm_classifier.fit(X_train_combined, y_train_np)

# Predict labels using the SVM classifier
train_pred = svm_classifier.predict(X_train_combined)
val_pred = svm_classifier.predict(X_val_combined)
test_pred = svm_classifier.predict(X_test_combined)

# Evaluate SVM performance
train_acc = accuracy_score(y_train_np, train_pred)
val_acc = accuracy_score(y_val_np, val_pred)
test_acc = accuracy_score(y_true_np, test_pred)

print(f'Training Accuracy (SVM): {train_acc}')
print(f'Validation Accuracy (SVM): {val_acc}')
print(f'Test Accuracy (SVM): {test_acc}')

from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, roc_curve, auc

# Calculate weighted precision, recall, F1-score on the test set.
# zero_division=0 yields the same scores as the default for classes with no
# predicted (or no true) samples, without emitting UndefinedMetricWarning.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true_np, test_pred, average='weighted', zero_division=0)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}')



Training Accuracy (SVM): 0.8591525707586669
Validation Accuracy (SVM): 0.748995983935743
Test Accuracy (SVM): 0.7298927613941019
Precision: 0.7472, Recall: 0.7299, F1-score: 0.7162


  _warn_prf(average, modifier, msg_start, len(result))
