In [None]:
!pip install torch_geometric

: 

In [None]:
import torch
import torch.nn.functional as F

from torch_geometric.nn import GCNConv
import numpy as np
import pandas as pd
from torch_geometric.data import Data
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Suppress every Python warning from here on (no category or module filter is
# given, so deprecation notices from the imported libraries are hidden as well).
warnings.filterwarnings("ignore")

: 

In [None]:
# Build `edge_index`: a 2 x num_edges integer tensor of (source, target) pairs.
edge_data = pd.read_csv('Total_Edge/totaledge_edge_breast-cancer.csv')

# The node tables are loaded here only to recover the feature-column names
# that the edge list refers to by name.
X = pd.concat([
    pd.read_csv('Total_Node/totalnode_brca_tcga.csv'),
    pd.read_csv('Total_Node/totalnode_brca_tcga_pan_can_atlas_2018.csv'),
    pd.read_csv('Total_Node/totalnode_brca_tcga_pub2015.csv'),
])

X = X.dropna()
X = X.iloc[:, :-3]  # the last three columns are not feature columns
column_names = X.columns

# Map every column name to an integer id (ids follow first-occurrence order).
column_names_index, _ = pd.factorize(column_names)
column_names_dic = dict(zip(column_names, column_names_index))

# NOTE(review): these ids index the *columns* of the node table. Confirm this is
# the intended node axis for the graph, since the Data object built later uses
# the table's rows as nodes.
edge_data['source'] = edge_data['source'].map(column_names_dic)
edge_data['target'] = edge_data['target'].map(column_names_dic)

# Zero-weight edges carry no connection.
edge_data = edge_data[edge_data['weight'] != 0]

# Endpoints that are not node-table columns map to NaN; casting NaN to an
# integer index would yield a meaningless node id, so drop those edges.
unmapped = edge_data['source'].isna() | edge_data['target'].isna()
if unmapped.any():
    print(f"Dropping {int(unmapped.sum())} edges with endpoints missing from the node columns")
edge_data = edge_data[~unmapped].astype({'source': 'int64', 'target': 'int64'})

sources = edge_data['source']
targets = edge_data['target']

# Go through NumPy: building a tensor from a list of pandas Series relies on
# positional item access, which is unreliable once rows have been filtered out.
edge_index = torch.tensor(
    np.vstack([sources.to_numpy(), targets.to_numpy()]), dtype=torch.long
)
print(edge_index.shape)

In [None]:
# Load the three node tables and stack them into one sample table.
X = pd.concat([
    pd.read_csv('Total_Node/totalnode_brca_tcga.csv'),
    pd.read_csv('Total_Node/totalnode_brca_tcga_pan_can_atlas_2018.csv'),
    pd.read_csv('Total_Node/totalnode_brca_tcga_pub2015.csv'),
])

path_list = ['breast-cancer']

X = X.dropna()

# Keep only the rows belonging to the selected pathway.
filtered_X = X.loc[X['path'] == path_list[0]]

# Take the labels from the SAME filtered rows as the features. Reading them from
# the unfiltered table would misalign features and labels whenever a row has a
# different `path` value.
last_three_cols = filtered_X.iloc[:, -3:]
y = last_three_cols.copy()
y_2 = y.iloc[:, 1]  # cancer_type_detailed

# Everything except the last three columns is used as the feature matrix.
filtered_X = filtered_X.iloc[:, :-3]

X = filtered_X.values
print(X)

# Standardize each feature column.
# NOTE(review): the scaler is fitted on all samples, before the train/val/test
# split made later, so validation/test statistics influence the scaling.
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)

y_2 = y_2.values

# Encode the class labels as consecutive integer ids (sorted label order).
unique_classes = np.unique(y_2)
class_mapping = {cls: idx for idx, cls in enumerate(unique_classes)}
y_2_indices = [class_mapping[value] for value in y_2]
y_2 = torch.tensor(y_2_indices)

X = torch.tensor(X)
assert X.shape[0] == y_2.shape[0], "features and labels must describe the same rows"
print(X)

In [None]:
def _indicator_mask(selected, size):
    """Return a length-`size` integer tensor that is 1 at `selected` positions and 0 elsewhere."""
    mask = np.zeros(size, dtype=int)
    mask[selected] = 1
    return torch.tensor(mask)


num_samples = X.shape[0]

train_ratio = 0.7
val_ratio = 0.1

# Seed the shuffle so the split (and every metric derived from it) is
# reproducible across kernel restarts.
SEED = 42
rng = np.random.default_rng(SEED)
indices = rng.permutation(num_samples)

# Compute the train / validation / test sizes; test takes whatever remains.
num_train = int(train_ratio * num_samples)
num_val = int(val_ratio * num_samples)
num_test = num_samples - num_train - num_val

# Slice the shuffled indices into train, validation and test.
train_indices = indices[:num_train]
val_indices = indices[num_train:num_train+num_val]
test_indices = indices[num_train+num_val:]

# 0/1 integer masks; downstream cells select rows with `mask == 1`.
train_mask = _indicator_mask(train_indices, num_samples)
val_mask = _indicator_mask(val_indices, num_samples)
test_mask = _indicator_mask(test_indices, num_samples)

# Every sample must land in exactly one of the three subsets.
assert int(train_mask.sum() + val_mask.sum() + test_mask.sum()) == num_samples

data = Data(x=X, edge_index=edge_index, y=y_2,
            train_mask=train_mask,  val_mask=val_mask, test_mask=test_mask)
print(data)

In [None]:
# Show the graph connectivity, the labels and the node features, in that order.
for field in (data.edge_index, data.y, data.x):
    print(field)

In [None]:
class GCN(torch.nn.Module):
    """Three stacked GCNConv layers in double precision.

    The first two layers are each followed by leaky-ReLU and dropout; the third
    produces `output_dim` scores that are returned as log-probabilities.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the three graph-convolution layers.

        Args:
            input_dim: number of input features per node.
            hidden_dim: width of both hidden layers.
            output_dim: number of output scores per node.
        """
        super().__init__()
        # `.double()` switches the layer parameters to float64.
        self.conv1 = GCNConv(input_dim, hidden_dim).double()
        self.conv2 = GCNConv(hidden_dim, hidden_dim).double()
        self.conv3 = GCNConv(hidden_dim, output_dim).double()

    def forward(self, data):
        """Run the network on `data.x` / `data.edge_index`.

        Returns:
            The log-softmax (over dim 1) of the last layer's output.
        """
        features = data.x
        connectivity = data.edge_index

        # Hidden stack: convolution -> leaky-ReLU -> dropout (active only in training mode).
        for hidden_conv in (self.conv1, self.conv2):
            features = hidden_conv(features, connectivity)
            features = F.leaky_relu(features)
            features = F.dropout(features, training=self.training)

        scores = self.conv3(features, connectivity)
        return F.log_softmax(scores, dim=1)

In [None]:
# Train the GCN full-batch, recording loss and accuracy per epoch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_classes = len(torch.unique(data.y))

model = GCN(data.num_node_features, 32, num_classes).to(device)
dataset = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-6)

train_losses=[]
train_accuracies=[]

val_losses=[]
val_accuracies=[]

# The masks never change, so build the boolean selectors once.
train_sel = dataset.train_mask == 1
val_sel = dataset.val_mask == 1

for epoch in range(200):
    # --- optimization step (dropout active) ---
    model.train()
    optimizer.zero_grad()

    out = model(dataset)
    loss = F.nll_loss(out[train_sel], dataset.y[train_sel])
    loss.backward()

    optimizer.step()

    # Train accuracy, taken from the same forward pass as the train loss.
    pred = out.argmax(dim=1)
    train_correct = pred[train_sel] == dataset.y[train_sel]
    train_acc = train_correct.sum().item() / dataset.train_mask.sum().item()

    train_losses+=[loss.item()]
    train_accuracies+=[train_acc]

    # --- validation (dropout off, no autograd graph) ---
    # Measuring validation metrics on the training forward pass would include
    # dropout noise and keep the validation loss attached to the graph.
    model.eval()
    with torch.no_grad():
        val_out = model(dataset)
        val_loss = F.nll_loss(val_out[val_sel], dataset.y[val_sel])
        val_pred = val_out.argmax(dim=1)

    val_correct = val_pred[val_sel] == dataset.y[val_sel]
    val_acc = val_correct.sum().item() / dataset.val_mask.sum().item()

    val_losses+=[val_loss.item()]
    val_accuracies+=[val_acc]

    print(f"Epoch: {epoch+1}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

In [None]:
# One figure per metric, overlaying the training and validation curves.
metric_curves = [
    ('Loss', [(train_losses, 'Train Loss'), (val_losses, 'Validation Loss')]),
    ('Accuracy', [(train_accuracies, 'Train Accuracy'), (val_accuracies, 'Validation Accuracy')]),
]

for y_label, curves in metric_curves:
    for history, legend_label in curves:
        plt.plot(history, label=legend_label)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()

In [None]:
# Final evaluation: report accuracy on the training and the test subsets.
model.eval()

# In eval mode the forward pass is deterministic, so one pass serves both
# reports; no_grad avoids building an autograd graph that is never used.
with torch.no_grad():
    pred = model(dataset).argmax(dim=1)

train_sel = dataset.train_mask == 1
correct = (pred[train_sel] == dataset.y[train_sel]).sum()
acc = int(correct) / int(dataset.train_mask.sum())
print(f'Train Accuracy: {acc:f}')

test_sel = dataset.test_mask == 1
correct = (pred[test_sel] == dataset.y[test_sel]).sum()
acc = int(correct) / int(dataset.test_mask.sum())
print(f'Test Accuracy: {acc:f}')