In [None]:
import networkx as nx
import pandas as pd



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the labelled CPG table on the mounted Google Drive.
file_path = '/content/drive/MyDrive/labelling_file.csv'

# Parse the CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
import random
import numpy as np
import torch

# Single seed shared by every random number generator used in this notebook.
rd_seed = 42

# Seed Python's, NumPy's and PyTorch's generators with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(rd_seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
import pandas as pd
# Read the labelled CSV from Google Drive into a DataFrame.
train_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/labelling_file.csv',
)

# Preview the first rows (rendered as the cell output).
train_df.head()

In [None]:
# Print column dtypes and non-null counts to spot columns with missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1263 entries, 0 to 1262
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   index   1263 non-null   object 
 1   nodes   1263 non-null   object 
 2   edges   1261 non-null   object 
 3   time    1259 non-null   float64
dtypes: float64(1), object(3)
memory usage: 39.6+ KB


In [None]:
# Pinned to the version this notebook was run with so a fresh runtime installs
# the same release; %pip targets the running kernel's environment.
%pip install -q torch_geometric==2.4.0

Collecting torch_geometric
  Downloading torch_geometric-2.4.0-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.4.0


In [None]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool

class GraphFeature(nn.Module):
    """Encode a batch of graphs into fixed-size embeddings.

    Two GCN layers (each followed by ELU) transform the node features, the
    node states are mean-pooled per graph, and a linear layer projects the
    pooled vector to ``embed_dim``.

    Args:
        node_feat: Number of input features per node.
        embed_dim: Size of the embedding produced by the final linear layer.
    """

    def __init__(self, node_feat, embed_dim):
        super().__init__()

        first_width, second_width = 8, 16
        self.conv_l1 = GCNConv(node_feat, first_width)
        self.conv_l2 = GCNConv(first_width, second_width)
        self.embedding = nn.Linear(second_width, embed_dim)

    def forward(self, x, edge_idx, batch):
        """Return the embedding computed from the pooled node states.

        Args:
            x: Node feature matrix.
            edge_idx: Edge index tensor passed to both GCN layers.
            batch: Node-to-graph assignment passed to ``global_mean_pool``.
        """
        hidden = x
        for conv in (self.conv_l1, self.conv_l2):
            hidden = F.elu(conv(hidden, edge_idx))

        # Read-out: average the node states belonging to each graph.
        pooled = global_mean_pool(hidden, batch)

        return self.embedding(pooled)

In [None]:
import torch
from torch.utils.data import Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import numpy as np


class CPGDataset(Dataset):
    """Graph dataset built from a CSV of code property graphs (CPGs).

    The CSV must provide the columns ``index``, ``nodes``, ``edges`` and
    ``time``.  ``nodes`` is a ``|``-separated list of ``id:feature`` entries,
    ``edges`` a ``|``-separated list of ``source->target:feature`` entries and
    ``time`` the per-graph target value.

    Rows are sorted by ``time`` and split into a leading training part and a
    trailing validation part.  ``len(dataset)`` reports only the training
    part, while ``dataset[i]`` indexes ``graph_list`` (all graphs, same sorted
    order).  Indices ``0 .. len(dataset) - 1`` are therefore exactly the
    training graphs, and larger indices reach the validation graphs.

    Args:
        csv_file: Path of the CSV file to load.
        train_ratio: Fraction of the (sorted) rows assigned to the training
            split.
    """

    def __init__(self, csv_file, train_ratio=0.8):
        super().__init__()

        self.df = pd.read_csv(csv_file)
        # Missing cells become 0; a row without edges therefore carries the
        # integer 0 in 'edges', which cpgs2graph_single treats as "no edges".
        self.df.fillna(0, inplace=True)
        self.train_ratio = train_ratio
        self._split_data()

        self.target_result = torch.tensor(self.df['time'].values.astype(np.float32))
        # Every column except the graph description and the target.
        self.wo_cpgs_df = self.df.drop(columns=['index', 'nodes', 'edges', 'time'])
        self.wo_cpgs_df = torch.tensor(self.wo_cpgs_df.values.astype(np.float32))

        self.train_graph_list = self.cpgs2graph(self.train_df['nodes'], self.train_df['edges'], self.train_df['time'])
        self.val_graph_list = self.cpgs2graph(self.val_df['nodes'], self.val_df['edges'], self.val_df['time'])
        self.graph_list = self.cpgs2graph(self.df['nodes'], self.df['edges'], self.df['time'])

        # Fit the encoder on the node features of every graph so that any
        # graph in graph_list can be encoded in __getitem__.
        self.label_encoder = LabelEncoder()
        all_features = [
            attrs['feature']
            for graph, _ in self.graph_list
            for _, attrs in graph.nodes(data=True)
        ]
        self.label_encoder.fit(all_features)

    def _split_data(self):
        """Sort ``self.df`` by ``time`` and cut it into train/validation frames."""
        # NOTE(review): 'time' is also the prediction target, so the validation
        # split holds only the largest targets -- confirm this is intended.
        self.df = self.df.sort_values(by='time')

        # Split the sorted rows according to train_ratio.
        split_idx = int(len(self.df) * self.train_ratio)
        self.train_df = self.df.iloc[:split_idx]
        self.val_df = self.df.iloc[split_idx:]

    def __len__(self):
        """Return the number of training graphs (see the class docstring)."""
        return len(self.train_graph_list)

    def cpgs2graph_single(self, nodes, edges, time):
        """Parse one row's node/edge strings into a ``networkx.Graph``.

        Args:
            nodes: ``|``-separated ``id:feature`` entries.
            edges: ``|``-separated ``source->target:feature`` entries, or an
                ``int`` (the fill value for a missing cell) meaning no edges.
            time: Target value stored in ``G.graph['time']``.

        Returns:
            The graph; node and edge features are stored under ``'feature'``.

        Raises:
            ValueError: If ``edges`` is neither ``str`` nor ``int``.
        """
        # Extract node and edge descriptions.
        nodes_info = [node.split(':') for node in nodes.split('|') if node]

        if isinstance(edges, str):
            edges_info = [edge.split('->') for edge in edges.split('|') if edge]
        elif isinstance(edges, int):
            edges_info = []
        else:
            raise ValueError("Invalid type for 'edges'. Should be either str or int.")

        # NOTE(review): the graph is undirected, so the direction of
        # 'source->target' is not preserved -- confirm this is intended.
        G = nx.Graph()

        # Add nodes; entries without a feature part are skipped.
        for node_info in nodes_info:
            if len(node_info) > 1:
                G.add_node(node_info[0], feature=node_info[1])

        # Add edges together with their feature.
        # NOTE(review): add_edge creates endpoints that were not declared in
        # 'nodes'; such nodes have no 'feature' attribute and would raise a
        # KeyError where node features are read.
        for edge_info in edges_info:
            if len(edge_info) > 1:
                source, target_feature = edge_info[0], edge_info[1].split(':')
                if len(target_feature) > 1:
                    target, feature = target_feature[0], target_feature[1]
                    G.add_edge(source, target, feature=feature)
                else:
                    print("Error: Edge feature is missing.")
            else:
                print("Error: Incomplete edge information.")

        # Attach the 'time' label to the graph.
        G.graph['time'] = time

        return G

    def cpgs2graph(self, nodes_list, edges_list, time_list):
        """Convert parallel node/edge/time sequences into ``(graph, time)`` pairs.

        Rows whose conversion raises are reported on stdout and skipped, so
        the result may be shorter than the input.
        """
        print('Convert "CPG"csv to graph')

        graph_list = []
        for nodes, edges, time in zip(nodes_list, edges_list, time_list):
            try:
                # Convert the row's nodes and edges into a graph.
                G = self.cpgs2graph_single(nodes, edges, time)
                graph_list.append((G, time))
            except Exception as e:
                print(f"Error creating graph: {e}")

        print('Complete!')
        return graph_list

    def visualize_graph(self, G, time):
        """Draw ``G`` with node labels and edge features.

        ``time`` is accepted for call-site symmetry but the title is taken
        from ``G.graph['time']``.
        """
        pos = nx.spring_layout(G)  # choose the layout

        # Draw nodes, edges and node labels.
        nx.draw_networkx_nodes(G, pos, node_size=700)
        nx.draw_networkx_edges(G, pos)
        nx.draw_networkx_labels(G, pos)

        # Annotate every edge with its feature.
        edge_labels = nx.get_edge_attributes(G, 'feature')
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

        # Report the graph's 'time' label on stdout and in the title.
        print(f"Graph with time: {G.graph['time']}")
        plt.title(f"Graph with time: {G.graph['time']}")
        plt.show()

    @staticmethod
    def _build_edge_index(node_ids, edge_pairs):
        """Map ``(u, v)`` node-id pairs to a ``(2, num_edges)`` long tensor.

        An edge-less graph yields an empty tensor of shape ``(2, 0)`` rather
        than a 1-D empty tensor, so the result always has two rows.
        """
        # Dictionary lookup replaces a list.index() scan per edge endpoint.
        position = {node: i for i, node in enumerate(node_ids)}
        pairs = [(position[u], position[v]) for u, v in edge_pairs]
        if not pairs:
            return torch.empty((2, 0), dtype=torch.long)
        return torch.tensor(pairs, dtype=torch.long).t().contiguous()

    def __getitem__(self, idx):
        """Return graph ``idx`` of ``graph_list`` as a ``Data`` object.

        ``x`` holds one label-encoded feature per node (shape
        ``(num_nodes, 1)``), ``edge_index`` lists every graph edge once, and
        ``y`` is the graph's ``time`` as a float32 scalar.
        """
        graph_data, time = self.graph_list[idx]

        node_ids = []
        node_features = []
        for node, attrs in graph_data.nodes(data=True):
            node_ids.append(node)
            node_features.append(attrs['feature'])

        # Encode all node features of the graph in one call.
        encoded = self.label_encoder.transform(node_features)
        x = torch.tensor(encoded, dtype=torch.float).view(-1, 1)

        # NOTE(review): each undirected edge appears once, in the orientation
        # networkx reports it; the reverse direction is not added.
        edge_index = self._build_edge_index(node_ids, graph_data.edges())

        # Use the target stored next to the graph: cpgs2graph may skip rows,
        # which would shift graph_list relative to target_result.
        y = torch.tensor(float(time), dtype=torch.float32)

        return Data(x=x, edge_index=edge_index, y=y)


In [None]:
dataset = CPGDataset(csv_file='/content/drive/MyDrive/labelling_file.csv')

# Defined here so the cell runs on a fresh kernel instead of relying on a
# value assigned by a later cell.
batch_size = 64

# Build the DataLoader.
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Fetch the first batch from the DataLoader.
batch = next(iter(train_loader))

# Report the batch's own target shape; a model output does not exist yet at
# this point of the notebook, so it cannot be inspected here.
print("Target shape:", batch.y.view(-1).long().shape)

# Show the contents of the first batch.
print(batch)


Convert "CPG"csv to graph
Error: Edge feature is missing.
Error: Incomplete edge information.
Complete!
Convert "CPG"csv to graph
Complete!
Convert "CPG"csv to graph
Error: Edge feature is missing.
Error: Incomplete edge information.
Complete!
Output shape: torch.Size([5549, 5])
Target shape: torch.Size([64])
DataBatch(x=[7117, 1], edge_index=[2, 14625], y=[64], batch=[7117], ptr=[65])


In [None]:
# Visualise the parsed graphs as a sanity check.

dataset = CPGDataset('/content/drive/MyDrive/labelling_file.csv', train_ratio=0.8)

# Blank line followed by the dataset length.
print(f"\n{len(dataset)}")

# Draw every graph in graph_list, numbering them from 1.
for graph_no, (graph, time) in enumerate(dataset.graph_list, start=1):
    print(f"Visualizing Graph {graph_no}")
    dataset.visualize_graph(graph, time)


In [None]:
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class GCN(torch.nn.Module):
    """Two-layer GCN that classifies whole graphs.

    The node states produced by the second GCN layer are mean-pooled per
    graph before the log-softmax, so the output has one row per graph and
    lines up with one label per graph in ``data.y``.  Without the pooling the
    model would emit one row per node, which cannot be compared with
    graph-level labels.

    Args:
        num_node_features: Number of input features per node.
        num_classes: Number of target classes.
    """

    def __init__(self, num_node_features, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        """Return per-graph log-probabilities.

        Args:
            data: Object exposing ``x`` and ``edge_index`` and, for a
                mini-batch, ``batch`` (node-to-graph assignment).

        Returns:
            Tensor of log-probabilities with ``num_classes`` columns.
        """
        x, edge_index = data.x, data.edge_index

        # First GCN layer.
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        # Second GCN layer: per-node class scores.
        x = self.conv2(x, edge_index)

        # Read-out: average node scores per graph.  A single un-batched graph
        # has no 'batch' attribute; None pools all of its nodes together.
        batch = getattr(data, 'batch', None)
        x = global_mean_pool(x, batch)

        return F.log_softmax(x, dim=1)

# Dataset and model initialisation.
dataset = CPGDataset(csv_file='/content/drive/MyDrive/labelling_file.csv')
# Read the node-feature width from an actual sample.  dataset.wo_cpgs_df has
# every CSV column dropped, so its width is not the node feature dimension.
num_node_features = dataset[0].x.size(1)
# NOTE(review): assumes the 'time' labels are integers in [0, num_classes) -- confirm.
num_classes = 5
model = GCN(num_node_features=num_node_features, num_classes=num_classes).to(device)

# Training and validation data loaders.
batch_size = 64
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# len(dataset) counts only the training graphs while dataset[i] indexes
# dataset.graph_list, so the remaining indices are the validation graphs.
val_indices = range(len(dataset), len(dataset.graph_list))
val_loader = DataLoader(
    torch.utils.data.Subset(dataset, val_indices),
    batch_size=batch_size,
    shuffle=False,
)

# Loss function and optimiser.
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)


# Training loop.
epochs = 10
model.train()
for epoch in range(epochs):
    for data in train_loader:
        data = data.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, data.y.view(-1).long())
        loss.backward()
        optimizer.step()


# Evaluate on the validation data.
model.eval()
val_correct = 0
val_total = 0
with torch.no_grad():
    for val_data in val_loader:
        val_data = val_data.to(device)
        val_output = model(val_data)
        val_predicted = val_output.argmax(dim=1)
        # Batches expose their labels as 'y'.
        val_targets = val_data.y.view(-1).long()
        val_total += val_targets.size(0)
        val_correct += (val_predicted == val_targets).sum().item()

# Guard against an empty validation split (e.g. train_ratio=1.0).
val_accuracy = 100 * val_correct / val_total if val_total else float('nan')
print(f'Epoch {epoch + 1}/{epochs}, Validation Accuracy: {val_accuracy:.2f}%')

ModuleNotFoundError: ignored