# MLNS data challenge

Guillaume Levy, Clement Wang, Adberrahim Namouh, Gaspard Berthelier

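Ce notebook divise les données d'entraînement (`data/train.txt`) en un jeu d'entraînement et un jeu de validation, enregistrés dans `data/train_split.csv` et `data/val_split.csv`.

À exécuter avant la pipeline.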

## Imports

In [1]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Move to the project folder so that the relative "data/..." paths used by the
# later cells resolve against it.
%cd /content/drive/My Drive/MLNS/data_challenge

Mounted at /content/drive
/content/drive/My Drive/MLNS/data_challenge


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
from tqdm.notebook import tqdm
import itertools

## Data

In [3]:
# Read the per-node table. The file is read without a header row, so the
# columns are named here: the node identifier followed by embedding_1..932.
embedding_columns = [f"embedding_{k}" for k in range(1, 933)]
node_info = pd.read_csv("data/node_information.csv", header=None)
node_info.columns = ["node_id", *embedding_columns]
print("Node info shape : ",node_info.shape)

Node info shape :  (3597, 933)


In [4]:
# Read the labelled training pairs and the unlabelled test pairs.
# Both files are parsed as space-separated, with no header row.
edge_columns = ["source_node", "target_node"]
train_data = pd.read_csv(
    "data/train.txt", sep=" ", header=None, names=edge_columns + ["label"]
)
test_data = pd.read_csv("data/test.txt", sep=" ", header=None, names=edge_columns)

print("Train shape : ", train_data.shape)
print("Test shape : ", test_data.shape)

Train shape :  (10496, 3)
Test shape :  (3498, 2)


In [5]:
# Build the graph from the pairs labelled 1, then attach every embedding
# column of `node_info` as a node attribute of the same name.
train_graph = train_data[train_data.label == 1]
G: nx.Graph = nx.from_edgelist(train_graph[["source_node", "target_node"]].values)

# Index the node table once, outside the loop: the indexed frame does not
# depend on `i`, so rebuilding it for each of the 932 columns is wasted work.
node_info_by_id = node_info.set_index("node_id")
for i in tqdm(range(1, 933)):
    attribute = f"embedding_{i}"
    nx.set_node_attributes(G, node_info_by_id[attribute].to_dict(), attribute)

  0%|          | 0/932 [00:00<?, ?it/s]

In [6]:
# Shuffle the rows. The later cells take validation rows in this order, so a
# fixed seed makes the train/validation split reproducible when the notebook
# is run from a fresh kernel.
SEED = 42
train_data = train_data.sample(frac=1, random_state=SEED)

## Split

In [7]:
# Positional row numbers, in the current row order, of the pairs labelled 1.
connected_rows = np.where(train_data.label == 1)[0]

# Keep both the row values and the matching index labels of those pairs.
train_data_edges = train_data.values[connected_rows]
train_data_edges_index = train_data.index[connected_rows]

print("Number of total connected edges : ",len(train_data_edges))

Number of total connected edges :  5248


In [13]:
# Split train val but we need to keep a connected graph.
#
# Walk over the label-1 pairs in their current order and move a pair to the
# validation set only if the graph stays connected without that edge.
# `G` is modified in place: after this cell it no longer contains the edges
# listed in `edges_removed`.

split_size = 0.2
nb_edges_to_remove = int(split_size * len(train_data_edges))

edges_removed = []
edges_index_removed = []
with tqdm(total=nb_edges_to_remove) as pbar:
    for edge, edge_index in zip(train_data_edges, train_data_edges_index):
        # Test the quota before touching the graph so that a quota of 0
        # removes nothing (a test placed after the append would never fire).
        if len(edges_removed) == nb_edges_to_remove:
            break

        source, target = edge[0], edge[1]
        # Remove the edge directly instead of copying the whole graph (and
        # all its node attributes) for every candidate; put it back if the
        # removal disconnects the graph. G is built from a bare edge list,
        # so there is no edge data to restore.
        G.remove_edge(source, target)
        if nx.is_connected(G):
            edges_removed.append(edge)
            edges_index_removed.append(edge_index)
            pbar.update(1)
        else:
            G.add_edge(source, target)

if len(edges_removed) < nb_edges_to_remove:
    # Make a shortfall visible rather than silently producing a smaller set.
    print(
        "Warning: only", len(edges_removed), "of", nb_edges_to_remove,
        "edges could be removed while keeping the graph connected",
    )
assert nx.is_connected(G)

  0%|          | 0/1049 [00:00<?, ?it/s]

In [14]:
# Collect the pairs labelled 0: positional row numbers first, then the row
# values and the matching index labels.
unconnected_rows = np.where(train_data.label == 0)[0]

train_data_no_edges = train_data.values[unconnected_rows]
train_data_no_edges_index = train_data.index[unconnected_rows]

print("Number of non connected edges : ",len(train_data_no_edges))

Number of non connected edges :  5248


In [15]:
# Number of label-0 pairs sent to validation: the `split_size` fraction of
# all label-0 pairs.
nb_no_edges_kept = int(split_size * len(train_data_no_edges))

# NOTE: both names are overwritten with their truncated version, so running
# this cell a second time shrinks them again.
train_data_no_edges = train_data_no_edges[:nb_no_edges_kept]
train_data_no_edges_index = train_data_no_edges_index[:nb_no_edges_kept]

# Validation rows = kept label-0 pairs followed by the removed edges; the
# index labels are concatenated in the same order.
val_edges = np.concatenate([train_data_no_edges, edges_removed], axis=0)
val_edges_index = np.concatenate(
    [train_data_no_edges_index, edges_index_removed], axis=0
)

In [16]:
# Validation frame: the selected rows, carrying their index labels from
# `train_data`.
val_data_split = pd.DataFrame(
    data=val_edges,
    index=val_edges_index,
    columns=["source_node", "target_node", "label"],
)
# Training frame: `train_data` without the rows moved to validation.
train_data_split = train_data.drop(index=val_edges_index)

print("Validation shape : ",val_data_split.shape)
print("Train shape : ",train_data_split.shape)

Validation shape :  (2098, 3)
Train shape :  (8398, 3)


In [17]:
# Write both splits to CSV, without the index column.
for csv_path, split_frame in (
    ("data/val_split.csv", val_data_split),
    ("data/train_split.csv", train_data_split),
):
    split_frame.to_csv(csv_path, index=False)