In [1]:
#! pip install xgboost
#! pip install dgl
#! pip install mxnet

In [28]:
import os
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

raw_data_dir = "input_raw_data"
transactions_df = pd.read_csv(os.path.join(raw_data_dir, "transaction.csv"))
identity_df = pd.read_csv(os.path.join(raw_data_dir, "identity.csv"))

In [29]:
transactions_df.head(5)

Unnamed: 0,TransactionID,TransactionDT,card_no,card_type,email_domain,ProductCD,TransactionAmt,isFraud
0,9c90c7e2-2600-4628-a868-179287eee169,2012-01-15 00:00:20,30545481171260,JCB 16 digit,smith-henry.info,T,1198,0
1,02951e68-f8ff-4f00-b515-5df8bcb0edcb,2012-01-15 00:00:44,6549624810102543,VISA 13 digit,yahoo.com,Y,409,0
2,17071ccd-3c3d-476a-a3cd-b73638a803d4,2012-01-15 00:02:18,4792410992636220,JCB 15 digit,hotmail.com,Y,1101,0
3,5288f832-6673-473a-80f6-fdb98dd99278,2012-01-15 00:03:01,3573503852773765,VISA 16 digit,hotmail.com,Y,2228,0
4,e37a6dd1-182e-4b87-8b8e-7dd9bf78cea0,2012-01-15 00:03:24,3597910599495184,Diners Club / Carte Blanche,gmail.com,L,2393,0


In [30]:
identity_df.head(5)

Unnamed: 0,TransactionID,IpAddress,PhoneNo,DeviceID
0,f9980c5d-e8bf-4431-9ddf-15b4e7a9d7ff,104.32.122.67,7993004217,657817920030
1,011955b3-14fa-4342-85f4-3a9c3759b7d7,164.180.250.14,6712423669,8984812524268
2,a61677ac-a0cb-4113-9409-87bcfe7e6e44,99.31.120.70,+1-353-046-6749x00004,3092060466768
3,f46e41fc-5d06-4f60-b730-a5d7ecec90f1,10.225.43.49,125-534-6633x1775,2516617099683
4,4cc7b6f5-9579-4149-80d8-8cb6c807ee6c,218.46.8.47,(374)448-9917,7683358940269


In [31]:
full_identity_df = transactions_df.merge(identity_df, on="TransactionID", how="left")

# drop transcations time column as it is not useful for constructing graph.
full_identity_df.drop(["TransactionDT"], axis=1, inplace=True)

# Re-arange the order of column names for better visualization
full_identity_df = full_identity_df[
    [
        "TransactionID",
        "card_no",
        "card_type",
        "email_domain",
        "IpAddress",
        "PhoneNo",
        "DeviceID",
        "ProductCD",
        "TransactionAmt",
        "isFraud",
    ]
]
full_identity_df.head(5)

Unnamed: 0,TransactionID,card_no,card_type,email_domain,IpAddress,PhoneNo,DeviceID,ProductCD,TransactionAmt,isFraud
0,9c90c7e2-2600-4628-a868-179287eee169,30545481171260,JCB 16 digit,smith-henry.info,121.92.230.58,+1-038-395-3162x55186,7163249193818,T,1198,0
1,02951e68-f8ff-4f00-b515-5df8bcb0edcb,6549624810102543,VISA 13 digit,yahoo.com,118.244.38.209,001-923-541-7445x63231,3807588936118,Y,409,0
2,17071ccd-3c3d-476a-a3cd-b73638a803d4,4792410992636220,JCB 15 digit,hotmail.com,66.25.190.213,955-326-1614,9673128747953,Y,1101,0
3,5288f832-6673-473a-80f6-fdb98dd99278,3573503852773765,VISA 16 digit,hotmail.com,42.93.10.194,+1-339-892-9034x139,9965894358908,Y,2228,0
4,e37a6dd1-182e-4b87-8b8e-7dd9bf78cea0,3597910599495184,Diners Club / Carte Blanche,gmail.com,154.168.168.7,278.713.5654,7523612045351,L,2393,0


In [32]:
def get_data():
    data_prefix = "preprocessed-data/"

    if not os.path.exists(data_prefix):
        print("""Expected the following folder {} to contain the preprocessed data. 
                 Run data processing first in main notebook before running baselines comparisons""".format(data_prefix))
        return

    features = pd.read_csv(data_prefix + "features_xgboost.csv", header=None)
    labels = pd.read_csv(data_prefix + "tags.csv").set_index('TransactionID')
    valid_users = pd.read_csv(data_prefix + "validation.csv", header=None)
    test_users = pd.read_csv(data_prefix + "test.csv", header=None)
    
    valid_X = features.merge(valid_users, on=[0], how='inner')
    test_X = features.merge(test_users, on=[0], how='inner')
    
    train_index = ~((features[0].isin(test_users[0].values) | (features[0].isin(valid_users[0].values))))   
    train_X = features[train_index]
    valid_y = labels.loc[valid_X[0]]
    test_y = labels.loc[test_X[0]]
    train_y = labels.loc[train_X[0]]
    
    train_X.set_index([0], inplace=True)
    valid_X.set_index([0], inplace=True)
    test_X.set_index([0], inplace=True)

    train_data = train_y.join(train_X)  # first column is the label 'isFraud'
    valid_data = valid_y.join(valid_X)
    test_data = test_y.join(test_X)
    return train_data, valid_data, test_data

In [33]:
features = pd.read_csv("preprocessed-data/" + "features_xgboost.csv", header=None)

In [34]:
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,9c90c7e2-2600-4628-a868-179287eee169,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,3.078457
1,02951e68-f8ff-4f00-b515-5df8bcb0edcb,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,2.611723
2,17071ccd-3c3d-476a-a3cd-b73638a803d4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,3.041787
3,5288f832-6673-473a-80f6-fdb98dd99278,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,3.347915
4,e37a6dd1-182e-4b87-8b8e-7dd9bf78cea0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,3.378943


In [35]:
## XGBOOST training

In [36]:
train_data_df, valid_data_df, test_data_df = get_data()

In [37]:
train_data_df.head()

Unnamed: 0_level_0,isFraud,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
9c90c7e2-2600-4628-a868-179287eee169,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,3.078457
02951e68-f8ff-4f00-b515-5df8bcb0edcb,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,2.611723
17071ccd-3c3d-476a-a3cd-b73638a803d4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,3.041787
5288f832-6673-473a-80f6-fdb98dd99278,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,3.347915
e37a6dd1-182e-4b87-8b8e-7dd9bf78cea0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,3.378943


In [38]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [39]:
X_train = train_data_df.drop('isFraud', axis=1)  
y_train = train_data_df['isFraud']  

X_val = valid_data_df.drop('isFraud', axis=1)  
y_val = valid_data_df['isFraud']

In [40]:
# Convert the datasets into DMatrix objects (optimized data structure for XGBoost)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)


In [41]:
# Define the parameters for the XGBoost model
params = {
    'max_depth': 4,          # Depth of each tree
    'eta': 0.1,              # Learning rate
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss' # Evaluation metric for validation data
}

In [42]:
# Train the XGBoost model
num_rounds = 100  # Number of boosting rounds
eval_set = [(dtrain, 'train'), (dval, 'eval')]  # Tracking the training and validation error
bst = xgb.train(params, dtrain, num_rounds, evals=eval_set, early_stopping_rounds=10)

[0]	train-logloss:0.45441	eval-logloss:0.44955
[1]	train-logloss:0.43863	eval-logloss:0.43370
[2]	train-logloss:0.42606	eval-logloss:0.42105
[3]	train-logloss:0.41588	eval-logloss:0.41080
[4]	train-logloss:0.40757	eval-logloss:0.40241
[5]	train-logloss:0.40075	eval-logloss:0.39550
[6]	train-logloss:0.39512	eval-logloss:0.38979
[7]	train-logloss:0.39046	eval-logloss:0.38507
[8]	train-logloss:0.38659	eval-logloss:0.38113
[9]	train-logloss:0.38339	eval-logloss:0.37786
[10]	train-logloss:0.38074	eval-logloss:0.37514
[11]	train-logloss:0.37852	eval-logloss:0.37288
[12]	train-logloss:0.37668	eval-logloss:0.37099
[13]	train-logloss:0.37516	eval-logloss:0.36942
[14]	train-logloss:0.37389	eval-logloss:0.36811
[15]	train-logloss:0.37284	eval-logloss:0.36703
[16]	train-logloss:0.37197	eval-logloss:0.36612
[17]	train-logloss:0.37124	eval-logloss:0.36537
[18]	train-logloss:0.37064	eval-logloss:0.36475
[19]	train-logloss:0.37014	eval-logloss:0.36422
[20]	train-logloss:0.36973	eval-logloss:0.36379
[2

In [43]:
# Predict the probabilities of the positive class for validation data
y_val_pred_prob = bst.predict(dval)

In [44]:
# Predict the probabilities of the positive class for validation data
y_val_pred_prob = bst.predict(dval)

# Convert probabilities to binary predictions using a threshold (e.g., 0.5)
y_val_pred = [1 if prob > 0.5 else 0 for prob in y_val_pred_prob]

# Calculate the accuracy for the validation set
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {accuracy:.2f}')

# Calculate precision and recall for the validation set
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
print(f'Validation Precision: {precision:.2f}')
print(f'Validation Recall: {recall:.2f}')

# Calculate the AUC for the validation set
auc = roc_auc_score(y_val, y_val_pred_prob)
print(f'Validation AUC: {auc:.2f}')

Validation Accuracy: 0.83
Validation Precision: 0.53
Validation Recall: 0.35
Validation AUC: 0.77


In [19]:
processed_files= os.listdir("preprocessed-data")

In [20]:
processed_files

['relation_PhoneNo_edgelist.csv',
 'features.csv',
 'relation_card_no_edgelist.csv',
 'relation_email_domain_edgelist.csv',
 'relation_card_type_edgelist.csv',
 'tags.csv',
 'features_xgboost.csv',
 'validation.csv',
 'test.csv',
 'relation_DeviceID_edgelist.csv',
 'relation_TransactionID_edgelist.csv',
 'relation_IpAddress_edgelist.csv']

In [72]:
edges = ",".join(map(lambda x: x.split("/")[-1], [file for file in processed_files if "relation" in file]))

params = {
    "nodes": "features.csv",
    "edges": "relation*",
    "labels": "tags.csv",
    "model": "rgcn",
    "num-gpus": 1,
    "batch-size": 1000,
    "embedding-size": 1024,
    "n-neighbors": 100,
    "n-layers": 2,
    "n-epochs": 10,
    "optimizer": "adam",
    "lr": 1e-2,
}

print("Graph is constructed using the following edgelists:\n{}".format("\n".join(edges.split(","))))

Graph is constructed using the following edgelists:
relation_PhoneNo_edgelist.csv
relation_card_no_edgelist.csv
relation_email_domain_edgelist.csv
relation_card_type_edgelist.csv
relation_DeviceID_edgelist.csv
relation_TransactionID_edgelist.csv
relation_IpAddress_edgelist.csv


In [73]:
edges

'relation_PhoneNo_edgelist.csv,relation_card_no_edgelist.csv,relation_email_domain_edgelist.csv,relation_card_type_edgelist.csv,relation_DeviceID_edgelist.csv,relation_TransactionID_edgelist.csv,relation_IpAddress_edgelist.csv'

In [74]:
os.environ['DGLBACKEND'] = 'mxnet'
import mxnet as mx
from mxnet import nd, gluon, autograd
import dgl

In [93]:
import numpy as np
import pandas as pd
from mxnet import nd


def get_features(id_to_node, node_features):
    """

    :param id_to_node: dictionary mapping node names(id) to dgl node idx
    :param node_features: path to file containing node features
    :return: (np.ndarray, list) node feature matrix in order and new nodes not yet in the graph
    """
    indices, features, new_nodes = [], [], []
    max_node = max(id_to_node.values())
    with open(node_features, "r") as fh:
        for line in fh:
            node_feats = line.strip().split(",")
            node_id = node_feats[0]
            feats = np.array(list(map(float, node_feats[1:])))
            features.append(feats)
            if node_id not in id_to_node:
                max_node += 1
                id_to_node[node_id] = max_node
                new_nodes.append(max_node)

            indices.append(id_to_node[node_id])

    features = np.array(features).astype('float32')
    features = features[np.argsort(indices), :]
    features = nd.array(features, dtype='float32')
    return features, new_nodes


def get_labels(id_to_node, n_nodes, target_node_type, labels_path, masked_nodes_path_valid, masked_nodes_path_test, additional_mask_rate=0):
    """

    :param id_to_node: dictionary mapping node names(id) to dgl node idx
    :param n_nodes: number of user nodes in the graph
    :param target_node_type: column name for target node type
    :param labels_path: filepath containing labelled nodes
    :param masked_nodes_path: filepath containing list of nodes to be masked
    :param additional_mask_rate: additional_mask_rate: float for additional masking of nodes with labels during training
    :return: (list, list) train and test mask array
    """
    node_to_id = {v: k for k, v in id_to_node.items()}
    user_to_label = pd.read_csv(labels_path).astype({target_node_type:str}).set_index(target_node_type)
    labels = user_to_label.loc[pd.Series(node_to_id)[np.arange(n_nodes)].values].values.flatten()
    masked_nodes_valid = read_masked_nodes(masked_nodes_path_valid)
    masked_nodes_test = read_masked_nodes(masked_nodes_path_test)
    train_mask, valid_mask, test_mask = _get_mask(id_to_node, node_to_id, n_nodes, masked_nodes_valid, masked_nodes_test, additional_mask_rate=additional_mask_rate)
    return labels, train_mask, valid_mask, test_mask


def read_masked_nodes(masked_nodes_path):
    """
    Returns a list of nodes extracted from the path passed in

    :param masked_nodes_path: filepath containing list of nodes to be masked i.e test users
    :return: list
    """
    with open(masked_nodes_path, "r") as fh:
        masked_nodes = [line.strip() for line in fh]
    return masked_nodes


def _get_mask(id_to_node, node_to_id, num_nodes, masked_nodes_valid, masked_nodes_test,  additional_mask_rate):
    """
    Returns the train and test mask arrays

    :param id_to_node: dictionary mapping node names(id) to dgl node idx
    :param node_to_id: dictionary mapping dgl node idx to node names(id)
    :param num_nodes: number of user/account nodes in the graph
    :param masked_nodes: list of nodes to be masked during training, nodes without labels
    :param additional_mask_rate: float for additional masking of nodes with labels during training
    :return: (list, list) train and test mask array
    """
    train_mask = np.ones(num_nodes)
    valid_mask = np.zeros(num_nodes)    
    test_mask = np.zeros(num_nodes)
    for node_id in masked_nodes_valid:
        train_mask[id_to_node[node_id]] = 0
        valid_mask[id_to_node[node_id]] = 1
    for node_id in masked_nodes_test:
        train_mask[id_to_node[node_id]] = 0
        test_mask[id_to_node[node_id]] = 1
    if additional_mask_rate and additional_mask_rate < 1:
        unmasked = np.array([idx for idx in range(num_nodes) if node_to_id[idx] not in masked_nodes])
        yet_unmasked = np.random.permutation(unmasked)[:int(additional_mask_rate*num_nodes)]
        train_mask[yet_unmasked] = 0
    return train_mask, valid_mask, test_mask


def _get_node_idx(id_to_node, node_type, node_id, ptr):
    if node_type in id_to_node:
        if node_id in id_to_node[node_type]:
            node_idx = id_to_node[node_type][node_id]
        else:
            id_to_node[node_type][node_id] = ptr
            node_idx = ptr
            ptr += 1
    else:
        id_to_node[node_type] = {}
        id_to_node[node_type][node_id] = ptr
        node_idx = ptr
        ptr += 1

    return node_idx, id_to_node, ptr


def parse_edgelist(edges, id_to_node, header=False, source_type='user', sink_type='user'):
    """
    Parse an edgelist path file and return the edges as a list of tuple
    :param edges: path to comma separated file containing bipartite edges with header for edgetype
    :param id_to_node: dictionary containing mapping for node names(id) to dgl node indices
    :param header: boolean whether or not the file has a header row
    :param source_type: type of the source node in the edge. defaults to 'user' if no header
    :param sink_type: type of the sink node in the edge. defaults to 'user' if no header.
    :return: (list, dict) a list containing edges of a single relationship type as tuples and updated id_to_node dict.
    """
    edge_list = []
    source_pointer, sink_pointer = 0, 0
    with open(edges, "r") as fh:
        for i, line in enumerate(fh):
            source, sink = line.strip().split(",")
            if i == 0:
                if header:
                    source_type, sink_type = source, sink
                if source_type in id_to_node:
                    source_pointer = max(id_to_node[source_type].values()) + 1
                if sink_type in id_to_node:
                    sink_pointer = max(id_to_node[sink_type].values()) + 1
                continue

            source_node, id_to_node, source_pointer = _get_node_idx(id_to_node, source_type, source, source_pointer)
            if source_type == sink_type:
                sink_node, id_to_node, source_pointer = _get_node_idx(id_to_node, sink_type, sink, source_pointer)
            else:
                sink_node, id_to_node, sink_pointer = _get_node_idx(id_to_node, sink_type, sink, sink_pointer)

            edge_list.append((source_node, sink_node))

    return edge_list, id_to_node, source_type, sink_type


def read_edges(edges, nodes=None):
    """
    Read edges and node features

    :param edges: path to comma separated file containing all edges
    :param nodes: path to comma separated file containing all nodes + features
    :return: (list, list, list, dict) sources, sinks, features and id_to_node dictionary containing mappings
    from node names(id) to dgl node indices
    """
    node_pointer = 0
    id_to_node = {}
    features = []
    sources, sinks = [], []
    if nodes is not None:
        with open(nodes, "r") as fh:
            for line in fh:
                node_feats = line.strip().split(",")
                node_id = node_feats[0]
                if node_id not in id_to_node:
                    id_to_node[node_id] = node_pointer
                    node_pointer += 1
                    if len(node_feats) > 1:
                        feats = np.array(list(map(float, node_feats[1:])))
                        features.append(feats)
        with open(edges, "r") as fh:
            for line in fh:
                source, sink = line.strip().split(",")
                sources.append(id_to_node[source])
                sinks.append(id_to_node[sink])
    else:
        with open(edges, "r") as fh:
            for line in fh:
                source, sink = line.strip().split(",")
                if source not in id_to_node:
                    id_to_node[source] = node_pointer
                    node_pointer += 1
                if sink not in id_to_node:
                    id_to_node[sink] = node_pointer
                    node_pointer += 1
                sources.append(id_to_node[source])
                sinks.append(id_to_node[sink])

    return sources, sinks, features, id_to_node


In [94]:
import os
import re
import dgl
import numpy as np


def get_edgelists(edgelist_expression, directory):
    if "," in edgelist_expression:
        return edgelist_expression.split(",")
    files = os.listdir(directory)
    compiled_expression = re.compile(edgelist_expression)
    return [filename for filename in files if compiled_expression.match(filename)]

def construct_graph(training_dir, edges, nodes, target_node_type, heterogeneous=True):
    if heterogeneous:
        print("Getting relation graphs from the following edge lists : {} ".format(edges))
        edgelists, id_to_node = {}, {}
        for i, edge in enumerate(edges):
            edgelist, id_to_node, src, dst = parse_edgelist(os.path.join(training_dir, edge), id_to_node, header=True)
            if src == target_node_type:
                src = 'target'
            if dst == target_node_type:
                dst = 'target'
            edgelists[(src, 'relation{}'.format(i), dst)] = edgelist
            print("Read edges for relation{} from edgelist: {}".format(i, os.path.join(training_dir, edge)))

            # reverse edge list so that relation is undirected
            edgelists[(dst, 'reverse_relation{}'.format(i), src)] = [(b, a) for a, b in edgelist]
        # get features for target nodes
        features, new_nodes = get_features(id_to_node[target_node_type], os.path.join(training_dir, nodes))
        print("Read in features for target nodes")
        # handle target nodes that have features but don't have any connections
        # if new_nodes:
        #     edgelists[('target', 'relation'.format(i+1), 'none')] = [(node, 0) for node in new_nodes]
        #     edgelists[('none', 'reverse_relation{}'.format(i + 1), 'target')] = [(0, node) for node in new_nodes]

        # add self relation
        edgelists[('target', 'self_relation', 'target')] = [(t, t) for t in id_to_node[target_node_type].values()]

        g = dgl.heterograph(edgelists)
        print(
            "Constructed heterograph with the following metagraph structure: Node types {}, Edge types{}".format(
                g.ntypes, g.canonical_etypes))
        print("Number of nodes of type target : {}".format(g.number_of_nodes('target')))

        g.nodes['target'].data['features'] = features

        id_to_node = id_to_node[target_node_type]

    else:
        sources, sinks, features, id_to_node = read_edges(os.path.join(training_dir, edges[0]),
                                                          os.path.join(training_dir, nodes))

        # add self relation
        all_nodes = sorted(id_to_node.values())
        sources.extend(all_nodes)
        sinks.extend(all_nodes)

        g = dgl.graph((sources, sinks))

        if features:
            g.ndata['features'] = np.array(features).astype('float32')

        print('read graph from node list and edge list')

        features = g.ndata['features']

    return g, features, id_to_node


In [95]:

def normalize(feature_matrix):
    mean = nd.mean(feature_matrix, axis=0)
    stdev = nd.sqrt(nd.sum((feature_matrix - mean)**2, axis=0)/feature_matrix.shape[0])
    return (feature_matrix - mean) / stdev


def get_dataloader(data_size, batch_size, mini_batch=True):
    batch_size = batch_size if mini_batch else data_size
    train_dataloader = gluon.data.BatchSampler(gluon.data.RandomSampler(data_size), batch_size, 'keep')
    test_dataloader = gluon.data.BatchSampler(gluon.data.SequentialSampler(data_size), batch_size, 'keep')

    return train_dataloader, test_dataloader

In [96]:
training_dir="preprocessed-data"

edges = get_edgelists(params['edges'], training_dir)

print(edges)
print(params["nodes"])

g, features, id_to_node = construct_graph(training_dir, edges, params["nodes"], "TransactionID")

features = normalize(nd.array(features))

g.nodes['target'].data['features'] = features


print("Getting labels")
n_nodes = g.number_of_nodes('target')
labels, train_mask, valid_mask, test_mask = get_labels(
    id_to_node,
    n_nodes,
    'TransactionID',
    os.path.join(training_dir, 'tags.csv'),
    os.path.join(training_dir, 'validation.csv'),
    os.path.join(training_dir, 'test.csv'),
)
print("Got labels")

labels = nd.array(labels).astype('float32')
train_mask = nd.array(train_mask).astype('float32')
valid_mask = nd.array(valid_mask).astype('float32')
test_mask = nd.array(test_mask).astype('float32')

n_nodes = sum([g.number_of_nodes(n_type) for n_type in g.ntypes]) 
n_edges = sum([g.number_of_edges(e_type) for e_type in g.etypes])

['relation_PhoneNo_edgelist.csv', 'relation_card_no_edgelist.csv', 'relation_email_domain_edgelist.csv', 'relation_card_type_edgelist.csv', 'relation_DeviceID_edgelist.csv', 'relation_TransactionID_edgelist.csv', 'relation_IpAddress_edgelist.csv']
features.csv
Getting relation graphs from the following edge lists : ['relation_PhoneNo_edgelist.csv', 'relation_card_no_edgelist.csv', 'relation_email_domain_edgelist.csv', 'relation_card_type_edgelist.csv', 'relation_DeviceID_edgelist.csv', 'relation_TransactionID_edgelist.csv', 'relation_IpAddress_edgelist.csv'] 
Read edges for relation0 from edgelist: preprocessed-data/relation_PhoneNo_edgelist.csv
Read edges for relation1 from edgelist: preprocessed-data/relation_card_no_edgelist.csv
Read edges for relation2 from edgelist: preprocessed-data/relation_email_domain_edgelist.csv
Read edges for relation3 from edgelist: preprocessed-data/relation_card_type_edgelist.csv
Read edges for relation4 from edgelist: preprocessed-data/relation_DeviceID