#Package Section

In [None]:
import sys
import numpy as np
import copy
from numpy import linalg as LA
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import metrics
import time
# for sparse matrix
from scipy import sparse
#early stop
from keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

#Classes and functions

In [None]:
############------------graph_encoder_embed_start----------------###############
def graph_encoder_embed(X, Y, n, **kwargs):
  """
    Graph encoder embedding for a sparse adjacency matrix.

    input X is a scipy sparse matrix (e.g. csr) of the adjacency matrix
    -- if there is a connection between node i and node j:
    ---- X(i,j) = 1 when there is no edge weight
    ---- X(i,j) = edge weight otherwise.
    -- if there is no connection between node i and node j:
    ---- X(i,j) = 0,
    ---- note there is no storage for this in a sparse matrix.
    ---- No storage means 0 in a sparse matrix.
    input Y is an integer array of labels with one column (shape (rows, 1)):
    -- value -1 indicates no label
    -- value >=0 indicates a real label
    input n is the number of rows of the encoder matrix W (number of nodes)
    kwargs: 'Correlation' (default True) is accepted but not used here.

    returns (Z, W)
    -- W: n*k sparse csr matrix (float32), W[i,c] = 1/nk[c] if Y[i] == c, otherwise 0,
          where nk[c] is the number of rows of Y labelled c and k = max label + 1
    -- Z: X.dot(W)
  """
  defaultKwargs = {'Correlation': True}
  kwargs = { **defaultKwargs, **kwargs}

  labels = np.asarray(Y)[:,0]

  # Note for python, label Y starts from 0. Python index starts from 0. thus size k should be max + 1
  k = labels.max() + 1

  # rows with a real label (>= 0) are the only ones that get an entry in W
  known = labels >= 0
  rows = np.flatnonzero(known)
  cols = labels[known]

  # nk[c]: number of observations in class c (counted in one vectorised pass)
  nk = np.bincount(cols, minlength=k)

  # build W directly in coordinate form instead of inserting element by element;
  # values are cast to float32 to match the dtype of the encoder matrix
  data = (1/nk[cols]).astype(np.float32)
  W = sparse.csr_matrix((data, (rows, cols)), shape=(n, k), dtype=np.float32)

  Z = X.dot(W)

  return Z, W


############------------graph_encoder_embed_end------------------###############



#Packages for Drive Files

In [None]:
# import packages
## for mount drive purpose
import os
from google.colab import drive

#Mount Drive

In [None]:
# mount drive
drive.mount('/content/drive/', force_remount=True)
os.chdir('/content/drive/My Drive/Colab_Notebooks/Graph_ML/semi_dr.shen')

Mounted at /content/drive/


# import ipynb packages

In [None]:
!pip install import-ipynb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting import-ipynb
  Downloading import_ipynb-0.1.4-py3-none-any.whl (4.1 kB)
Installing collected packages: import-ipynb
Successfully installed import-ipynb-0.1.4


In [None]:
import import_ipynb
from test_cases import Model, Case

# Test Cases 

## Test sparse matrix

In [None]:
A = sparse.csr_matrix(np.array([[4,5,6], [7,8,9],[10,11,12]]))
B = sparse.csr_matrix(np.array([[1,0,0], [0,2,0],[0,0,3]]))

In [None]:
C = A.dot(B)

In [None]:
print(C)

  (0, 2)	18
  (0, 1)	10
  (0, 0)	4
  (1, 2)	27
  (1, 1)	16
  (1, 0)	7
  (2, 2)	36
  (2, 1)	22
  (2, 0)	10


In [None]:
# this is csr, using csc will be faster to slice columns
C_new = sparse.lil_matrix(C[:,[0,2]])

In [None]:
print(C_new)

  (0, 0)	4
  (0, 1)	18
  (1, 0)	7
  (1, 1)	27
  (2, 0)	10
  (2, 1)	36


## Graph Encoder test case


In [None]:
class Encoder_case:
  """
    Small container for one graph-encoder test case.
    X: adjacency matrix passed in as A
    Y: label array
    n: number of nodes, as passed in by the caller
  """
  def __init__(self, A,Y,n):
    # store the data on the instance; writing to the class itself would make
    # every instance share (and overwrite) the same X, Y and n
    self.X = A
    self.Y = Y
    self.n = n

### Case 1

A = 

\begin{bmatrix}
0 & 1 & 1 & 1 & 0\\
1 & 0 & 1 & 1 & 1\\
1 & 1 & 0 & 1 & 1\\
1 & 1 & 1 & 0 & 1\\
0 & 1 & 1 & 1 & 0
\end{bmatrix}

Labels = [0,0,0,1,1] 


In [None]:
# 5-node adjacency matrix: start from all ones, then remove the 0-4 edge
# (both directions) and the self loops on the diagonal
A = np.ones((5,5))
A[0,4] = 0
A[4,0] = 0
np.fill_diagonal(A, 0)

# labels as a (5,1) column: nodes 0-2 in class 0, nodes 3-4 in class 1
Y = np.array([[0,0,0,1,1]]).reshape((5,1))

print(A)
print(Y)

# NOTE(review): this rebinds the name Encoder_case from the class to an instance,
# so a later Encoder_case(...) call only works if the class cell is re-run first
Encoder_case = Encoder_case(A,Y,5)

[[0. 1. 1. 1. 0.]
 [1. 0. 1. 1. 1.]
 [1. 1. 0. 1. 1.]
 [1. 1. 1. 0. 1.]
 [0. 1. 1. 1. 0.]]
[[0]
 [0]
 [0]
 [1]
 [1]]


#### [Original] Laplacian = False, Correlation = False, DiagA = False

In [None]:
Dataset = DataPreprocess(Encoder_case, Laplacian = False, DiagA = False)
print(Dataset.X)
print(Dataset.Y)
print(Dataset.n)

[array([[0., 1., 1.],
       [0., 2., 1.],
       [0., 3., 1.],
       [1., 0., 1.],
       [1., 2., 1.],
       [1., 3., 1.],
       [1., 4., 1.],
       [2., 0., 1.],
       [2., 1., 1.],
       [2., 3., 1.],
       [2., 4., 1.],
       [3., 0., 1.],
       [3., 1., 1.],
       [3., 2., 1.],
       [3., 4., 1.],
       [4., 1., 1.],
       [4., 2., 1.],
       [4., 3., 1.]])]
[[0]
 [0]
 [0]
 [1]
 [1]]
5


In [None]:
Z, W = graph_encoder_embed(Dataset.X[0], Dataset.Y, Dataset.n, Correlation = False)
print(Z)
print(W)

[[1.33333333 1.        ]
 [1.33333333 2.        ]
 [1.33333333 2.        ]
 [2.         1.        ]
 [1.33333333 1.        ]]
[[0.33333333 0.        ]
 [0.33333333 0.        ]
 [0.33333333 0.        ]
 [0.         0.5       ]
 [0.         0.5       ]]


#### Sparse matrix, Laplacian = False, Correlation = False, DiagA = False
Note: the Z matrix values are 1/2 of those from the edge-list version, because double weights are not added here. This does not affect training and testing.

In [None]:
X_sparce = sparse.csr_matrix(Encoder_case.X)

In [None]:
Z, W = graph_encoder_embed(X_sparce, Encoder_case.Y, Encoder_case.n)

In [None]:
print(Z)
print(W)

  (0, 1)	0.5
  (0, 0)	0.6666666865348816
  (1, 1)	1.0
  (1, 0)	0.6666666865348816
  (2, 1)	1.0
  (2, 0)	0.6666666865348816
  (3, 1)	0.5
  (3, 0)	1.0000000298023224
  (4, 1)	0.5
  (4, 0)	0.6666666865348816
  (0, 0)	0.33333334
  (1, 0)	0.33333334
  (2, 0)	0.33333334
  (3, 1)	0.5
  (4, 1)	0.5


#### Laplacian = False, Correlation = True, DiagA = False

In [None]:
Dataset = DataPreprocess(Encoder_case, Laplacian = False, DiagA = False)
print(Dataset.X)
print(Dataset.Y)
print(Dataset.n)

[array([[0., 1., 1.],
       [0., 2., 1.],
       [0., 3., 1.],
       [1., 0., 1.],
       [1., 2., 1.],
       [1., 3., 1.],
       [1., 4., 1.],
       [2., 0., 1.],
       [2., 1., 1.],
       [2., 3., 1.],
       [2., 4., 1.],
       [3., 0., 1.],
       [3., 1., 1.],
       [3., 2., 1.],
       [3., 4., 1.],
       [4., 1., 1.],
       [4., 2., 1.],
       [4., 3., 1.]])]
[[0]
 [0]
 [0]
 [1]
 [1]]
5


In [None]:
Z, W = graph_encoder_embed(Dataset.X[0], Dataset.Y, Dataset.n, Correlation = True)
print(Z)
print(W)

[[0.8        0.6       ]
 [0.5547002  0.83205029]
 [0.5547002  0.83205029]
 [0.89442719 0.4472136 ]
 [0.8        0.6       ]]
[[0.33333333 0.        ]
 [0.33333333 0.        ]
 [0.33333333 0.        ]
 [0.         0.5       ]
 [0.         0.5       ]]


#### Laplacian = True, Correlation = False, DiagA = False

In [None]:
Dataset = DataPreprocess(Encoder_case, Laplacian = True, DiagA = False)
print(Dataset.X)
print(Dataset.Y)
print(Dataset.n)

[array([[0.        , 1.        , 0.14433757],
       [0.        , 2.        , 0.14433757],
       [0.        , 3.        , 0.14433757],
       [1.        , 0.        , 0.14433757],
       [1.        , 2.        , 0.125     ],
       [1.        , 3.        , 0.125     ],
       [1.        , 4.        , 0.14433757],
       [2.        , 0.        , 0.14433757],
       [2.        , 1.        , 0.125     ],
       [2.        , 3.        , 0.125     ],
       [2.        , 4.        , 0.14433757],
       [3.        , 0.        , 0.14433757],
       [3.        , 1.        , 0.125     ],
       [3.        , 2.        , 0.125     ],
       [3.        , 4.        , 0.14433757],
       [4.        , 1.        , 0.14433757],
       [4.        , 2.        , 0.14433757],
       [4.        , 3.        , 0.14433757]])]
[[0]
 [0]
 [0]
 [1]
 [1]]
5


In [None]:
Z, W = graph_encoder_embed(Dataset.X[0], Dataset.Y, Dataset.n, Correlation = False)
print(Z)
print(W)

[[0.19245009 0.14433757]
 [0.17955838 0.26933757]
 [0.17955838 0.26933757]
 [0.26289171 0.14433757]
 [0.19245009 0.14433757]]
[[0.33333333 0.        ]
 [0.33333333 0.        ]
 [0.33333333 0.        ]
 [0.         0.5       ]
 [0.         0.5       ]]


In [None]:
# NOTE(review): `se` is not defined anywhere in the visible notebook, so this cell
# raises NameError; presumably a leftover or a deliberate "stop here" marker — confirm
se

### test encoder_1

In [None]:
A = np.array([
 [0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0],
 [1, 0, 0, 1, 0, 0, 0, 0],
 [0, 1, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0]])
print(A)

[[0 0 1 0 0 0 0 0]
 [0 0 0 1 0 0 0 0]
 [1 0 0 1 0 0 0 0]
 [0 1 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]


In [None]:
Y = np.array([[1,1,0,1,2,1,1,1]]).reshape((8,1))
print(Y)

[[1]
 [1]
 [0]
 [1]
 [2]
 [1]
 [1]
 [1]]


In [None]:
# wrap the 8-node test graph and its labels
# NOTE(review): Encoder_case must still name the class here; if an earlier cell has
# rebound it to an instance, re-run the class definition cell first
Encoder_case = Encoder_case(A,Y,8)

#### DiagA = False, Correlation = False, Laplacian = False

In [None]:
Dataset = DataPreprocess(Encoder_case, Laplacian = False, DiagA = False)
Z, W = graph_encoder_embed(Dataset.X[0], Dataset.Y, Dataset.n, Correlation = False)
print(Z)
print(W)

[[2.         0.         0.        ]
 [0.         0.33333333 0.        ]
 [0.         0.66666667 0.        ]
 [2.         0.33333333 0.        ]
 [0.         0.         0.        ]
 [0.         0.         0.        ]
 [0.         0.         0.        ]
 [0.         0.         0.        ]]
[[0.         0.16666667 0.        ]
 [0.         0.16666667 0.        ]
 [1.         0.         0.        ]
 [0.         0.16666667 0.        ]
 [0.         0.         1.        ]
 [0.         0.16666667 0.        ]
 [0.         0.16666667 0.        ]
 [0.         0.16666667 0.        ]]


In [None]:
X_sparce = sparse.csr_matrix(Encoder_case.X)
Z, W = graph_encoder_embed(X_sparce, Encoder_case.Y, Encoder_case.n)
print(Z)
print(W)

  (0, 0)	1.0
  (1, 1)	0.1666666716337204
  (2, 1)	0.3333333432674408
  (3, 0)	1.0
  (3, 1)	0.1666666716337204
  (0, 1)	0.16666667
  (1, 1)	0.16666667
  (2, 0)	1.0
  (3, 1)	0.16666667
  (4, 2)	1.0
  (5, 1)	0.16666667
  (6, 1)	0.16666667
  (7, 1)	0.16666667


## [Sparse]Supervised Learning

In [None]:
# https://www.kaggle.com/c/talkingdata-mobile-user-demographics/discussion/22567
# https://github.com/tkipf/pygcn/blob/1600b5b748b3976413d1e307540ccc62605b4d6d/pygcn/utils.py#L73

def batch_generator(X, y, batch_size, shuffle):
    """
    Endless generator of dense mini-batches taken from a sparse feature matrix.

    X: sparse matrix with one row per sample; each batch is densified with .toarray()
    y: array of integer class labels aligned with the rows of X
    batch_size: number of samples per batch (the last batch of a pass may be smaller)
    shuffle: if True, the sample order is reshuffled before every pass over the data

    Yields (X_batch, y_batch) where y_batch is the one-hot encoding of the labels.
    """
    n_samples = X.shape[0]
    # ceiling division: a trailing partial batch counts as a batch, so the counter
    # always reaches the reset below (a float ratio never equals the integer
    # counter when n_samples is not a multiple of batch_size)
    number_of_batches = -(-n_samples // batch_size)
    # fix the one-hot width from all labels, so a batch that happens to miss the
    # highest class still has the same number of columns as every other batch
    num_classes = int(np.max(y)) + 1
    counter = 0
    sample_index = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = to_categorical(y[batch_index], num_classes=num_classes)
        counter += 1
        yield X_batch, y_batch
        if counter >= number_of_batches:
            # one full pass done: start over, optionally in a new order
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

In [None]:
class Hyperperameters:
  """
    Hyperparameter container for the GNN.
    The default values are for GNN learning -- "Learner" == 2:
      embed via the partial labels, then learn the unknown labels with a two-layer NN.
  """
  def __init__(self):
    # Keras has no scaled conjugate gradient optimizer, so the Keras default is used
    defaults = {
      'learning_rate': 0.01,              # initial learning rate
      'epochs': 100,                      # number of epochs to train
      'hidden': 20,                       # number of units in the hidden layer
      'val_split': 0.1,                   # share of training data split off for validation (10%)
      'loss': 'categorical_crossentropy', # loss function
    }
    for name, value in defaults.items():
      setattr(self, name, value)

class GNN:
  """
    Two-layer feed-forward classifier trained on graph-encoder embeddings.

    DataSets must provide:
    -- z_train, z_test: sparse embedding matrices (densified with .toarray())
    -- y_train, y_test: integer label arrays
    -- d: number of units of the softmax output layer
  """
  def __init__(self, DataSets):
    # keep the state on the instance; storing it on the class made all GNN
    # objects share one dataset, one set of hyperparameters and one model
    self.DataSets = DataSets
    self.hyperM = Hyperperameters()
    self.model = self.GNN_model()  #model summary: self.model.summary()


  def GNN_model(self):
    """
      build and compile the GNN model; returns the compiled keras model
    """
    hyperM = self.hyperM
    DataSets = self.DataSets

    z_train = DataSets.z_train
    k = DataSets.d

    feature_num = z_train.shape[1]

    model = keras.Sequential([
    keras.layers.Flatten(input_shape = (feature_num,)),  # input layer
    keras.layers.Dense(hyperM.hidden, activation='relu'),  # hidden layer -- no tansig activation function in Keras, use relu instead
    keras.layers.Dense(k, activation='softmax') # output layer, matlab used softmax for patternnet default ??? max(opts.neuron,K)? opts
    ])

    # pass the configured optimizer object itself; the string 'adam' would
    # silently ignore hyperM.learning_rate and use the Keras default rate
    optimizer = keras.optimizers.Adam(learning_rate = hyperM.learning_rate)

    model.compile(optimizer=optimizer,
                  loss=hyperM.loss,
                  metrics=['accuracy'])

    return model

  def GNN_run(self):
    """
      Train and test directly.
      Do not learn from the unknown labels.

      returns (test_acc, train_time): accuracy on the test set and the
      wall-clock training time in seconds
    """
    # the model is trained in place (the former deepcopy of self copied nothing,
    # because all state lived on the class)
    hyperM = self.hyperM
    DataSets = self.DataSets
    z_train = DataSets.z_train
    y_train = DataSets.y_train
    y_test = DataSets.y_test
    z_test = DataSets.z_test
    model = self.model

    early_stopping_callback = EarlyStopping(monitor='loss', patience=5, verbose=0)
    checkpoint_callback = ModelCheckpoint('GNN.h5', monitor='loss', save_best_only=True, mode='min', verbose=0)

    batch_size = 32
    # one epoch = one pass over the training rows, i.e. ceil(rows / batch_size)
    # generator steps (using the row count itself repeats the data batch_size times)
    steps_per_epoch = -(-z_train.shape[0] // batch_size)

    train_strat = time.time()
    history = model.fit(batch_generator(z_train, y_train, batch_size, True),
                    epochs=hyperM.epochs,
                    steps_per_epoch=steps_per_epoch,
                    callbacks=[early_stopping_callback, checkpoint_callback],
                    verbose=0)
    train_end = time.time()
    train_time = train_end - train_strat

    y_test_one_hot = to_categorical(y_test)
    # set verbose to 0 to silent the output
    test_loss, test_acc = model.evaluate(z_test.toarray(),  y_test_one_hot, verbose=0)
    return test_acc, train_time

In [None]:
n = 3000
case = Case(n)

In [None]:
case_10 = case.case_10_fully_known()
case_10.summary()

name:

    SBM with 3 classes and defined probabilities with fully known labels
    80% for training and 20% for testing
    
n:
<class 'int'>
3000
d:
<class 'int'>
3
X:
(3000, 3000)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 1 0]]
Y:
(3000, 1)
[[1]
 [0]
 [2]
 ...
 [2]
 [2]
 [2]]


In [None]:
# convert the dense adjacency matrix of the test case to csr for the sparse embedding
X_sparse = sparse.csr_matrix(case_10.X)

# time only the embedding step
# NOTE(review): the embedding is built from case_10.Y for all nodes; if Y holds the
# true labels of the test nodes, they leak into Z — confirm whether test labels
# should be set to -1 before embedding
emb_strat = time.time()
Z, W = graph_encoder_embed(X_sparse, case_10.Y, case_10.n)
emb_end = time.time()
emb_time = emb_end - emb_strat

# split the embedding and the labels into the train/test parts read by GNN
train_idx = case_10.train_idx
test_idx = case_10.test_idx
case_10.z_train= Z[train_idx]
case_10.z_test = Z[test_idx]
case_10.y_train = case_10.Y[train_idx].ravel()
case_10.y_test = case_10.Y_test.ravel() 

# build the network on the prepared case, then train and evaluate it
gnn = GNN(case_10)
acc, train_time  = gnn.GNN_run()

print("--- embed %s seconds ---" % emb_time)
print("--- train %s seconds ---" % train_time)
print("--- accuracy: %s ---" % acc)

--- embed 0.04400968551635742 seconds ---
--- train 213.84093713760376 seconds ---
--- accuracy: 0.95333331823349 ---


## [Original]Supervised Learning

In [None]:
n = 3000
case = Case(n)

In [None]:
# enumerate every on/off combination of the embedding settings
# (Laplacian, DiagA, Correlation)

sets_no = 8
L_set = [True, False]
Diag_set = [True, False]
Corre_set = [True, False]
comb = [L_set, Diag_set, Corre_set]

# Laplacian varies slowest and Correlation fastest, as in three nested loops
comb_set = [
  [laplacian, diag_a, correlation]
  for laplacian in comb[0]
  for diag_a in comb[1]
  for correlation in comb[2]
]

print(comb_set)
print(len(comb_set))

[[True, True, True], [True, True, False], [True, False, True], [True, False, False], [False, True, True], [False, True, False], [False, False, True], [False, False, False]]
8


In [None]:
def average_restuls(case_num, comb_set, learner_no, n_runs=10):
  """
    Run every embedding setting n_runs times and average the results.

    case_num: test case object; a fresh deep copy is handed to each run
    comb_set: list of [Laplacian, DiagA, Correlation] settings
    learner_no: value passed to Run as Learner
    n_runs: number of repetitions per setting (default 10)

    returns a list with one row per setting:
      [Laplacian, DiagA, Correlation, accuracy, train time, embed time, total time]
    where the last four entries are means over the n_runs repetitions.
    raises ValueError if n_runs is smaller than 1.
  """
  if n_runs < 1:
    raise ValueError("n_runs must be at least 1")

  results = []
  for comb in comb_set:
    acc_final, train_time_final, emb_time_final, total_time_final = 0,0,0,0
    for _ in range(n_runs):
      # deep copy so that one run cannot modify the case used by the next run
      test_case = copy.deepcopy(case_num)
      acc, train_time, emb_time, total_time = Run(test_case, "su", Learner = learner_no, Laplacian = comb[0], DiagA = comb[1], Correlation = comb[2])
      acc_final += acc
      train_time_final += train_time
      emb_time_final += emb_time
      total_time_final += total_time

    averages = [total / n_runs for total in (acc_final, train_time_final, emb_time_final, total_time_final)]
    results.append(list(comb) + averages)

  return results

def plot(results):
  """
    Display the averaged results as a formatted table.

    results: list of rows
      [Laplacian, DiagA, Correlation, Accuracy, Train_Time(s), Emb_Time(s), Total_Time(s)]
    Rows are labelled set_01, set_02, ... in the order given.
  """
  # local import: pandas is not among the notebook's top-level imports
  import pandas as pd

  # one label per row, so any number of settings can be shown (not only 8)
  index = ['set_%02d' % i for i in range(1, len(results) + 1)]

  df = pd.DataFrame(results,
  index=index,
  columns=['Laplacian','DiagA', 'Correlation', 'Accuracy', 'Train_Time(s)', 'Emb_Time(s)', 'Total_Time(s)'])

  df = df.style.format({
    'Emb_Time(s)': '{:0.2f}',
    'Train_Time(s)': '{:0.5f}',
    'Total_Time(s)': '{:0.2f}'
  })

  # display is provided by the notebook environment
  display(df)

### Supervised

#### GNN

##### case 10

In [None]:
case_10 = case.case_10_fully_known()
case_10.summary()

name:

    SBM with 3 classes and defined probabilities with fully known labels
    80% for training and 20% for testing
    
n:
<class 'int'>
3000
d:
<class 'int'>
3
X:
(3000, 3000)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 1 0]]
Y:
(3000, 1)
[[1]
 [0]
 [2]
 ...
 [2]
 [2]
 [2]]


In [None]:
print(case_10.bd)

0.13


In [None]:
results = average_restuls(case_10, comb_set, 0)

acc:  0.95333331823349
--- embed 3.419264078140259 seconds ---
--- train 21.11810326576233 seconds ---
--- total 40.59562110900879 seconds ---
acc:  0.95333331823349
--- embed 3.4034903049468994 seconds ---
--- train 12.623661756515503 seconds ---
--- total 31.16509985923767 seconds ---
acc:  0.9549999833106995
--- embed 3.418227434158325 seconds ---
--- train 12.319038391113281 seconds ---
--- total 30.31583523750305 seconds ---
acc:  0.95333331823349
--- embed 3.3595666885375977 seconds ---
--- train 12.580832242965698 seconds ---
--- total 31.665188550949097 seconds ---
acc:  0.949999988079071
--- embed 3.4427661895751953 seconds ---
--- train 12.615651845932007 seconds ---
--- total 30.693434953689575 seconds ---
acc:  0.9516666531562805
--- embed 3.4687247276306152 seconds ---
--- train 12.653229475021362 seconds ---
--- total 31.406693696975708 seconds ---
acc:  0.9516666531562805
--- embed 3.4163429737091064 seconds ---
--- train 21.00416111946106 seconds ---
--- total 39.331142

In [None]:
plot(results)

Unnamed: 0,Laplacian,DiagA,Correlation,Accuracy,Train_Time(s),Emb_Time(s),Total_Time(s)
set_01,True,True,True,0.952167,16.79191,3.41,35.29
set_02,True,True,False,0.48,14.96333,3.42,33.36
set_03,True,False,True,0.951833,16.69279,3.46,36.04
set_04,True,False,False,0.48,15.7976,3.44,35.08
set_05,False,True,True,0.953,15.7844,3.32,24.09
set_06,False,True,False,0.954833,15.64148,3.31,23.85
set_07,False,False,True,0.953,12.97473,3.45,21.31
set_08,False,False,False,0.954667,16.57532,3.43,24.88


## profiling


1.   https://colab.research.google.com/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/01.07-Timing-and-Profiling.ipynb
2.   https://perso.crans.org/besson/publis/notebooks/Profiling_in_a_Jupyter_notebook.html

In [None]:
! pip install memory_profiler

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
%load_ext memory_profiler

In [None]:
! pip install hypothesis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# https://github.com/HypothesisWorks/hypothesis/issues/985
# doesn't help
import inspect
from hypothesis import settings

def is_debugging():
    """Return True if any frame on the current call stack belongs to a file ending in ``pydevd.py``."""
    # index 1 of each stack entry is the file name of that frame
    return any(frame[1].endswith("pydevd.py") for frame in inspect.stack())

if is_debugging():
    # The pycharm debugger requires to
    # run hypothesis with the `use_coverage=False` flag.
    try:
        settings.register_profile("debug", use_coverage=False)
        settings.load_profile(os.getenv(u'HYPOTHESIS_PROFILE', 'debug'))
    except Exception as err:
        # best effort only: keep going without the debug profile, but say so
        # (a bare except would also swallow KeyboardInterrupt / SystemExit)
        print("hypothesis debug profile not loaded: %s" % err)
else:
    settings.load_profile(os.getenv(u'HYPOTHESIS_PROFILE', 'default'))

In [None]:
case_10 = case.case_10_fully_known()
case_10.summary()

name:

    SBM with 3 classes and defined probabilities with fully known labels
    80% for training and 20% for testing
    
n:
<class 'int'>
3000
d:
<class 'int'>
3
X:
(3000, 3000)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 1 0]]
Y:
(3000, 1)
[[1]
 [0]
 [2]
 ...
 [2]
 [2]
 [2]]


### Sparse embed

In [None]:
%%file mprun.py

import numpy as np
# for sparse matrix
from scipy import sparse

def graph_encoder_embed(X, Y, n, **kwargs):
  """
    Graph encoder embedding (sparse version).

    X: sparse (csr) adjacency matrix
    -- X(i,j) = 1 for an unweighted edge between node i and node j
    -- X(i,j) = edge weight for a weighted edge
    -- no stored entry means X(i,j) = 0, i.e. no edge
    Y: label array with one column
    -- -1 means the node has no label
    -- a value >= 0 is a real label
    n: number of rows of the encoder matrix W
    kwargs: 'Correlation' defaults to True; the option is merged but not used here

    returns (Z, W): W is the n*k csr encoder matrix and Z = X.dot(W)
  """
  kwargs = dict({'Correlation': True}, **kwargs)

  # labels start at 0, so the number of classes is the largest label + 1
  k = Y[:,0].max() + 1

  # class_sizes: 1*k array with the number of observations of each class
  class_sizes = np.array([[np.count_nonzero(Y[:,0]==c) for c in range(k)]], dtype=float)

  # encoder matrix: W[i,c] = 1/class_sizes[c] if Y[i] == c, otherwise 0
  encoder = sparse.dok_matrix((n, k), dtype=np.float32)
  for row in range(Y.shape[0]):
    label = Y[row,0]
    if label < 0:
      # unlabelled node: its row of W stays empty
      continue
    encoder[row,label] = 1/class_sizes[0,label]

  W = sparse.csr_matrix(encoder)
  return X.dot(W), W


Writing mprun.py


In [None]:
X_sparse = sparse.csr_matrix(case_10.X)

In [None]:
from mprun import graph_encoder_embed

In [None]:
%mprun -f graph_encoder_embed Z,W = graph_encoder_embed(X_sparse, case_10.Y, case_10.n)




### origin embed

In [None]:
def adj_to_edg(A):
  """
    Convert an adjacency matrix into an edge list.

    input A: square adjacency matrix; an input with at most 3 columns is
    treated as an edge list already and is returned unchanged.

    returns edg -- matrix with shape (s, 3), s being the number of entries > 0:
    each row is [vertex1, vertex2, connection weight from the adjacency matrix],
    listed row by row (vertex1 ascending, then vertex2 ascending).
    A matrix without any positive entry gives an empty (0, 3) array.
  """
  # check the length of the second dimension of A
  if A.shape[1] <= 3:
    return A

  # work on a dense array (sparse inputs are densified first)
  dense = A.toarray() if hasattr(A, "toarray") else np.asarray(A)

  # np.nonzero scans in row-major order, the same order as a nested i/j loop
  rows, cols = np.nonzero(dense > 0)
  return np.column_stack((rows, cols, dense[rows, cols]))

In [None]:
%%file mprun_origin.py

import numpy as np

def graph_encoder_embed(X,Y,n,**kwargs):
  """
    Graph encoder embedding (edge-list version).

    X: s*3 edge list, each row is [node i, node j, connection weight(i,j)]
    Y: label array with one column; a value < 0 means the node has no label
    n: number of nodes (rows of W and Z)
    kwargs: 'Correlation' defaults to True; the option is merged but not used here

    returns (Z, W): W is the n*k encoder matrix, Z the n*k embedding
  """
  kwargs = dict({'Correlation': True}, **kwargs)

  # labels start at 0, so the number of classes is the largest label + 1
  k = Y[:,0].max() + 1

  # nk: 1*k array with the number of observations of each class
  nk = np.array([[np.count_nonzero(Y[:,0]==c) for c in range(k)]], dtype=float)

  # W: encoder matrix. W[i,c] = 1/nk[c] if Y[i] == c, otherwise 0
  W = np.zeros((n,k))
  for node in range(Y.shape[0]):
    label = Y[node,0]
    if label < 0:
      continue
    W[node,label] = 1/nk[0,label]

  # Edge List Version in O(s): every row contributes in both directions
  Z = np.zeros((n,k))
  for src, dst, weight in X:
    src, dst = int(src), int(dst)

    src_label = Y[src][0]
    dst_label = Y[dst][0]

    if dst_label >= 0:
      Z[src, dst_label] += W[dst, dst_label]*weight
    # a self loop is counted once only
    if src_label >= 0 and src != dst:
      Z[dst, src_label] += W[src, src_label]*weight

  return Z, W


Overwriting mprun_origin.py


In [None]:
from mprun_origin import graph_encoder_embed

In [None]:
X_edg = adj_to_edg(case_10.X)

In [None]:
Z,W = graph_encoder_embed(X_edg, case_10.Y, case_10.n)

In [None]:
emb_strat = time.time()
Z, W = graph_encoder_embed(X_edg, case_10.Y, case_10.n)
emb_end = time.time()
emb_time = emb_end - emb_strat
print(emb_time)

3.58587646484375


doc: https://pypi.org/project/memory-profiler/
The first column represents the line number of the code that has been profiled, the second column (Mem usage) the memory usage of the Python interpreter after that line has been executed. The third column (Increment) represents the difference in memory of the current line with respect to the last one. The last column (Line Contents) prints the code that has been profiled.

In [None]:
%mprun -f graph_encoder_embed Z,W = graph_encoder_embed(X_edg, case_10.Y, case_10.n)


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/local/lib/python3.7/dist-packages/memory_profiler.py", line 845, in enable
    sys.settrace(self.trace_memory_usage)

