In [1]:
import numpy as np
import pandas as pd
import argparse
from icecream import ic

def convert_clustering_to_net(clustering_result):
    net = []
    for i in range(clustering_result.shape[0]):
        for item in clustering_result[i,]:
            net.append([i, item, 1])
    net = np.array(net)
    return(net)


def build_network(characteristic_file_path, method='knn'):
    patient_characteristic = pd.read_csv(characteristic_file_path, header=0)
    clustering_data = np.array(patient_characteristic.iloc[:,1:26])
    # ic()
    # ic(clustering_data)

    from sklearn.metrics import silhouette_score
    if method == 'knn':
        from sklearn.neighbors import NearestNeighbors
        print('knn is used for network building.')
        
        clustering_scores = []
        for k in range(1,10):
            neighbors = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(clustering_data)
            distances, indices = neighbors.kneighbors(clustering_data)
            clustering_scores[k-1] = silhouette_score(clustering_data, indices)
        optimal_k = clustering_scores.index(max(clustering_scores))+1
        print('The best performing k is', optimal_k)

        neighbors = NearestNeighbors(n_neighbors=optimal_k, algorithm='ball_tree').fit(clustering_data)
        distances, indices = neighbors.kneighbors(clustering_data)
    net = convert_clustering_to_net(indices)
    return(net)

def main():
    net = build_network('../data/covid/COVID_encoded.csv')
    print('Saving network...')
    np.savetxt('../data/covid/net.txt', net, fmt='%i', delimiter='\t')
    # ic(net)
    # ic(clustering_result[1,])
    # print(clustering_result)


In [51]:
from sklearn.neighbors import NearestNeighbors
neighbors = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(clustering_data)
distances, indices = neighbors.kneighbors(clustering_data)

net = convert_clustering_to_net(indices)
net[,0]
from scipy import sparse
net = sparse.csr_matrix(net)
print(net)

[[  0   0   1]
 [  0  13   1]
 [  0  11   1]
 ...
 [104 102   1]
 [104  99   1]
 [104 103   1]]


In [71]:
row = net[:, 0]
column = net[:, 1]
data = net[:,2]
from scipy import sparse
sparse_net = sparse.csr_matrix((data, (row, column)), shape=(105, 105))
densed_net = sparse_net.toarray()

In [78]:
print(densed_net[1,])
# print(np.where(densed_net[0,] != 0)[1])
# print(densed_net[0,])

[1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [83]:
clustering_assignment = np.zeros(net.shape[0])
group_count = 1
round_finished = False

In [38]:
used_index = []
for i in range(densed_net.shape[0]):
    if i in used_index:
        continue
    else:
        while round_finished == False:
            current_non_zero_index = np.where(densed_net[i,] != 0)[1]
            related_samples = current_non_zero_index
            clustering_assignment[current_non_zero_index] = group_count
            group_count += 1
            print(current_non_zero_index)
        # used_index.append(current_non_zero_index)
        # print('no ok')

[ 1 11 13 16 18]
[ 0  4 11 13 14]
[12 19 27 28 36]
[29 31 37 38 42]
[ 6 15 21 22 25]
[ 9 19 28 33 34]
[ 4 15 21 22 25]
[39 41 43 46 51]
[20 26 30 32 50]
[27 33 34 42 44]
[52 54 55 57 62]
[ 0  1 13 14 17]
[ 2  5 19 28 36]
[ 0 16 17 18 23]
[11 17 18 23 24]
[ 4  6 21 22 25]
[13 17 18 23 24]
[13 16 18 23 24]
[13 16 17 23 24]
[ 2 12 28 33 36]
[ 8 26 30 50 56]
[ 6 22 25 32 35]
[ 6 21 25 32 35]
[13 16 17 18 24]
[14 16 17 18 23]
[ 6 21 22 32 35]
[ 8 20 27 30 56]
[ 9 28 33 44 49]
[12 19 27 33 36]
[31 38 42 45 47]
[ 8 26 32 51 53]
[14 29 37 38 42]
[21 25 35 40 43]
[ 9 27 34 44 49]
[ 9 33 38 42 44]
[21 25 32 40 43]
[12 19 27 28 49]
[31 34 38 42 48]
[31 37 42 47 48]
[41 52 55 62 66]
[32 35 43 46 51]
[ 7 39 67 71 74]
[31 37 38 47 48]
[32 40 46 51 53]
[ 9 34 47 48 49]
[29 47 48 54 57]
[32 40 43 51 53]
[38 42 45 48 54]
[42 44 47 54 57]
[ 9 44 54 57 58]
[56 59 65 71 74]
[40 43 46 53 61]
[55 60 62 64 68]
[40 43 46 51 61]
[45 47 48 55 57]
[52 54 57 60 64]
[50 59 66 69 72]
[47 54 55 60 64]
[49 63 64 68 7

In [37]:
used_index = []
for i in range(densed_net.shape[0]):
    if i in used_index:
        continue
    else:
        current_non_zero_index = np.where(densed_net[i,] != 0)[1]
        clustering_assignment[current_non_zero_index] = group_count
        group_count += 1

In [82]:
x = np.random.rand(10,2)
neighbors = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(x)
distances, indices = neighbors.kneighbors(x)

print(indices)

[[0 4 1]
 [1 0 4]
 [2 6 9]
 [3 5 9]
 [4 0 1]
 [5 9 1]
 [6 2 9]
 [7 5 1]
 [8 0 4]
 [9 2 6]]


In [4]:
feature_file = open('../data/covid/feature.txt', 'a')
feature_file.write(str(0)+'\t')
feature_file.close()