-
Notifications
You must be signed in to change notification settings - Fork 6
/
clusterExpression.py
69 lines (60 loc) · 1.89 KB
/
clusterExpression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import csv
import sys
import numpy as np
import scipy.cluster.hierarchy as hac
import matplotlib.pyplot as plt
def read_data_as_dict(file):
    '''
    Builds a dictionary out of the expression CSV.

    The first CSV row is the header: its first field is discarded and the
    remaining fields are used as cell line names. In every following row the
    first field is used as the gene name and the remaining fields are the
    expression values, matched to the cell lines by position.

    The dict has the following format:
    {'CELL_LINE_1':{
        'GENE1':1234, 'GENE2':2345, ...
        }
     'CELL_LINE_2':{
        'GENE1':1234, 'GENE2':2345, ...
        }
        .
        .
        .
    }

    An empty file yields an empty dict, and blank rows are skipped.

    :param file: Path to the csv file.
    :return: Dict (as above format).
    :raises ValueError: If an expression value cannot be converted to float.
    :raises IndexError: If a row holds more values than the header has
        cell lines.
    '''
    # newline='' lets the csv module do its own line-ending handling.
    with open(file, newline='') as csv_file:
        csv_reader = csv.reader(csv_file)
        header = next(csv_reader, [])
        # Drop the label of the gene-name column; slicing (unlike pop(0))
        # is also safe when the file is empty.
        cells = header[1:]
        expression = {cell: {} for cell in cells}
        for row in csv_reader:
            # csv.reader yields [] for blank lines; there is no gene to read.
            if not row:
                continue
            gene = row[0]
            for k, value in enumerate(row[1:]):
                expression[cells[k]][gene] = float(value)
    return expression
def convert_dict_to_matrix(cell_dict):
    '''
    Turn the nested dictionary into the numpy matrix the clustering
    algorithms expect.

    Each top-level entry of the dict becomes one row; the row holds that
    entry's values in the iteration order of its inner dict.

    :param cell_dict: Dict from read_data_as_dict.
    :return: a numpy matrix s.t. every row is a vector of exp. of genes
    '''
    rows = [list(genes.values()) for genes in cell_dict.values()]
    return np.array(rows)
if __name__ == '__main__':
    # Script entry point: cluster the expression CSV given on the command
    # line and display the resulting dendrogram.
    if len(sys.argv) < 2:
        print("Please run as python clusterExpression.py <path_to_genx.csv>")
        # Without a path argument there is nothing to cluster; stop here
        # instead of failing below on sys.argv[1].
        sys.exit(1)

    # Read the csv as a 2d-dictionary/matrix
    dict_matrix = read_data_as_dict(sys.argv[1])
    matrix = convert_dict_to_matrix(dict_matrix)

    # Perform linkage / clustering using Ward's method
    # https://en.wikipedia.org/wiki/Ward%27s_method
    z = hac.linkage(matrix, method='ward')

    # Construct a dendrogram from that clustering, labelling each leaf with
    # the corresponding top-level key of the dictionary.
    hac.dendrogram(z, labels=list(dict_matrix.keys()))

    # Show a plot of the dendrogram.
    plt.show()