/
tfidf_thresholds.py
80 lines (70 loc) · 2.97 KB
/
tfidf_thresholds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# TODO: calculate tfidf thresholds
import sqlite3
import numpy as np
# Databases to summarise, in output order. Each entry is
# (path to the SQLite file, banner printed before its results or None).
DATABASES = (
    ('cellmesh/data/cellmesh.db', None),
    ('cellmesh/data/anatomy_mesh.db', 'ANATOMY RESULTS'),
)

# Species reported for every database, in output order. Each entry is
# (label used in the printed line, value matched against cell_gene.taxid).
SPECIES = (
    ('mouse', 10090),
    ('human', 9606),
    ('worm', 6239),
)

# Thresholds reported for every species. Each entry is
# (label used in the printed line, fraction of the sorted values at which
# the threshold is read). The '93th' spelling is kept verbatim so the
# script's output stays identical to what it has always printed.
PERCENTILES = (
    ('90th', 0.90),
    ('93th', 0.93),
)


def fetch_sorted_tfidf(cursor, taxid):
    """Return the tfidf values stored for one taxid, sorted ascending.

    Args:
        cursor: an open sqlite3 cursor on a database that has a
            ``cell_gene`` table with ``tfidf`` and ``taxid`` columns.
        taxid: the value to match against ``cell_gene.taxid``.

    Returns:
        A 1-D numpy array of the selected tfidf values in ascending order
        (empty when no row matches).
    """
    # Bind the taxid as a parameter instead of formatting it into the SQL.
    cursor.execute('SELECT tfidf FROM cell_gene WHERE taxid=?', (taxid,))
    values = np.array([row[0] for row in cursor.fetchall()])
    return np.sort(values)


def value_at_fraction(sorted_values, fraction):
    """Return the element located ``fraction`` of the way through an array.

    The index is ``int(len(sorted_values) * fraction)``, i.e. the same
    truncating rule the script has always used (no interpolation).

    Args:
        sorted_values: 1-D numpy array, already sorted ascending.
        fraction: position to read, e.g. 0.90 for the 90th percentile.

    Returns:
        The selected element, or None when ``sorted_values`` is empty.
    """
    count = sorted_values.shape[0]
    if count == 0:
        # Nothing to index; the caller decides how to report this.
        return None
    # Clamp so that a fraction of 1.0 reads the last element rather than
    # indexing one past the end.
    index = min(int(count * fraction), count - 1)
    return sorted_values[index]


def report_thresholds(db_path, banner=None):
    """Print the configured tfidf thresholds for every species in one database.

    Args:
        db_path: path of the SQLite database file to open.
        banner: optional line printed once before the results.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        if banner is not None:
            print(banner)
        for species, taxid in SPECIES:
            sorted_values = fetch_sorted_tfidf(cursor, taxid)
            if sorted_values.shape[0] == 0:
                # Previously this case crashed with an IndexError.
                print('no tfidf values found for', species)
                continue
            for label, fraction in PERCENTILES:
                print('{} percentile tfidf for {}:'.format(label, species),
                      value_at_fraction(sorted_values, fraction))
    finally:
        # Release the connection even if a query or computation fails.
        conn.close()


def main():
    """Report thresholds for every configured database, in order."""
    for db_path, banner in DATABASES:
        report_thresholds(db_path, banner)


if __name__ == '__main__':
    main()