This repository has been archived by the owner on Nov 28, 2023. It is now read-only.

added some utility codes #87

Merged · 35 commits · Aug 12, 2019
Changes from all commits · 35 commits
34d66d3
updated plot_utils.py
LilySnow Jul 2, 2019
adf0460
add SuccessHitRate.py to utils
LilySnow Jul 2, 2019
eb40e12
updates on plot_utils.py
LilySnow Jul 5, 2019
8e3e77b
updated plot_utils.py
LilySnow Jul 8, 2019
7829a47
add count_hits.py
LilySnow Jul 12, 2019
a513ae5
updated count_hits.py
LilySnow Jul 13, 2019
a02e32d
added run_slurmFLs.py for cartesius
LilySnow Jul 13, 2019
31103db
updated run_slurmFLs.py
LilySnow Jul 15, 2019
5035ed3
fixed issue #85
LilySnow Jul 16, 2019
7b6c0f5
rename get_subset.py to get_h5subset.py
LilySnow Jul 16, 2019
a95b7f8
tidy up code
LilySnow Jul 16, 2019
a0789a6
fixed the x-label of success rate plot
LilySnow Jul 16, 2019
4300fe6
update success function
CunliangGeng Jul 22, 2019
dd338a8
tidied up code
LilySnow Jul 25, 2019
a9bc9ac
Merge branch 'successrate' of https://github.com/DeepRank/deeprank in…
LilySnow Jul 25, 2019
e0642e1
tidy up code
LilySnow Jul 25, 2019
18ce03a
added counting cases_wo_hits to plot_utils.py
LilySnow Jul 25, 2019
23efdfd
Update rankingMetrics.py
CunliangGeng Jul 26, 2019
03b8590
Update rankingMetrics.py
CunliangGeng Jul 26, 2019
7a5c1ad
Delete count_hits.py
CunliangGeng Jul 26, 2019
413800c
Update get_h5subset.py
CunliangGeng Jul 26, 2019
23fa906
Delete get_subset.py
CunliangGeng Jul 26, 2019
9ad85e4
added unitest for plot_util.py
LilySnow Jul 26, 2019
6cf93c1
Merge branch 'successrate' of https://github.com/DeepRank/deeprank in…
LilySnow Jul 26, 2019
c44b9f4
add rpy2
CunliangGeng Jul 26, 2019
5705205
Update get_h5subset.py
CunliangGeng Jul 26, 2019
dac611e
Update test_hitrate_successrate.py
CunliangGeng Jul 26, 2019
c2dd6ab
Update setup.py
CunliangGeng Jul 26, 2019
1a0a2b1
Update setup.py
CunliangGeng Jul 26, 2019
67c0404
test import
CunliangGeng Jul 29, 2019
36e6cde
split plot_utils into calculation and plot
CunliangGeng Jul 29, 2019
058e867
Merge pull request #95 from DeepRank/test_successrate
CunliangGeng Jul 29, 2019
2ed7614
update h5subset
CunliangGeng Jul 29, 2019
435f02b
update plot_utils
CunliangGeng Jul 29, 2019
3a21ae1
update test_hitrate_sucrate
CunliangGeng Jul 29, 2019
2 changes: 1 addition & 1 deletion deeprank/learn/NeuralNet.py
@@ -975,7 +975,7 @@ def _plot_boxplot_class(self,figname):
            for pts,t in zip(out,tar):
                r = F.softmax(torch.FloatTensor(pts), dim=0).data.numpy()
                data[t].append(r[1])
-               confusion[t][r[1]>0.5] += 1
+               confusion[t][bool(r[1]>0.5)] += 1

            #print(" {:5s}: {:s}".format(l,str(confusion)))

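The one-line change above casts the comparison to a Python bool before indexing the confusion counts. The rationale is my inference rather than anything stated in the PR: a NumPy comparison such as r[1] > 0.5 yields np.bool_, which NumPy deprecated as a sequence index, while Python's bool is an int subclass and always indexes cleanly as 0 or 1. A minimal sketch:

import numpy as np

confusion = [[0, 0], [0, 0]]  # 2x2 confusion counts as nested lists
score = np.float32(0.7)       # e.g. a softmax probability

# score > 0.5 is np.bool_; bool(...) turns it into a plain 0/1 index
confusion[1][bool(score > 0.5)] += 1

print(confusion)  # [[0, 0], [0, 1]]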
33 changes: 27 additions & 6 deletions deeprank/learn/rankingMetrics.py
@@ -16,8 +16,8 @@ def hitrate(rs):

    Example:

-        >>> r = [0,1,1]
-        >>> hit_rate(r,nr)
+        >>> rs = [0,1,1]
+        >>> hitrate(rs)


    Attributes:
@@ -27,14 +27,34 @@ def hitrate(rs):
    Returns:
        hitrate (array): [recall@1, recall@2, ...]
    """
-    nr = np.max((1,np.sum(rs)))
+    nr = np.max((1, np.sum(rs)))
    return np.cumsum(rs) / nr


def success(rs):
    """Success for positions ≤ k.

    Example:
        >>> rs = [0, 0, 1, 0, 1, 0]
        >>> success(rs)
        [0, 0, 1, 1, 1, 1]

    Args:
        rs (array): binary relevance array

    Returns:
        success (array): [success@≤1, success@≤2, ...]
    """
    success = np.cumsum(rs) > 0

    return success.astype(np.int)


def avprec(rs):
-    return [average_precision(rs[:i]) for i in range(1,len(rs))]
+    return [average_precision(rs[:i]) for i in range(1, len(rs))]


-def recall(rs,nr):
+def recall(rs, nr):
    """recall rate
    First element is rank 1, relevance is binary

Expand All @@ -56,6 +76,7 @@ def recall(rs,nr):

    return np.sum(rs)/nr


def mean_reciprocal_rank(rs):
    """Score is reciprocal of the rank of the first relevant item

@@ -272,4 +293,4 @@ def ndcg_at_k(r, k, method=0):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
-    return dcg_at_k(r, k, method) / dcg_max
\ No newline at end of file
+    return dcg_at_k(r, k, method) / dcg_max
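To make the relationship between the two metrics concrete, here is a small usage sketch; the printed values are worked out by hand from the definitions above, and the import path is the one used by cal_hitrate_successrate.py below:

from deeprank.learn import rankingMetrics

# Binary relevance of models, already sorted from best to worst rank
rs = [0, 1, 0, 0, 1]

# hitrate: cumulative hits divided by total hits, i.e. recall@k
print(rankingMetrics.hitrate(rs))  # [0.  0.5 0.5 0.5 1. ]

# success: flips to 1 at the first rank k that contains a hit
print(rankingMetrics.success(rs))  # [0 1 1 1 1]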
169 changes: 169 additions & 0 deletions deeprank/utils/cal_hitrate_successrate.py
@@ -0,0 +1,169 @@
import numpy as np
[Member review comment] Extract the calculation of hit/success rate from plot_utils to this new file, which avoids CI failure due to the import of the rpy2 module.

import pandas as pd
from deeprank.learn import rankingMetrics


def evaluate(data):
    '''
    Calculate success rate and hit rate.

    <INPUT>
    data: a data frame.

        label  caseID  modelID               target  DR        HS
        Test   1AVX    1AVX_ranair-it0_5286  0       0.503823  6.980802
        Test   1AVX    1AVX_ti5-itw_354w     1       0.502845  -95.158100
        Test   1AVX    1AVX_ranair-it0_6223  0       0.511688  -11.961460

    <OUTPUT>
    out_df: a data frame.
    success: a binary variable indicating whether the case is a success
        when evaluating its top N models.

        out_df:
        label  caseID  success_DR  hitRate_DR  success_HS  hitRate_HS
        train  1ZHI    1           0.1         0           0.01
        train  1ZHI    1           0.2         1           0.3

    where success = [0, 0, 1, 1, 1, ...] means the case becomes a success
    starting from rank 3.
    '''

    out_df = pd.DataFrame()
    labels = data.label.unique()  # ['train', 'test', 'valid']

    for l in labels:
        # l = 'train', 'test' or 'valid'

        out_df_tmp = pd.DataFrame()

        df = data.loc[data.label == l].copy()
        methods = df.columns
        methods = methods[4:]  # skip label, caseID, modelID and target
        df_grped = df.groupby('caseID')

        for M in methods:
            # df_sorted = df_one_case.apply(pd.DataFrame.sort_values, by=M, ascending=True)

            success = []
            hitrate = []
            caseIDs = []
            for caseID, df_one_case in df_grped:
                df_sorted = df_one_case.sort_values(by=M, ascending=True)
                hitrate.extend(rankingMetrics.hitrate(
                    df_sorted['target'].astype(np.int)))
                success.extend(rankingMetrics.success(
                    df_sorted['target'].astype(np.int)))
                caseIDs.extend([caseID] * len(df_one_case))

            # hitrate = df_sorted['target'].apply(rankingMetrics.hitrate)  # df_sorted['target']: class IDs for each model
            # success = hitrate.apply(rankingMetrics.success)  # success = [0, 0, 1, 1, 1, ...]: a success starting from rank 3

            out_df_tmp['label'] = [l] * len(df)  # train, valid or test
            out_df_tmp['caseID'] = caseIDs
            out_df_tmp[f'success_{M}'] = success
            out_df_tmp[f'hitRate_{M}'] = hitrate

        out_df = pd.concat([out_df, out_df_tmp])

    return out_df


def ave_evaluate(data):
    '''
    Calculate the average of each column over all cases.

    INPUT:
    data =
        label  caseID  success_HS  hitRate_HS  success_DR  hitRate_DR

        train  1AVX    0.0         0.0         0.0         0.0
        train  1AVX    1.0         1.0         1.0         1.0

        train  2ACB    0.0         0.0         0.0         0.0
        train  2ACB    1.0         1.0         1.0         1.0

        test   7CEI    0.0         0.0         0.0         0.0
        test   7CEI    1.0         1.0         1.0         1.0

        test   5ACD    0.0         0.0         0.0         0.0
        test   5ACD    1.0         1.0         1.0         1.0

    OUTPUT:
    new_data =
        label  caseID  success_HS  hitRate_HS  success_DR  hitRate_DR

        train  1AVX    0.0         0.0         0.0         0.0
        train  1AVX    1.0         1.0         1.0         1.0

        train  2ACB    0.0         0.0         0.0         0.0
        train  2ACB    1.0         1.0         1.0         1.0

        test   7CEI    0.0         0.0         0.0         0.0
        test   7CEI    1.0         1.0         1.0         1.0

        test   5ACD    0.0         0.0         0.0         0.0
        test   5ACD    1.0         1.0         1.0         1.0

    '''

    new_data = pd.DataFrame()
    for l, perf_per_case in data.groupby('label'):
        # l = 'train', 'test' or 'valid'

        # count the number of models for each case
        grouped = perf_per_case.groupby('caseID')
        num_models = grouped.apply(len)
        num_cases = len(grouped)

        # --
        top_N = min(num_models)
        perf_ave = pd.DataFrame()
        perf_ave['label'] = [l] * top_N

        for col in perf_per_case.columns[2:]:
            # perf_per_case.columns = ['label', 'caseID', 'success_HS', 'hitRate_HS', 'success_DR', 'hitRate_DR']
            perf_ave[col] = np.zeros(top_N)

            for _, perf_case in grouped:
                perf_ave[col] = perf_ave[col][0:top_N] + \
                    np.array(perf_case[col][0:top_N])

            perf_ave[col] = perf_ave[col]/num_cases

        new_data = pd.concat([new_data, perf_ave])

    return new_data


def add_rank(df):
    '''
    INPUT (a data frame):
        label  success_DR  hitRate_DR  success_HS  hitRate_HS
        Test   0.0         0.000000    0.0         0.000000
        Test   0.0         0.000000    1.0         0.012821

        Train  0.0         0.000000    1.0         0.012821
        Train  0.0         0.000000    1.0         0.025641

    OUTPUT:
        label  success_DR  hitRate_DR  success_HS  hitRate_HS  rank
        Test   0.0         0.000000    0.0         0.000000    0.000949
        Test   0.0         0.000000    1.0         0.012821    0.001898

        Train  0.0         0.000000    1.0         0.012821    0.002846
        Train  0.0         0.000000    1.0         0.025641    0.003795

    '''

    # -- add the 'rank' column to df
    rank = []
    for _, df_per_label in df.groupby('label'):
        num_mol = len(df_per_label)
        rank_raw = np.array(range(num_mol)) + 1
        rank.extend(rank_raw/num_mol)
    df['rank'] = rank

    df['label'] = pd.Categorical(df['label'], categories=[
        'Train', 'Valid', 'Test'])

    return df
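A minimal end-to-end sketch of how these three helpers chain together, using a tiny frame in the shape shown in the evaluate docstring. The case/model IDs and score values here are made up for illustration, and the code assumes a NumPy version from the PR's era, since the module still uses np.int:

import pandas as pd
from deeprank.utils.cal_hitrate_successrate import evaluate, ave_evaluate, add_rank

data = pd.DataFrame({
    'label':   ['Test', 'Test', 'Test'],
    'caseID':  ['1AVX', '1AVX', '1AVX'],
    'modelID': ['1AVX_m1', '1AVX_m2', '1AVX_m3'],
    'target':  [0, 1, 0],              # binary relevance per model
    'DR':      [0.504, 0.503, 0.512],  # lower is better: sorted ascending
    'HS':      [6.98, -95.16, -11.96],
})

per_case = evaluate(data)          # per-case success/hitRate for DR and HS
averaged = ave_evaluate(per_case)  # average the curves over cases per label
ranked = add_rank(averaged)        # add a normalised 'rank' column in (0, 1]
print(ranked)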
48 changes: 48 additions & 0 deletions deeprank/utils/get_h5subset.py
@@ -0,0 +1,48 @@
#!/usr/bin/env python
[Member review comment] Will be updated to be able to extract not only the first 5 but the first N models.

"""
Extract first N groups of a hdf5 to a new hdf5 file.

Usage: python {0} <hdf5 input file> <hdf5 output file> <number of groups to write>
Example: python {0} ./001_1GPW.hdf5 ./001_1GPW_sub10.hdf5 10
"""
import sys
import h5py

USAGE = __doc__.format(__file__)


def check_input(args):
    if len(args) != 3:
        sys.stderr.write(USAGE)
        sys.exit(1)


def get_h5subset(fin, fout, n):
    """Extract first number of groups and write to a new hdf5 file.

    Args:
        fin (hdf5): input hdf5 file.
        fout (hdf5): output hdf5 file.
        n (int): first n groups to write.
    """
    n = int(n)
    h5 = h5py.File(fin, "r")
    h5out = h5py.File(fout, "w")
    print(f"First {n} groups in {fin}:")
    for i in list(h5)[0:n]:
        print(i)
        h5.copy(h5[i], h5out)

    print()
    print(f"Groups in {fout}:")
    print(list(h5out))
    h5.close()
    h5out.close()
    print()
    print(f"{fout} generated.")


if __name__ == "__main__":
    check_input(sys.argv[1:])
    fin, fout, n = sys.argv[1:]
    get_h5subset(fin, fout, n)
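Usage follows the module docstring. A quick sketch of calling the helper from Python instead of the command line, reusing the file names from the docstring example:

import h5py
from deeprank.utils.get_h5subset import get_h5subset

# Equivalent to: python get_h5subset.py ./001_1GPW.hdf5 ./001_1GPW_sub10.hdf5 10
get_h5subset("001_1GPW.hdf5", "001_1GPW_sub10.hdf5", 10)

with h5py.File("001_1GPW_sub10.hdf5", "r") as h5:
    assert len(list(h5)) <= 10  # only the first 10 groups were copied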