This repository has been archived by the owner on Nov 28, 2023. It is now read-only.

Merge pull request #87 from DeepRank/successrate
added some utility codes
CunliangGeng committed Aug 12, 2019
2 parents eb8e42b + 3a21ae1 commit fb2e2c2
Showing 12 changed files with 1,125 additions and 620 deletions.
2 changes: 1 addition & 1 deletion deeprank/learn/NeuralNet.py
@@ -993,7 +993,7 @@ def _plot_boxplot_class(self,figname):
 for pts,t in zip(out,tar):
     r = F.softmax(torch.FloatTensor(pts), dim=0).data.numpy()
     data[t].append(r[1])
-    confusion[t][r[1]>0.5] += 1
+    confusion[t][bool(r[1]>0.5)] += 1

 #print(" {:5s}: {:s}".format(l,str(confusion)))

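The NeuralNet.py change above casts the threshold comparison to a plain Python bool before it is used as a list index. A minimal sketch of that counting step, assuming `confusion` is a 2x2 nested Python list indexed as [true class][predicted class] (the variable names below are illustrative, not taken from NeuralNet.py):

import numpy as np

# Hypothetical 2x2 confusion counter: rows = true class, columns = predicted class.
confusion = [[0, 0], [0, 0]]

true_class = 1
prob_class1 = np.float32(0.73)  # e.g. the softmax probability of class 1

# bool(...) turns the numpy comparison result into a plain Python bool (0 or 1),
# which is a safe index for a nested Python list.
confusion[true_class][bool(prob_class1 > 0.5)] += 1

print(confusion)  # [[0, 0], [0, 1]]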
33 changes: 27 additions & 6 deletions deeprank/learn/rankingMetrics.py
@@ -16,8 +16,8 @@ def hitrate(rs):
     Example:
-        >>> r = [0,1,1]
-        >>> hit_rate(r,nr)
+        >>> rs = [0,1,1]
+        >>> hitrate(rs)
     Attributes:
@@ -27,14 +27,34 @@ def hitrate(rs):
     Returns:
         hitrate (array): [recall@1,recall@2,...]
     """
-    nr = np.max((1,np.sum(rs)))
+    nr = np.max((1, np.sum(rs)))
     return np.cumsum(rs) / nr


+def success(rs):
+    """Success for positions ≤ k.
+    Example:
+        >>> rs = [0, 0, 1, 0, 1, 0]
+        >>> success(rs)
+        [0, 0, 1, 1, 1, 1]
+    Args:
+        rs (array): binary relevance array
+    Returns:
+        success (array): [success@≤1, success@≤2,...]
+    """
+    success = np.cumsum(rs) > 0
+
+    return success.astype(np.int)


 def avprec(rs):
-    return [average_precision(rs[:i]) for i in range(1,len(rs))]
+    return [average_precision(rs[:i]) for i in range(1, len(rs))]


-def recall(rs,nr):
+def recall(rs, nr):
     """recall rate
     First element is rank 1, Relevance is binary
@@ -56,6 +76,7 @@ def recall(rs,nr):

     return np.sum(rs)/nr


 def mean_reciprocal_rank(rs):
     """Score is reciprocal of the rank of the first relevant item
@@ -272,4 +293,4 @@ def ndcg_at_k(r, k, method=0):
     dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
     if not dcg_max:
         return 0.
-    return dcg_at_k(r, k, method) / dcg_max
+    return dcg_at_k(r, k, method) / dcg_max
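Taken together, the updated hitrate and the new success metric describe how quickly relevant models appear in a ranked list: hitrate is the cumulative recall at each rank, and success flips to 1 once the first relevant model has been seen. A small sketch on a toy relevance vector (binary relevance, as in the docstrings above; the sketch uses the plain int dtype instead of the np.int alias in the committed code):

import numpy as np

def hitrate(rs):
    # cumulative recall: fraction of all relevant items found in the top k
    nr = np.max((1, np.sum(rs)))
    return np.cumsum(rs) / nr

def success(rs):
    # 1 once at least one relevant item has appeared in the top k
    return (np.cumsum(rs) > 0).astype(int)

rs = [0, 1, 0, 1]       # ranks 2 and 4 are hits
print(hitrate(rs))      # [0.  0.5 0.5 1. ]
print(success(rs))      # [0 1 1 1]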
169 changes: 169 additions & 0 deletions deeprank/utils/cal_hitrate_successrate.py
@@ -0,0 +1,169 @@
import numpy as np
import pandas as pd
from deeprank.learn import rankingMetrics


def evaluate(data):
    '''
    Calculate success rate and hit rate.
    <INPUT>
    data: a data frame.
    label caseID modelID target DR HS
    Test 1AVX 1AVX_ranair-it0_5286 0 0.503823 6.980802
    Test 1AVX 1AVX_ti5-itw_354w 1 0.502845 -95.158100
    Test 1AVX 1AVX_ranair-it0_6223 0 0.511688 -11.961460
    <OUTPUT>
    out_df: a data frame.
    success: binary variable, indicating whether this case is a success when evaluating its top N models.
    out_df :
    label caseID success_DR hitRate_DR success_HS hitRate_HS
    train 1ZHI 1 0.1 0 0.01
    train 1ZHI 1 0.2 1 0.3
    where success =[0, 0, 1, 1, 1,...]: starting from rank 3 this case is a success
    '''

    out_df = pd.DataFrame()
    labels = data.label.unique()  # ['train', 'test', 'valid']

    for l in labels:
        # l = 'train', 'test' or 'valid'

        out_df_tmp = pd.DataFrame()

        df = data.loc[data.label == l].copy()
        methods = df.columns
        methods = methods[4:]
        df_grped = df.groupby('caseID')

        for M in methods:
            # df_sorted = df_one_case.apply(pd.DataFrame.sort_values, by= M, ascending=True)

            success = []
            hitrate = []
            caseIDs = []
            for caseID, df_one_case in df_grped:
                df_sorted = df_one_case.sort_values(by=M, ascending=True)
                hitrate.extend(rankingMetrics.hitrate(
                    df_sorted['target'].astype(np.int)))
                success.extend(rankingMetrics.success(
                    df_sorted['target'].astype(np.int)))
                caseIDs.extend([caseID] * len(df_one_case))

            # hitrate = df_sorted['target'].apply(rankingMetrics.hitrate)  # df_sorted['target']: class IDs for each model
            # success = hitrate.apply(rankingMetrics.success)  # success =[0, 0, 1, 1, 1,...]: starting from rank 3 this case is a success

            out_df_tmp['label'] = [l] * len(df)  # train, valid or test
            out_df_tmp['caseID'] = caseIDs
            out_df_tmp[f'success_{M}'] = success
            out_df_tmp[f'hitRate_{M}'] = hitrate

        out_df = pd.concat([out_df, out_df_tmp])

    return out_df


def ave_evaluate(data):
    '''
    Calculate the average of each column over all cases.
    INPUT:
    data =
    label caseID success_HS hitRate_HS success_DR hitRate_DR
    train 1AVX 0.0 0.0 0.0 0.0
    train 1AVX 1.0 1.0 1.0 1.0
    train 2ACB 0.0 0.0 0.0 0.0
    train 2ACB 1.0 1.0 1.0 1.0
    test 7CEI 0.0 0.0 0.0 0.0
    test 7CEI 1.0 1.0 1.0 1.0
    test 5ACD 0.0 0.0 0.0 0.0
    test 5ACD 1.0 1.0 1.0 1.0
    OUTPUT:
    new_data =
    label caseID success_HS hitRate_HS success_DR hitRate_DR
    train 1AVX 0.0 0.0 0.0 0.0
    train 1AVX 1.0 1.0 1.0 1.0
    train 2ACB 0.0 0.0 0.0 0.0
    train 2ACB 1.0 1.0 1.0 1.0
    test 7CEI 0.0 0.0 0.0 0.0
    test 7CEI 1.0 1.0 1.0 1.0
    test 5ACD 0.0 0.0 0.0 0.0
    test 5ACD 1.0 1.0 1.0 1.0
    '''

    new_data = pd.DataFrame()
    for l, perf_per_case in data.groupby('label'):
        # l = 'train', 'test' or 'valid'

        # count the model number for each case
        grouped = perf_per_case.groupby('caseID')
        num_models = grouped.apply(len)
        num_cases = len(grouped)

        # --
        top_N = min(num_models)
        perf_ave = pd.DataFrame()
        perf_ave['label'] = [l] * top_N

        for col in perf_per_case.columns[2:]:
            # perf_per_case.columns = ['label', 'caseID', 'success_HS', 'hitRate_HS', 'success_DR', 'hitRate_DR']
            perf_ave[col] = np.zeros(top_N)

            for _, perf_case in grouped:
                perf_ave[col] = perf_ave[col][0:top_N] + \
                    np.array(perf_case[col][0:top_N])

            perf_ave[col] = perf_ave[col]/num_cases

        new_data = pd.concat([new_data, perf_ave])

    return new_data


def add_rank(df):
    '''
    INPUT (a data frame):
    label success_DR hitRate_DR success_HS hitRate_HS
    Test 0.0 0.000000 0.0 0.000000
    Test 0.0 0.000000 1.0 0.012821
    Train 0.0 0.000000 1.0 0.012821
    Train 0.0 0.000000 1.0 0.025641
    OUTPUT:
    label success_DR hitRate_DR success_HS hitRate_HS rank
    Test 0.0 0.000000 0.0 0.000000 0.000949
    Test 0.0 0.000000 1.0 0.012821 0.001898
    Train 0.0 0.000000 1.0 0.012821 0.002846
    Train 0.0 0.000000 1.0 0.025641 0.003795
    '''

    # -- add the 'rank' column to df
    rank = []
    for _, df_per_label in df.groupby('label'):
        num_mol = len(df_per_label)
        rank_raw = np.array(range(num_mol)) + 1
        rank.extend(rank_raw/num_mol)
    df['rank'] = rank

    df['label'] = pd.Categorical(df['label'], categories=[
        'Train', 'Valid', 'Test'])

    return df
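A hedged usage sketch of the new module as a whole — evaluate, then ave_evaluate, then add_rank — on a tiny hand-made data frame. The column layout (label, caseID, modelID, target, then one column per scoring method) follows the docstrings above; the values and model IDs are illustrative, and the sketch assumes the new file is importable as deeprank.utils.cal_hitrate_successrate and that the numpy version still provides the np.int alias used by evaluate:

import pandas as pd
from deeprank.utils.cal_hitrate_successrate import evaluate, ave_evaluate, add_rank

# Toy input following the documented layout: label, caseID, modelID, target, DR, HS
data = pd.DataFrame({
    'label':   ['Test'] * 4,
    'caseID':  ['1AVX', '1AVX', '7CEI', '7CEI'],
    'modelID': ['1AVX_m1', '1AVX_m2', '7CEI_m1', '7CEI_m2'],
    'target':  [0, 1, 1, 0],
    'DR':      [0.51, 0.50, 0.49, 0.52],
    'HS':      [6.98, -95.16, -11.96, 3.21],
})

per_case = evaluate(data)          # per-case success/hit-rate curves for DR and HS
averaged = ave_evaluate(per_case)  # average those curves over all cases per label
ranked = add_rank(averaged)        # add a normalised 'rank' column (rank / #models)
print(ranked)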
48 changes: 48 additions & 0 deletions deeprank/utils/get_h5subset.py
@@ -0,0 +1,48 @@
#!/usr/bin/env python
"""
Extract first N groups of a hdf5 to a new hdf5 file.
Usage: python {0} <hdf5 input file> <hdf5 output file> <number of groups to write>
Example: python {0} ./001_1GPW.hdf5 ./001_1GPW_sub10.hdf5 10
"""
import sys
import h5py

USAGE = __doc__.format(__file__)


def check_input(args):
    if len(args) != 3:
        sys.stderr.write(USAGE)
        sys.exit(1)


def get_h5subset(fin, fout, n):
    """Extract first number of groups and write to a new hdf5 file.
    Args:
        fin (hdf5): input hdf5 file.
        fout (hdf5): output hdf5 file.
        n (int): first n groups to write.
    """
    n = int(n)
    h5 = h5py.File(fin, "r")
    h5out = h5py.File(fout, "w")
    print(f"First {n} groups in {fin}:")
    for i in list(h5)[0:n]:
        print(i)
        h5.copy(h5[i], h5out)

    print()
    print(f"Groups in {fout}:")
    print(list(h5out))
    h5.close()
    h5out.close()
    print()
    print(f"{fout} generated.")


if __name__ == "__main__":
    check_input(sys.argv[1:])
    fin, fout, n = sys.argv[1:]
    get_h5subset(fin, fout, n)
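For completeness, a short hedged usage sketch of the new script; the file paths are the illustrative ones from its docstring and are assumed to exist:

# From the command line, as documented in the module docstring:
#   python get_h5subset.py ./001_1GPW.hdf5 ./001_1GPW_sub10.hdf5 10
#
# Or from Python, assuming the module is importable as deeprank.utils.get_h5subset:
from deeprank.utils.get_h5subset import get_h5subset

get_h5subset("./001_1GPW.hdf5", "./001_1GPW_sub10.hdf5", 10)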
