This repository has been archived by the owner on Nov 28, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 27
added some utility codes #87
Merged
Merged
Changes from all commits
Commits
Show all changes
35 commits
Select commit
Hold shift + click to select a range
34d66d3
updated plot_utils.py
LilySnow adf0460
add SuccessHitRate.py to utils
LilySnow eb40e12
updates on plot_utils.py
LilySnow 8e3e77b
updated plot_utils.py
LilySnow 7829a47
add count_hits.py
LilySnow a513ae5
updated count_hits.py
LilySnow a02e32d
added run_slurmFLs.py for cartesius
LilySnow 31103db
updated run_slurmFLs.py
LilySnow 5035ed3
fixed issue85
LilySnow 7b6c0f5
rename get_subset.py get_h5subset.py
LilySnow a95b7f8
tidy up code
LilySnow a0789a6
fixed the x-label of success rate plot
LilySnow 4300fe6
update success function
CunliangGeng dd338a8
tidied up code
LilySnow a9bc9ac
Merge branch 'successrate' of https://github.com/DeepRank/deeprank in…
LilySnow e0642e1
tidy up code
LilySnow 18ce03a
added counting cases_wo_hits to plot_utils.py
LilySnow 23efdfd
Update rankingMetrics.py
CunliangGeng 03b8590
Update rankingMetrics.py
CunliangGeng 7a5c1ad
Delete count_hits.py
CunliangGeng 413800c
Update get_h5subset.py
CunliangGeng 23fa906
Delete get_subset.py
CunliangGeng 9ad85e4
added unittest for plot_utils.py
LilySnow 6cf93c1
Merge branch 'successrate' of https://github.com/DeepRank/deeprank in…
LilySnow c44b9f4
add rpy2
CunliangGeng 5705205
Update get_h5subset.py
CunliangGeng dac611e
Update test_hitrate_successrate.py
CunliangGeng c2dd6ab
Update setup.py
CunliangGeng 1a0a2b1
Update setup.py
CunliangGeng 67c0404
test import
CunliangGeng 36e6cde
split plot_utils into calculation and plot
CunliangGeng 058e867
Merge pull request #95 from DeepRank/test_successrate
CunliangGeng 2ed7614
update h5subset
CunliangGeng 435f02b
update plot_utils
CunliangGeng 3a21ae1
update test_hitrate_sucrate
CunliangGeng File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
import numpy as np | ||
import pandas as pd | ||
from deeprank.learn import rankingMetrics | ||
|
||
|
||
def evaluate(data):
    '''Calculate the success rate and hit rate for every scoring method.

    Args:
        data (pd.DataFrame): input frame whose first four columns are
            ['label', 'caseID', 'modelID', 'target']; every remaining
            column is treated as a scoring method (lower score = better
            rank), e.g.

            label caseID modelID              target DR       HS
            Test  1AVX   1AVX_ranair-it0_5286 0      0.503823 6.980802
            Test  1AVX   1AVX_ti5-itw_354w    1      0.502845 -95.158100

    Returns:
        pd.DataFrame: one row per model with columns
            ['label', 'caseID', 'success_<M>', 'hitRate_<M>', ...].
            success is binary, e.g. [0, 0, 1, 1, 1, ...] means the case
            becomes a success starting from rank 3 of its sorted models.
    '''
    out_df = pd.DataFrame()
    labels = data.label.unique()  # e.g. ['Train', 'Test', 'Valid']

    for l in labels:
        out_df_tmp = pd.DataFrame()

        df = data.loc[data.label == l].copy()
        # score columns start after ['label', 'caseID', 'modelID', 'target']
        methods = df.columns[4:]
        df_grped = df.groupby('caseID')

        for M in methods:
            success = []
            hitrate = []
            caseIDs = []
            for caseID, df_one_case in df_grped:
                # lower score means better rank for every method
                df_sorted = df_one_case.sort_values(by=M, ascending=True)
                # np.int was removed in NumPy 1.24; the builtin int is the
                # documented replacement and behaves identically here
                targets = df_sorted['target'].astype(int)
                hitrate.extend(rankingMetrics.hitrate(targets))
                success.extend(rankingMetrics.success(targets))
                caseIDs.extend([caseID] * len(df_one_case))

            out_df_tmp['label'] = [l] * len(df)  # train, valid or test
            out_df_tmp['caseID'] = caseIDs
            out_df_tmp[f'success_{M}'] = success
            out_df_tmp[f'hitRate_{M}'] = hitrate

        out_df = pd.concat([out_df, out_df_tmp])

    return out_df
|
||
|
||
def ave_evaluate(data):
    '''Average each metric column over all cases, per label group.

    For every label ('Train'/'Valid'/'Test'), each case is truncated to
    the model count of the shortest case (top_n), then the metric
    columns are averaged rank-by-rank across cases.

    Args:
        data (pd.DataFrame): columns
            ['label', 'caseID', 'success_<M>', 'hitRate_<M>', ...],
            one row per model, rows grouped by case.

    Returns:
        pd.DataFrame: top_n rows per label with the 'label' column plus
            every metric column averaged over the cases of that label.
    '''
    averaged_blocks = []
    for label, label_df in data.groupby('label'):
        by_case = label_df.groupby('caseID')
        n_cases = len(by_case)
        # every case contributes the same number of leading rows
        top_n = by_case.size().min()

        block = pd.DataFrame()
        block['label'] = [label] * top_n

        # columns[:2] are 'label' and 'caseID'; the rest are metrics
        for col in label_df.columns[2:]:
            running_sum = np.zeros(top_n)
            for _, case_df in by_case:
                running_sum = running_sum + np.array(case_df[col][0:top_n])
            block[col] = running_sum / n_cases

        averaged_blocks.append(block)

    if not averaged_blocks:
        return pd.DataFrame()
    return pd.concat(averaged_blocks)
|
||
|
||
def add_rank(df):
    '''Add a normalized 'rank' column (position / group size per label).

    Args:
        df (pd.DataFrame): frame with a 'label' column, e.g.
            label  success_DR  hitRate_DR  success_HS  hitRate_HS
            Test   0.0         0.000000    0.0         0.000000
            Train  0.0         0.000000    1.0         0.012821

    Returns:
        pd.DataFrame: the same frame (modified in place) with an extra
            'rank' column in (0, 1], and 'label' converted to an ordered
            Categorical so downstream plots facet as Train/Valid/Test.
    '''
    # Index-aligned computation: the previous positional assignment
    # (building one flat list in sorted-groupby order) silently produced
    # wrong ranks when rows were not contiguous and alphabetically
    # ordered by label. cumcount/transform align by index, so the result
    # is correct for any row order and identical in the well-ordered case.
    grouped = df.groupby('label')
    df['rank'] = (grouped.cumcount() + 1) / grouped['label'].transform('size')

    # fixed category order controls facet/legend order in the plots
    df['label'] = pd.Categorical(df['label'],
                                 categories=['Train', 'Valid', 'Test'])

    return df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
#!/usr/bin/env python | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will be updated to be able to extract not only 5 but first N models. |
||
""" | ||
Extract first N groups of a hdf5 to a new hdf5 file. | ||
|
||
Usage: python {0} <hdf5 input file> <hdf5 output file> <number of groups to write> | ||
Example: python {0} ./001_1GPW.hdf5 ./001_1GPW_sub10.hdf5 10 | ||
""" | ||
import sys | ||
import h5py | ||
|
||
USAGE = __doc__.format(__file__) | ||
|
||
|
||
def check_input(args):
    """Abort with the usage message unless exactly three CLI args are given."""
    if len(args) == 3:
        return
    sys.stderr.write(USAGE)
    sys.exit(1)
|
||
|
||
def get_h5subset(fin, fout, n):
    """Extract the first n groups of an hdf5 file and write them to a new file.

    Args:
        fin (str): path to the input hdf5 file.
        fout (str): path to the output hdf5 file (overwritten if present).
        n (int or str): number of leading groups to copy.
    """
    n = int(n)
    # context managers guarantee both files are closed even if copy() raises
    # (the previous version leaked both handles on any error)
    with h5py.File(fin, "r") as h5, h5py.File(fout, "w") as h5out:
        print(f"First {n} groups in {fin}:")
        for group_name in list(h5)[0:n]:
            print(group_name)
            h5.copy(h5[group_name], h5out)

        print()
        print(f"Groups in {fout}:")
        print(list(h5out))
    print()
    print(f"{fout} generated.")
|
||
|
||
if __name__ == "__main__":
    # validate the CLI arguments, then copy the requested number of groups
    check_input(sys.argv[1:])
    input_path, output_path, num_groups = sys.argv[1:]
    get_h5subset(input_path, output_path, num_groups)
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Extract the calculation of hit/success rate from plot_utils into this new file, which avoids CI failures caused by importing the rpy2 module.