feat: add hist module for plotting raw hdf5 files features distributions #261

Merged
merged 28 commits into main from 237_transformation_module_gcroci2 on Jan 16, 2023
Changes from all commits

28 commits
bd2fe14
update minor in dataset.py doc string
gcroci2 Nov 29, 2022
60ab4e0
add notebook for development (to be deleted later)
gcroci2 Nov 29, 2022
5ce49a8
add first draft of transform module
gcroci2 Nov 29, 2022
9ce434c
Merge branch 'main' into 237_transformation_module_gcroci2
gcroci2 Dec 12, 2022
63a9b3c
Merge branch 'main' into 237_transformation_module_gcroci2
gcroci2 Dec 14, 2022
9d512da
add multiple hdf5 files option to hdf5_to_pandas
gcroci2 Dec 14, 2022
4a14621
modify how pandas df is built for saving it in a feather/parquet file f…
gcroci2 Dec 14, 2022
5737189
add my comment to delete later
gcroci2 Dec 14, 2022
bf1b3d9
add reset index to pandas df in exporters
gcroci2 Dec 14, 2022
376804a
improve logic in hdf5_to_pandas
gcroci2 Dec 14, 2022
8981ee2
update development notebook
gcroci2 Dec 14, 2022
71b0402
add utility functions
gcroci2 Jan 3, 2023
988524b
add dependencies
gcroci2 Jan 3, 2023
9ba66ff
add tests for new functions in transform.py
gcroci2 Jan 3, 2023
d54512d
delete notebook
gcroci2 Jan 3, 2023
22223df
improve plotting function
gcroci2 Jan 4, 2023
d499ddc
update tests
gcroci2 Jan 4, 2023
3401460
fix prospector errors
gcroci2 Jan 4, 2023
24d2b1f
Merge branch 'main' into 237_transformation_module_gcroci2
gcroci2 Jan 4, 2023
8583aca
replace plotly with matplotlib to handle big data
gcroci2 Jan 4, 2023
ef57d57
update tests
gcroci2 Jan 4, 2023
6877185
remove overly verbose warning
Jan 5, 2023
3d32bee
remove plotly dependencies
gcroci2 Jan 12, 2023
4bf8974
merge with main
gcroci2 Jan 12, 2023
b2107c5
renaming scripts
gcroci2 Jan 13, 2023
713932e
uniform docstring to google style
gcroci2 Jan 13, 2023
fbcd6c6
add details to save_hist docstring
gcroci2 Jan 13, 2023
2d29bdc
change name in tests for hist module
gcroci2 Jan 13, 2023
8 changes: 3 additions & 5 deletions deeprankcore/dataset.py
@@ -247,7 +247,7 @@ def __init__( # pylint: disable=too-many-arguments
Automatically set to 'regress' if the target is 'irmsd', 'lrmsd', 'fnat' or 'dockq'.

features (Union[List[str], str], optional): Consider all pre-computed features ("all") or some defined node features
(provide a list, example: ["res_type", "polarity", "bsa"]). The complete list can be found in `deeprankcore.domain.features`.
(provide a list, example: ["res_type", "polarity", "bsa"]). The complete list can be found in `deeprankcore.domain.gridstorage`.

classes (Union[List[str], List[int], List[float]], optional): Define the dataset target classes in classification mode. Defaults to [0, 1].

@@ -419,10 +419,10 @@ def __init__( # pylint: disable=too-many-arguments, too-many-locals
Automatically set to 'regress' if the target is 'irmsd', 'lrmsd', 'fnat' or 'dockq'.

node_features (Union[List[str], str, optional): Consider all pre-computed node features ("all") or some defined node features
(provide a list, example: ["res_type", "polarity", "bsa"]). The complete list can be found in `deeprankcore.domain.features`.
(provide a list, example: ["res_type", "polarity", "bsa"]). The complete list can be found in `deeprankcore.domain.nodestorage`.

edge_features (Union[List[str], str, optional): Consider all pre-computed edge features ("all") or some defined edge features
(provide a list, example: ["dist", "coulomb"]). The complete list can be found in `deeprankcore.domain.features`.
(provide a list, example: ["dist", "coulomb"]). The complete list can be found in `deeprankcore.domain.edgestorage`.

clustering_method (str, optional): "mcl" for Markov cluster algorithm (see https://micans.org/mcl/),
or "louvain" for Louvain method (see https://en.wikipedia.org/wiki/Louvain_method).
@@ -577,8 +577,6 @@ def load_one_graph(self, fname: str, entry_name: str) -> Data: # pylint: disabl
_log.warning(f"no clustering/{self.clustering_method} detected")
else:
_log.warning("no clustering group found")
else:
_log.warning("no cluster method set")

# load
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y, pos=pos)
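For context on the docstring changes above, a minimal, hypothetical usage sketch. The class name GraphDataset, the import path, and the hdf5_path keyword are assumptions inferred from the __init__ shown in this hunk, not something the diff confirms; treat it as illustrative only.

from deeprankcore.dataset import GraphDataset  # assumed import path

# Hypothetical instantiation; feature names follow the docstring examples, and the
# complete lists live in deeprankcore.domain.nodestorage / deeprankcore.domain.edgestorage.
dataset = GraphDataset(
    hdf5_path="tests/data/hdf5/test.hdf5",          # assumed keyword name
    node_features=["res_type", "polarity", "bsa"],
    edge_features=["dist", "coulomb"],
    clustering_method="mcl",                        # "mcl" or "louvain", per the docstring
)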
198 changes: 198 additions & 0 deletions deeprankcore/tools/hist.py
@@ -0,0 +1,198 @@
import h5py
import pandas as pd
import logging

from typing import List, Union, Tuple
from deeprankcore.domain import (
edgestorage as Efeat,
nodestorage as Nfeat,
targetstorage as targets)
import numpy as np
import matplotlib.pyplot as plt


_log = logging.getLogger(__name__)


def hdf5_to_pandas( # noqa: MC0001, pylint: disable=too-many-locals
    hdf5_path: Union[str, List],
    subset: List[str] = None,
    node_features: Union[List[str], str] = "all",
    edge_features: Union[List[str], str] = "all",
    target_features: Union[List[str], str] = "all"
) -> pd.DataFrame:
"""
Args:
hdf5_path (str or list): Path to hdf5 file(s). For multiple hdf5 files,
insert the paths in a list.

subset (list, optional): list of keys from hdf5 file to include. Defaults to None (meaning include all).

node_features (str or list, optional): consider all pre-computed node features ("all")
or some defined node features (provide a list, example: ["res_type", "polarity", "bsa"]).
The complete list can be found in deeprankcore/domain/nodestorage.py

edge_features (list, optional): consider all pre-computed edge features ("all")
or some defined edge features (provide a list, example: ["dist", "coulomb"]).
The complete list can be found in deeprankcore/domain/edgestorage.py

target_features (list, optional): consider all pre-computed target features ("all")
or some defined target features (provide a list, example: ["binary", "capri_class"]).
The complete list (only of the pre-defined ones) can be found in deeprankcore/domain/targetstorage.py

Returns:
df_final (pd.DataFrame): Pandas DataFrame containing the selected features as columns per all data points in
hdf5_path files.
"""
    if not isinstance(hdf5_path, list):
        hdf5_path = [hdf5_path]

    df_final = pd.DataFrame()

    for fname in hdf5_path:
        with h5py.File(fname, 'r') as f:

            mol_key = list(f.keys())[0]

            if subset is not None:
                mol_keys = [mol for mol, _ in f.items() if mol in subset]
            else:
                mol_keys = [mol for mol, _ in f.items()]

            # read available node features
            available_node_features = list(f[f"{mol_key}/{Nfeat.NODE}/"].keys())
            available_node_features = [key for key in available_node_features if key[0] != '_']  # ignore metafeatures

            # read available edge features
            available_edge_features = list(f[f"{mol_key}/{Efeat.EDGE}/"].keys())
            available_edge_features = [key for key in available_edge_features if key[0] != '_']  # ignore metafeatures

            # read available targets
            available_target_features = list(f[f"{mol_key}/{targets.VALUES}/"].keys())

            if node_features == "all":
                node_features = available_node_features
            if edge_features == "all":
                edge_features = available_edge_features
            if target_features == "all":
                target_features = available_target_features

            if not isinstance(node_features, list):
                node_features = [node_features]
            if not isinstance(edge_features, list):
                edge_features = [edge_features]
            if not isinstance(target_features, list):
                target_features = [target_features]

            # check node features
            for feat in node_features:
                if feat not in available_node_features:
                    raise ValueError(
                        f"The node feature _{feat}_ was not found in the file {hdf5_path}."
                        f"\nAvailable node features: {available_node_features}"
                    )
            # check edge features
            for feat in edge_features:
                if feat not in available_edge_features:
                    raise ValueError(
                        f"The edge feature _{feat}_ was not found in the file {hdf5_path}."
                        f"\nAvailable edge features: {available_edge_features}"
                    )
            # check target features
            for feat in target_features:
                if feat not in available_target_features:
                    raise ValueError(
                        f"The target feature _{feat}_ was not found in the file {hdf5_path}."
                        f"\nAvailable target features: {available_target_features}"
                    )

            df_dict = {}
            df_dict['id'] = mol_keys

            for feat in node_features + edge_features + target_features:
                if feat in node_features:
                    feat_type = 'node_features'
                elif feat in edge_features:
                    feat_type = 'edge_features'
                else:
                    feat_type = 'target_values'

                if f[mol_key][feat_type][feat][()].ndim == 2:
                    for i in range(f[mol_key][feat_type][feat][:].shape[1]):
                        df_dict[feat + '_' + str(i)] = [f[mol_key][feat_type][feat][:][:, i] for mol_key in mol_keys]
                else:
                    df_dict[feat] = [
                        f[mol_key][feat_type][feat][:]
                        if f[mol_key][feat_type][feat][()].ndim == 1
                        else f[mol_key][feat_type][feat][()] for mol_key in mol_keys]

            df = pd.DataFrame(data=df_dict)

        df_final = pd.concat([df_final, df])

    df_final.reset_index(drop=True, inplace=True)

    return df_final


def save_hist(
    df: pd.DataFrame,
    features: Union[str, List[str]],
    fname: str,
    bins: Union[int, List[float], str] = 10,
    figsize: Tuple = (15, 15)
):
"""
Args
----------
df (pd.DataFrame): Pandas DataFrame object generated using hdf5_to_pandas function.

features (str or list): features to be plotted.

fname (str): str or path-like or binary file-like object.

bins (int or sequence or str): if bins is an integer, it defines the number of equal-width bins in the range.
If bins is a sequence, it defines the bin edges, including the left edge of the first bin and the right edge
of the last bin; in this case, bins may be unequally spaced. All but the last (righthand-most) bin is half-open.
If bins is a string, it is one of the binning strategies supported by numpy.histogram_bin_edges:
'auto', 'fd', 'doane', 'scott', 'stone', 'rice', 'sturges', or 'sqrt'.
Defaults to 10.

figsize (tuple): saved figure sizes, defaults to (15, 15).
"""
    if not isinstance(features, list):
        features = [features]

    means = [
        round(np.concatenate(df[feat].values).mean(), 1) if isinstance(df[feat].values[0], np.ndarray)
        else round(df[feat].values.mean(), 1)
        for feat in features]
    devs = [
        round(np.concatenate(df[feat].values).std(), 1) if isinstance(df[feat].values[0], np.ndarray)
        else round(df[feat].values.std(), 1)
        for feat in features]

    if len(features) > 1:

        fig, axs = plt.subplots(len(features), figsize=figsize)

        for row, feat in enumerate(features):

            if isinstance(df[feat].values[0], np.ndarray):
                axs[row].hist(np.concatenate(df[feat].values), bins=bins)
            else:
                axs[row].hist(df[feat].values, bins=bins)
            axs[row].set(xlabel=f'{feat} (mean {means[row]}, std {devs[row]})', ylabel='Count')
        fig.tight_layout()

    else:

        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(111)
        ax.hist(df[features[0]].values, bins=bins)
        ax.set(xlabel=f'{features[0]} (mean {means[0]}, std {devs[0]})', ylabel='Count')

    fig.tight_layout()
    fig.savefig(fname)
    plt.close(fig)
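A minimal end-to-end sketch of the new module, mirroring the tests added further down; the feature names come from the test hdf5 file used in this PR, while the output filename is arbitrary.

from deeprankcore.tools.hist import hdf5_to_pandas, save_hist

# Collect a few node, edge and target features from the test file into one DataFrame ...
df = hdf5_to_pandas(
    "tests/data/hdf5/test.hdf5",
    node_features='charge',
    edge_features=['distance', 'same_chain'],
    target_features='binary')

# ... then write one histogram per selected feature into a single figure.
save_hist(df, ['charge', 'distance'], fname='distributions.png', bins=20)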
1 change: 1 addition & 0 deletions deeprankcore/utils/exporters.py
@@ -283,3 +283,4 @@ def process( # pylint: disable=too-many-arguments
df_epoch = pd.DataFrame(data=d_epoch)

self.df = pd.concat([self.df, df_epoch])
self.df.reset_index(drop=True, inplace=True)
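A standalone pandas illustration (not part of the PR) of why the added reset_index call is useful: pd.concat keeps each frame's original index, so appending one DataFrame per epoch would otherwise accumulate duplicate index labels. Column names here are invented for the example.

import pandas as pd

a = pd.DataFrame({"epoch": [0], "loss": [0.9]})
b = pd.DataFrame({"epoch": [1], "loss": [0.7]})

merged = pd.concat([a, b])
print(list(merged.index))   # [0, 0] -- each frame kept its own index label

merged.reset_index(drop=True, inplace=True)
print(list(merged.index))   # [0, 1] -- clean RangeIndex, as after the added line above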
43 changes: 43 additions & 0 deletions tests/tools/test_hist.py
@@ -0,0 +1,43 @@
from tempfile import mkdtemp
from shutil import rmtree
import os
import h5py
from deeprankcore.tools.hist import hdf5_to_pandas, save_hist

def test_hdf5_to_pandas():

    hdf5_path = "tests/data/hdf5/test.hdf5"
    df = hdf5_to_pandas(
        hdf5_path,
        node_features='charge',
        edge_features=['distance', 'same_chain'],
        target_features='binary')

    with h5py.File(hdf5_path, 'r') as f:
        keys = list(f.keys())

    cols = list(df.columns)
    cols.sort()

    assert df.shape[0] == len(keys)
    assert df.shape[1] == 5
    assert cols == ['binary', 'charge', 'distance', 'id', 'same_chain']

    df = hdf5_to_pandas(hdf5_path, subset=keys[2:])

    assert df.shape[0] == len(keys[2:])


def test_save_hist():

    output_directory = mkdtemp()
    fname = os.path.join(output_directory, "test.png")
    hdf5_path = "tests/data/hdf5/test.hdf5"

    df = hdf5_to_pandas(
        hdf5_path)

    save_hist(df, ['charge', 'binary'], fname=fname)

    assert len(os.listdir(output_directory)) > 0

    rmtree(output_directory)
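To close, a hedged sketch of the three bins forms documented in save_hist (integer count, explicit edges, numpy strategy name). It assumes df comes from hdf5_to_pandas as in the tests above; the output filenames are arbitrary.

from deeprankcore.tools.hist import hdf5_to_pandas, save_hist

df = hdf5_to_pandas("tests/data/hdf5/test.hdf5")

# integer: 30 equal-width bins per histogram
save_hist(df, ['charge', 'binary'], fname='hist_int_bins.png', bins=30)

# sequence: explicit (possibly unequal) bin edges
save_hist(df, ['charge', 'binary'], fname='hist_edge_bins.png', bins=[-1.0, -0.5, 0.0, 0.5, 1.0])

# string: a strategy understood by numpy.histogram_bin_edges
save_hist(df, ['charge', 'binary'], fname='hist_str_bins.png', bins='auto')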