Skip to content

Commit

Permalink
add static distance methods (#46)
Browse files Browse the repository at this point in the history
  • Loading branch information
rasbt committed Sep 27, 2017
1 parent 3dfb876 commit 0fecda8
Show file tree
Hide file tree
Showing 10 changed files with 489 additions and 206 deletions.
35 changes: 26 additions & 9 deletions biopandas/mol2/pandas_mol2.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,14 +227,35 @@ def rmsd(df1, df2, heavy_only=True):
rmsd = round((total.sum() / df1.shape[0])**0.5, 4)
return rmsd

def distance(self, df=None, xyz=(0.00, 0.00, 0.00)):
def distance(self, xyz=(0.00, 0.00, 0.00)):
"""Computes Euclidean distance between atoms in
self.df and a 3D point.
Parameters
----------
xyz : tuple (0.00, 0.00, 0.00)
X, Y, and Z coordinate of the reference center for the distance
computation
Returns
---------
pandas.Series : Pandas Series object containing the Euclidean
distance between the atoms in the atom section and `xyz`.
"""
return np.sqrt(np.sum(self.df[['x', 'y', 'z']]
.subtract(xyz, axis=1)**2, axis=1))

@staticmethod
def distance_df(df, xyz=(0.00, 0.00, 0.00)):
"""Computes Euclidean distance between atoms and a 3D point.
Parameters
----------
df : DataFrame, default: None
If a DataFrame is provided as an argument, uses this DataFrame
for the distance computation instead of `self.df`.
df : DataFrame
DataFrame containing entries similar to the PandasMol2.df
format for the
distance computation to the `xyz` reference coordinates.
xyz : tuple (0.00, 0.00, 0.00)
X, Y, and Z coordinate of the reference center for the distance
computation
Expand All @@ -245,10 +266,6 @@ def distance(self, df=None, xyz=(0.00, 0.00, 0.00)):
distance between the atoms in the atom section and `xyz`.
"""
if df is None:
use_df = self.df
else:
use_df = df

return np.sqrt(np.sum(use_df[['x', 'y', 'z']]
return np.sqrt(np.sum(df[['x', 'y', 'z']]
.subtract(xyz, axis=1)**2, axis=1))
2 changes: 1 addition & 1 deletion biopandas/mol2/tests/test_pandas_mol2.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def test_distance_external_df():

pdmol = PandasMol2().read_mol2(data_path)
new_df = pdmol.df.iloc[1:, :].copy()
assert round(pdmol.distance(df=new_df).values[0], 3) == 31.165
assert round(PandasMol2.distance_df(df=new_df).values[0], 3) == 31.165


def test_overwrite_df():
Expand Down
52 changes: 41 additions & 11 deletions biopandas/pdb/pandas_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,32 +412,62 @@ def amino3to1(self, record='ATOM',

return pd.concat((tmp.iloc[indices]['chain_id'], transl), axis=1)

def distance(self, df=None, xyz=(0.00, 0.00, 0.00), record='ATOM'):
def distance(self, xyz=(0.00, 0.00, 0.00), records=('ATOM', 'HETATM')):
"""Computes Euclidean distance between atoms and a 3D point.
Parameters
----------
df : DataFrame, default: None
If a DataFrame is provided as an argument, uses this DataFrame
for the distance computation instead of `self.df[record]`.
xyz : tuple, default: (0.00, 0.00, 0.00)
X, Y, and Z coordinate of the reference center for the distance
computation.
record : str, default: 'ATOM'
Specfies the record DataFrame. Only used if `df=None`.
records : iterable, default: ('ATOM', 'HETATM')
Specify which record sections to consider. For example, to consider
both protein and ligand atoms, set `records=('ATOM', 'HETATM')`.
To compute distances for an external DataFrame, use `distance_df` instead.
For downward compatibility, a string argument is still supported
but deprecated and will be removed in future versions.
Returns
---------
pandas.Series : Pandas Series object containing the Euclidean
distance between the atoms in the record section and `xyz`.
"""
if df is None:
use_df = self.df[record]
else:
use_df = df

return np.sqrt(np.sum(use_df[[
if isinstance(records, str):
warnings.warn('Using a string as `records` argument is '
'deprecated and will not be supported in future'
' versions. Please use a tuple or'
' other iterable instead', DeprecationWarning)
records = (records,)

df = pd.concat(objs=[self.df[i] for i in records])

return np.sqrt(np.sum(df[[
'x_coord', 'y_coord', 'z_coord']]
.subtract(xyz, axis=1)**2, axis=1))

@staticmethod
def distance_df(df, xyz=(0.00, 0.00, 0.00)):
"""Computes Euclidean distance between atoms and a 3D point.
Parameters
----------
df : DataFrame
DataFrame containing entries in the `PandasPdb.df['ATOM']`
or `PandasPdb.df['HETATM']` format for the
distance computation to the `xyz` reference coordinates.
xyz : tuple, default: (0.00, 0.00, 0.00)
X, Y, and Z coordinate of the reference center for the distance
computation.
Returns
---------
pandas.Series : Pandas Series object containing the Euclidean
distance between the atoms in the record section and `xyz`.
"""
return np.sqrt(np.sum(df[[
'x_coord', 'y_coord', 'z_coord']]
.subtract(xyz, axis=1)**2, axis=1))

Expand Down
18 changes: 15 additions & 3 deletions biopandas/pdb/tests/test_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import pandas as pd
from biopandas.pdb import PandasPdb
import os
from nose.tools import raises


def test_equal():
Expand All @@ -16,7 +15,20 @@ def test_equal():

p1t48 = PandasPdb()
p1t48.read_pdb(TESTDATA_1t48)
dist = p1t48.distance(xyz=(70.785, 15.477, 23.359), record='ATOM')
dist = p1t48.distance(xyz=(70.785, 15.477, 23.359), records=('ATOM',))

expect = pd.Series([2.533259, 1.520502, 0.000000, 1.257597, 1.252510],
index=[12, 13, 14, 15, 16])
assert dist[dist < 3].all() == expect.all()


def test_deprecated_str_arg():
TESTDATA_1t48 = os.path.join(os.path.dirname(__file__), 'data',
'1t48_995.pdb')

p1t48 = PandasPdb()
p1t48.read_pdb(TESTDATA_1t48)
dist = p1t48.distance(xyz=(70.785, 15.477, 23.359), records='ATOM')

expect = pd.Series([2.533259, 1.520502, 0.000000, 1.257597, 1.252510],
index=[12, 13, 14, 15, 16])
Expand All @@ -30,7 +42,7 @@ def test_use_external_df():
p1t48 = PandasPdb()
p1t48.read_pdb(TESTDATA_1t48)
new_df = p1t48.df['ATOM'].iloc[:-1, :].copy()
dist = p1t48.distance(df=new_df, xyz=(70.785, 15.477, 23.359))
dist = PandasPdb.distance_df(df=new_df, xyz=(70.785, 15.477, 23.359))

expect = pd.Series([2.533259, 1.520502, 0.000000, 1.257597],
index=[12, 13, 14, 15])
Expand Down
4 changes: 3 additions & 1 deletion docs/sources/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ The CHANGELOG for the current development version is available at

##### Changes

- `PandasPdb.distance` and `PandasMol2.distance` now accept external `DataFrames` to allow for more efficient distance computations on smaller `DataFrames` if desired.
- `PandasMol2.distance_df` was added as a static method that allows distance computations based on external data frames; its behavior is otherwise similar to `PandasMol2.distance`.
- `PandasPdb.distance_df` was added as a static method that allows distance computations based on external data frames; its behavior is otherwise similar to `PandasPdb.distance`.
- `PandasPdb.distance` now supports multiple record sections to be considered (e.g., `records=('ATOM', 'HETATM')` to include both protein and ligand in a query). Now also defaults to `records=('ATOM', 'HETATM')` for consistency with the impute method.
- `PandasPdb.get(...)` now supports external data frames and lets the user specify the record section to be considered (e.g., `records=('ATOM', 'HETATM')` to include both protein and ligand in a query). Now also defaults to `records=('ATOM', 'HETATM')` for consistency with the impute method.
- The `section` parameter of `PandasPdb.impute_element(...)` was renamed to `records` for API consistency.

Expand Down
29 changes: 25 additions & 4 deletions docs/sources/api_subpackages/biopandas.mol2.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,37 @@ Object for working with Tripos Mol2 structure files.

<hr>

*distance(df=None, xyz=(0.0, 0.0, 0.0))*
*distance(xyz=(0.0, 0.0, 0.0))*

Computes Euclidean distance between atoms in
self.df and a 3D point.

**Parameters**

- `xyz` : tuple (0.00, 0.00, 0.00)

X, Y, and Z coordinate of the reference center for the distance
computation

**Returns**

- `pandas.Series` : Pandas Series object containing the Euclidean

distance between the atoms in the atom section and `xyz`.

<hr>

*distance_df(df, xyz=(0.0, 0.0, 0.0))*

Computes Euclidean distance between atoms and a 3D point.

**Parameters**

- `df` : DataFrame, default: None
- `df` : DataFrame

If a DataFrame is provided as an argument, uses this DataFrame
for the distance computation instead of `self.df`.
DataFrame containing entries similar to the PandasMol2.df
format for the
distance computation to the `xyz` reference coordinates.

- `xyz` : tuple (0.00, 0.00, 0.00)

Expand Down
40 changes: 32 additions & 8 deletions docs/sources/api_subpackages/biopandas.pdb.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,25 +77,49 @@ Creates 1-letter amino acid codes from DataFrame

<hr>

*distance(df=None, xyz=(0.0, 0.0, 0.0), record='ATOM')*
*distance(xyz=(0.0, 0.0, 0.0), records=('ATOM', 'HETATM'))*

Computes Euclidean distance between atoms and a 3D point.

**Parameters**

- `df` : DataFrame, default: None

If a DataFrame is provided as an argument, uses this DataFrame
for the distance computation instead of `self.df[record]`.

- `xyz` : tuple, default: (0.00, 0.00, 0.00)

X, Y, and Z coordinate of the reference center for the distance
computation.

- `record` : str, default: 'ATOM'
- `records` : iterable, default: ('ATOM', 'HETATM')

Specfies the record DataFrame. Only used if `df=None`.
Specify which record sections to consider. For example, to consider
both protein and ligand atoms, set `records=('ATOM', 'HETATM')`.
To compute distances for an external DataFrame, use `distance_df` instead.
For downward compatibility, a string argument is still supported
but deprecated and will be removed in future versions.

**Returns**

- `pandas.Series` : Pandas Series object containing the Euclidean

distance between the atoms in the record section and `xyz`.

<hr>

*distance_df(df, xyz=(0.0, 0.0, 0.0))*

Computes Euclidean distance between atoms and a 3D point.

**Parameters**

- `df` : DataFrame

DataFrame containing entries in the `PandasPdb.df['ATOM']`
or `PandasPdb.df['HETATM']` format for the
distance computation to the `xyz` reference coordinates.

- `xyz` : tuple, default: (0.00, 0.00, 0.00)

X, Y, and Z coordinate of the reference center for the distance
computation.

**Returns**

Expand Down
Loading

0 comments on commit 0fecda8

Please sign in to comment.