add static distance methods (#46)

BioPandas · Sep 27, 2017 · 0fecda8 · 0fecda8
1 parent 3dfb876
commit 0fecda8
Show file tree

Hide file tree

Showing 10 changed files with 489 additions and 206 deletions.
diff --git a/biopandas/mol2/pandas_mol2.py b/biopandas/mol2/pandas_mol2.py
@@ -227,14 +227,35 @@ def rmsd(df1, df2, heavy_only=True):
         rmsd = round((total.sum() / df1.shape[0])**0.5, 4)
         return rmsd
 
-    def distance(self, df=None, xyz=(0.00, 0.00, 0.00)):
+    def distance(self, xyz=(0.00, 0.00, 0.00)):
+        """Computes Euclidean distance between atoms in
+            self.df and a 3D point.
+
+        Parameters
+        ----------
+        xyz : tuple (0.00, 0.00, 0.00)
+            X, Y, and Z coordinate of the reference center for the distance
+            computation
+
+        Returns
+        ---------
+        pandas.Series : Pandas Series object containing the Euclidean
+            distance between the atoms in the atom section and `xyz`.
+
+        """
+        return np.sqrt(np.sum(self.df[['x', 'y', 'z']]
+                       .subtract(xyz, axis=1)**2, axis=1))
+
+    @staticmethod
+    def distance_df(df, xyz=(0.00, 0.00, 0.00)):
         """Computes Euclidean distance between atoms and a 3D point.
 
         Parameters
         ----------
-        df : DataFrame, default: None
-            If a DataFrame is provided as an argument, uses this DataFrame
-            for the distance computation instead of `self.df`.
+        df : DataFrame
+            DataFrame containing entries similar to the PandasMol2.df
+            format for
+            the distance computation to the `xyz` reference coordinates.
         xyz : tuple (0.00, 0.00, 0.00)
             X, Y, and Z coordinate of the reference center for the distance
             computation
@@ -245,10 +266,6 @@ def distance(self, df=None, xyz=(0.00, 0.00, 0.00)):
             distance between the atoms in the atom section and `xyz`.
 
         """
-        if df is None:
-            use_df = self.df
-        else:
-            use_df = df
 
-        return np.sqrt(np.sum(use_df[['x', 'y', 'z']]
+        return np.sqrt(np.sum(df[['x', 'y', 'z']]
                        .subtract(xyz, axis=1)**2, axis=1))
diff --git a/biopandas/mol2/tests/test_pandas_mol2.py b/biopandas/mol2/tests/test_pandas_mol2.py
@@ -64,7 +64,7 @@ def test_distance_external_df():
 
     pdmol = PandasMol2().read_mol2(data_path)
     new_df = pdmol.df.iloc[1:, :].copy()
-    assert round(pdmol.distance(df=new_df).values[0], 3) == 31.165
+    assert round(PandasMol2.distance_df(df=new_df).values[0], 3) == 31.165
 
 
 def test_overwrite_df():

diff --git a/biopandas/pdb/pandas_pdb.py b/biopandas/pdb/pandas_pdb.py
@@ -412,32 +412,62 @@ def amino3to1(self, record='ATOM',
 
         return pd.concat((tmp.iloc[indices]['chain_id'], transl), axis=1)
 
-    def distance(self, df=None, xyz=(0.00, 0.00, 0.00), record='ATOM'):
+    def distance(self, xyz=(0.00, 0.00, 0.00), records=('ATOM', 'HETATM')):
         """Computes Euclidean distance between atoms and a 3D point.
 
         Parameters
         ----------
-        df : DataFrame, default: None
-            If a DataFrame is provided as an argument, uses this DataFrame
-            for the distance computation instead of `self.df[record]`.
         xyz : tuple, default: (0.00, 0.00, 0.00)
             X, Y, and Z coordinate of the reference center for the distance
             computation.
-        record : str, default: 'ATOM'
-            Specfies the record DataFrame. Only used if `df=None`.
+        records : iterable, default: ('ATOM', 'HETATM')
+            Specify which record sections to consider. For example, to consider
+            both protein and ligand atoms, set `records=('ATOM', 'HETATM')`.
+            To use an external DataFrame instead, see `distance_df`.
+            For downward compatibility, a string argument is still supported
+            but deprecated and will be removed in future versions.
 
         Returns
         ---------
         pandas.Series : Pandas Series object containing the Euclidean
             distance between the atoms in the record section and `xyz`.
 
         """
-        if df is None:
-            use_df = self.df[record]
-        else:
-            use_df = df
 
-        return np.sqrt(np.sum(use_df[[
+        if isinstance(records, str):
+            warnings.warn('Using a string as `records` argument is '
+                          'deprecated and will not be supported in future'
+                          ' versions. Please use a tuple or'
+                          ' other iterable instead', DeprecationWarning)
+            records = (records,)
+
+        df = pd.concat(objs=[self.df[i] for i in records])
+
+        return np.sqrt(np.sum(df[[
+            'x_coord', 'y_coord', 'z_coord']]
+            .subtract(xyz, axis=1)**2, axis=1))
+
+    @staticmethod
+    def distance_df(df, xyz=(0.00, 0.00, 0.00)):
+        """Computes Euclidean distance between atoms and a 3D point.
+
+        Parameters
+        ----------
+        df : DataFrame
+            DataFrame containing entries in the `PandasPdb.df['ATOM']`
+            or `PandasPdb.df['HETATM']` format for
+            the distance computation to the `xyz` reference coordinates.
+        xyz : tuple, default: (0.00, 0.00, 0.00)
+            X, Y, and Z coordinate of the reference center for the distance
+            computation.
+
+        Returns
+        ---------
+        pandas.Series : Pandas Series object containing the Euclidean
+            distance between the atoms in the record section and `xyz`.
+
+        """
+        return np.sqrt(np.sum(df[[
             'x_coord', 'y_coord', 'z_coord']]
             .subtract(xyz, axis=1)**2, axis=1))
 

diff --git a/biopandas/pdb/tests/test_distance.py b/biopandas/pdb/tests/test_distance.py
@@ -7,7 +7,6 @@
 import pandas as pd
 from biopandas.pdb import PandasPdb
 import os
-from nose.tools import raises
 
 
 def test_equal():
@@ -16,7 +15,20 @@ def test_equal():
 
     p1t48 = PandasPdb()
     p1t48.read_pdb(TESTDATA_1t48)
-    dist = p1t48.distance(xyz=(70.785, 15.477, 23.359), record='ATOM')
+    dist = p1t48.distance(xyz=(70.785, 15.477, 23.359), records=('ATOM',))
+
+    expect = pd.Series([2.533259, 1.520502, 0.000000, 1.257597, 1.252510],
+                       index=[12, 13, 14, 15, 16])
+    assert dist[dist < 3].all() == expect.all()
+
+
+def test_deprecated_str_arg():
+    TESTDATA_1t48 = os.path.join(os.path.dirname(__file__), 'data',
+                                                            '1t48_995.pdb')
+
+    p1t48 = PandasPdb()
+    p1t48.read_pdb(TESTDATA_1t48)
+    dist = p1t48.distance(xyz=(70.785, 15.477, 23.359), records='ATOM')
 
     expect = pd.Series([2.533259, 1.520502, 0.000000, 1.257597, 1.252510],
                        index=[12, 13, 14, 15, 16])
@@ -30,7 +42,7 @@ def test_use_external_df():
     p1t48 = PandasPdb()
     p1t48.read_pdb(TESTDATA_1t48)
     new_df = p1t48.df['ATOM'].iloc[:-1, :].copy()
-    dist = p1t48.distance(df=new_df, xyz=(70.785, 15.477, 23.359))
+    dist = PandasPdb.distance_df(df=new_df, xyz=(70.785, 15.477, 23.359))
 
     expect = pd.Series([2.533259, 1.520502, 0.000000, 1.257597],
                        index=[12, 13, 14, 15])

diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md
@@ -16,7 +16,9 @@ The CHANGELOG for the current development version is available at
 
 ##### Changes
 
-- `PandasPdb.distance` and `PandasMol2.distance` now accept external `DataFrames` to allow for more efficient distance computations on smaller `DataFrames` if desired. 
+- `PandasMol2.distance_df` was added as a static method that allows distance computations based on external data frames with its behavior otherwise similar to `PandasMol2.distance`.
+- `PandasPdb.distance_df` was added as a static method that allows distance computations based on external data frames with its behavior otherwise similar to `PandasPdb.distance`.
+- `PandasPdb.distance` now supports multiple record sections to be considered (e.g., `records=('ATOM', 'HETATM')` to include both protein and ligand in a query). Now also defaults to `records=('ATOM', 'HETATM')` for consistency with the impute method.
 - `PandasPdb.get(...)` now supports external data frames and lets the user specify the record section to be considered (e.g., `records=('ATOM', 'HETATM')` to include both protein and ligand in a query). Now also defaults to `records=('ATOM', 'HETATM')` for consistency with the impute method.
 - The `section` parameter of `PandasPdb.impute_element(...)` was renamed to `records` for API consistency.
 

diff --git a/docs/sources/api_subpackages/biopandas.mol2.md b/docs/sources/api_subpackages/biopandas.mol2.md
@@ -30,16 +30,37 @@ Object for working with Tripos Mol2 structure files.
 
 <hr>
 
-*distance(df=None, xyz=(0.0, 0.0, 0.0))*
+*distance(xyz=(0.0, 0.0, 0.0))*
+
+Computes Euclidean distance between atoms in
+    self.df and a 3D point.
+
+**Parameters**
+
+- `xyz` : tuple (0.00, 0.00, 0.00)
+
+    X, Y, and Z coordinate of the reference center for the distance
+    computation
+
+**Returns**
+
+- `pandas.Series` : Pandas Series object containing the Euclidean
+
+    distance between the atoms in the atom section and `xyz`.
+
+<hr>
+
+*distance_df(df, xyz=(0.0, 0.0, 0.0))*
 
 Computes Euclidean distance between atoms and a 3D point.
 
 **Parameters**
 
-- `df` : DataFrame, default: None
+- `df` : DataFrame
 
-    If a DataFrame is provided as an argument, uses this DataFrame
-    for the distance computation instead of `self.df`.
+    DataFrame containing entries similar to the PandasMol2.df
+    format for
+    the distance computation to the `xyz` reference coordinates.
 
 - `xyz` : tuple (0.00, 0.00, 0.00)
 

diff --git a/docs/sources/api_subpackages/biopandas.pdb.md b/docs/sources/api_subpackages/biopandas.pdb.md
@@ -77,25 +77,49 @@ Creates 1-letter amino acid codes from DataFrame
 
 <hr>
 
-*distance(df=None, xyz=(0.0, 0.0, 0.0), record='ATOM')*
+*distance(xyz=(0.0, 0.0, 0.0), records=('ATOM', 'HETATM'))*
 
 Computes Euclidean distance between atoms and a 3D point.
 
 **Parameters**
 
-- `df` : DataFrame, default: None
-
-    If a DataFrame is provided as an argument, uses this DataFrame
-    for the distance computation instead of `self.df[record]`.
-
 - `xyz` : tuple, default: (0.00, 0.00, 0.00)
 
     X, Y, and Z coordinate of the reference center for the distance
     computation.
 
-- `record` : str, default: 'ATOM'
+- `records` : iterable, default: ('ATOM', 'HETATM')
 
-    Specfies the record DataFrame. Only used if `df=None`.
+    Specify which record sections to consider. For example, to consider
+    both protein and ligand atoms, set `records=('ATOM', 'HETATM')`.
+    To use an external DataFrame instead, see `distance_df`.
+    For downward compatibility, a string argument is still supported
+    but deprecated and will be removed in future versions.
+
+**Returns**
+
+- `pandas.Series` : Pandas Series object containing the Euclidean
+
+    distance between the atoms in the record section and `xyz`.
+
+<hr>
+
+*distance_df(df, xyz=(0.0, 0.0, 0.0))*
+
+Computes Euclidean distance between atoms and a 3D point.
+
+**Parameters**
+
+- `df` : DataFrame
+
+    DataFrame containing entries in the `PandasPdb.df['ATOM']`
+    or `PandasPdb.df['HETATM']` format for
+    the distance computation to the `xyz` reference coordinates.
+
+- `xyz` : tuple, default: (0.00, 0.00, 0.00)
+
+    X, Y, and Z coordinate of the reference center for the distance
+    computation.
 
 **Returns**