Merge pull request #196 from CancerDataAggregator/ah_revert_3

revert the to_list and to_dataframe changes
CancerDataAggregator · Apr 18, 2023 · f1229da · f1229da
2 parents 5d61dd5 + 1198bfb
commit f1229da
Show file tree

Hide file tree

Showing 9 changed files with 112 additions and 117 deletions.
diff --git a/cdapython/results/base.py b/cdapython/results/base.py
@@ -63,7 +63,6 @@ def to_dataframe(
         max_level: Union[int, None] = None,
         search_fields: Union[List[str], str, None] = None,
         search_value: str = "",
-        allow_substring: bool = True,
     ) -> DataFrame:
         """[summary]
         Creates a pandas DataFrame for the Results

diff --git a/cdapython/results/columns_result.py b/cdapython/results/columns_result.py
@@ -1,10 +1,29 @@
 from typing import Any, Dict, List, Optional, Union
 
-from pandas import DataFrame, Index, json_normalize, concat
+from pandas import DataFrame, Index, json_normalize, merge
+from typing_extensions import Literal, TypedDict
 
 from cdapython.results.base import BaseResult
 
 
+class _Column_Types(TypedDict):
+    """
+    Typed dict describing the keys of one column record, for type checking.
+
+    Keys (all ``str``): fieldName, endpoint, description, mode.
+    """
+
+    fieldName: str
+    endpoint: str
+    description: str
+    mode: str
+
+
+_Column_str = Union[
+    Literal["fieldName"], Literal["endpoint"], Literal["description"], Literal["mode"]
+]
+
+
 class ColumnsResult(BaseResult):
     def __init__(
         self,
@@ -33,29 +52,65 @@ def __str__(self) -> str:
         return self._repr_value(show_value=self.show_sql)
 
     def to_list(
-        self,
-        search_fields: Union[str, List[str], None] = None,
-        search_value: Union[str, None] = None,
-        allow_substring: bool = True,
+        self, filters: Optional[str] = None, exact: bool = False, endpoint: str = ""
     ) -> List[Any]:
-        """_summary_
-
-        Args:
-            allow_substring (bool, optional): Whether the seach_value should match if it is only part of a word. Defaults to True.
-            search_fields (Union[str, List[str], None]): _description_. Defaults to None.
-            search_value (Optional[str], optional): _description_. Defaults to None.
-
-        Returns:
-            List[Any]: _description_
-        """
-        result = self.to_dataframe(
-            search_fields=search_fields,
-            search_value=search_value,
-            allow_substring=allow_substring,
-        )
+        """Return column names or column records, optionally filtered.
+
+        Matching is case-insensitive. When ``self.description`` is False the
+        items are ``fieldName`` strings and the text is matched against them;
+        when it is truthy the items are the full records and a record matches
+        if its ``description``, ``endpoint`` or ``fieldName`` matches.
+
+        Args:
+            filters (Optional[str], optional): text to search for; newlines
+                become spaces and surrounding whitespace is stripped. None or
+                "" returns everything unfiltered. Defaults to None.
+            exact (bool, optional): True requires a value to equal the text,
+                False only requires the value to contain it. Defaults to False.
+            endpoint (str, optional): not used by this method. Defaults to "".
+
+        Returns:
+            List[Any]: the ``fieldName`` values, or the full records from
+                ``self._result`` when ``self.description`` is truthy.
+
+        Example:
+            columns().to_list(filters="diagnosis")
+            columns().to_list(filters="primary_diagnosis_site", exact=True)
+        """
+        if filters is not None and filters != "":
+            needle = filters.replace("\n", " ").strip().lower()
+
+            def matches(text: Any) -> bool:
+                # Equality for exact searches, substring containment otherwise.
+                lowered = str(text).lower()
+                return lowered == needle if exact else lowered.find(needle) != -1
+
+            if self.description is False:
+                # The items are plain fieldName strings here, so the text is
+                # matched against the string itself; indexing such a string
+                # with ["fieldName"] would raise TypeError.
+                return [
+                    i["fieldName"]
+                    for i in self._result
+                    if i["fieldName"] is not None and matches(i["fieldName"])
+                ]
+
+            if not self.description:
+                # Neither False nor truthy (e.g. None): nothing is searched.
+                return []
+
+            records: List[_Column_Types] = list(self._result)
+            return [
+                i
+                for i in records
+                if matches(i["description"])
+                or matches(i["endpoint"])
+                or matches(i["fieldName"])
+            ]
         if self.description is False:
-            return result["fieldName"].values.tolist()
-        return list(result.to_dict("records"))
+            return [i["fieldName"] for i in self._result]
+
+        return list(self._result)
 
     def to_dataframe(
         self,
@@ -64,8 +119,7 @@ def to_dataframe(
         meta_prefix: Union[str, None] = None,
         max_level: Union[int, None] = None,
         search_fields: Union[List[str], str, None] = None,
-        search_value: Optional[str] = None,
-        allow_substring: bool = True,
+        search_value: str = "",
     ) -> DataFrame:
         """[summary]
         Creates a pandas DataFrame for the Results
@@ -82,36 +136,16 @@ def to_dataframe(
             value = DataFrame(columns=column_names, index=Index([], dtype="int"))
             if isinstance(search_fields, str):
                 search_fields = [search_fields]
-            if allow_substring:
-                for i in search_fields:
-                    value = (
-                        concat(
-                            [
-                                value,
-                                data_frame[
-                                    data_frame[i].str.contains(
-                                        search_value, case=False, na=False
-                                    )
-                                ],
-                            ]
-                        )
-                        .drop_duplicates()
-                        .reset_index(drop=True)
-                    )
-            else:
-                for i in search_fields:
-                    value = (
-                        concat(
-                            [
-                                value,
-                                data_frame[
-                                    data_frame[i].str.lower() == search_value.lower()
-                                ],
-                            ]
-                        )
-                        .drop_duplicates()
-                        .reset_index(drop=True)
-                    )
+            for i in search_fields:
+                value = merge(
+                    value,
+                    data_frame[
+                        data_frame[i].str.contains(search_value, case=False, na=False)
+                    ],
+                    how="right",
+                    right_on=column_names,
+                    left_on=column_names,
+                )
             return value
         if self.format_type == "tsv":
             return self._df

diff --git a/cdapython/results/string_result.py b/cdapython/results/string_result.py
@@ -2,7 +2,7 @@
 This class inheritance from the result class it is made for unique terms function
 in the utility class,just to add a different to to_list
 """
-from typing import List, Optional, Any
+from typing import List, Optional
 
 from cda_client.api.query_api import QueryApi
 from cda_client.model.query_response_data import QueryResponseData
@@ -38,43 +38,35 @@ def __init__(
             format_type,
         )
 
-    def to_list(
-        self,
-        search_value: Optional[str] = None,
-        allow_substring: bool = True,
-    ) -> List[Any]:
+    def to_list(self, filters: Optional[str] = None, exact: bool = False) -> list:
         """_summary_
         this overloads the base Result to_list function
         Args:
-            allow_substring (bool, optional): Whether the seach_value should match if it is only part of a word. Defaults to True.
-            search_fields (Union[str, List[str], None]): _description_. Defaults to None.
-            search_value (Optional[str], optional): _description_. Defaults to None.
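+            filters (Optional[str], optional): text to match against each value, case-insensitively; None or "" returns all values. Defaults to None.
+            exact (bool, optional): if True keep only values equal to filters, otherwise keep values that contain it. Defaults to False.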
 
         Returns:
-            List[Any]: _description_
+            list: the first value of every result row, narrowed to the non-None matches when filters is given
         """
-        if search_value is not None:
-            values: list["StringResult"] = [
+        if filters is not None and filters != "":
+            filters = filters.replace("\n", " ").strip()
+            values: List["StringResult"] = [
                 list(i.values())[0]
                 for i in self._api_response.result
                 if list(i.values())[0] is not None
             ]
-
-            if allow_substring:
-                # concatenate all search values
+            # values = list(filter(None, values))
+            if exact:
                 return list(
                     filter(
-                        lambda term: (
-                            str(term).lower().find(str(search_value.lower())) != -1
-                        ),
+                        lambda items: (str(items).lower() == filters.lower()),
                         values,
                     )
                 )
-            else:
-                return list(
-                    filter(
-                        lambda term: (term.lower() in search_value.lower()),
-                        values,
-                    )
+            return list(
+                filter(
+                    lambda items: (str(items).lower().find(str(filters.lower())) != -1),
+                    values,
                 )
+            )
         return [list(i.values())[0] for i in self._api_response.result]
diff --git a/notebooks/BuildingACohort.ipynb b/notebooks/BuildingACohort.ipynb
@@ -1014,7 +1014,7 @@
     }
    ],
    "source": [
-    "columns().to_list(search_value=\"diagnosis\")"
+    "columns().to_list(filters=\"diagnosis\")"
    ]
   },
   {
@@ -1117,7 +1117,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "unique_terms(\"ResearchSubject.primary_diagnosis_site\").to_list(search_value=\"uter\")"
+    "unique_terms(\"ResearchSubject.primary_diagnosis_site\").to_list(filters=\"uter\")"
    ]
   },
   {
@@ -1138,7 +1138,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "unique_terms(\"ResearchSubject.primary_diagnosis_site\").to_list(search_value=\"cerv\")"
+    "unique_terms(\"ResearchSubject.primary_diagnosis_site\").to_list(filters=\"cerv\")"
    ]
   },
   {

diff --git a/tests/colmdata.py b/tests/colmdata.py
@@ -3,23 +3,11 @@
 print(
     columns(
         host="https://cancerdata.dsde-dev.broadinstitute.org/", description=False
-    ).to_list(search_fields=["fieldName"], search_value="ma")
+    ).to_list(filters="ma")
 )
 
 print(
     columns(
         host="https://cancerdata.dsde-dev.broadinstitute.org/", description=True
-    ).to_list(
-        search_fields=["fieldName", "endpoint"],
-        search_value="tumor",
-        allow_substring=True,
-    )
-)
-
-print(
-    columns(
-        host="https://cancerdata.dsde-dev.broadinstitute.org/", description=True
-    ).to_dataframe(
-        search_fields=["description"], search_value="tumor", allow_substring=False
-    )
+    ).to_list(filters="ma")
 )
diff --git a/tests/dataframe_search.ipynb b/tests/dataframe_search.ipynb
@@ -67,7 +67,7 @@
     "# df = columns().to_dataframe()\n",
     "# df[df[col].str.contains(val, case=False, na=False)]\n",
     "\"\"\" to search with to_dataframe function you need the column name and the value use : to search like this Description:GDC \"\"\"\n",
-    "columns().to_list(search_value=\"AF\")       "
+    "columns().to_list(filters=\"AF\")       "
    ]
   },
   {

diff --git a/tests/rich_file_id_problim.py b/tests/rich_file_id_problim.py
@@ -6,7 +6,7 @@
 def test():
     columns().to_list()
 
-    columns().to_list(search_value="primary_diagnosis_site")
-    unique_terms("ResearchSubject.primary_diagnosis_site").to_list(search_value="ov")
+    columns().to_list(filters="primary_diagnosis_site")
+    unique_terms("ResearchSubject.primary_diagnosis_site").to_list(filters="ov")
 
     r = Q("ResearchSubject.primary_diagnosis_site = 'Ovary'").file.run(host=host)
diff --git a/tests/test_C_integration.py b/tests/test_C_integration.py
@@ -14,20 +14,3 @@ def test_unique_terms() -> None:
         "sex", "GDC", host=integration_host, table=integration_table
     ).to_list()
     assert "female" in terms
-
-
-def test_unique_terms_search_partial() -> None:
-    terms = unique_terms(
-        "sex", "GDC", host=integration_host, table=integration_table
-    ).to_list(search_value="male")
-    assert "female" in terms
-    assert "male" in terms
-    assert "unknown" not in terms
-
-
-def test_unique_terms_search() -> None:
-    terms = unique_terms(
-        "sex", "GDC", host=integration_host, table=integration_table
-    ).to_list(search_value="male", allow_substring=False)
-    assert "male" in terms
-    assert "female" not in terms
diff --git a/tests/test_unique_terms_check.py b/tests/test_unique_terms_check.py
@@ -11,7 +11,6 @@ def test_unique_terms_convert() -> None:
         show_counts=True,
         async_req=True,
     ).to_dataframe()
-    print(d)
 
 
 test_unique_terms_convert()