Merge pull request #219 from h-croser/concordance-performance
Concordance performance improvements
h-croser committed Sep 15, 2023
2 parents dcdac7c + 140f576 commit 1ab290b
Showing 2 changed files with 16 additions and 11 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "atap_widgets"
-version = "0.5.0"
+version = "0.5.1"
 description = "Interactive widgets used by the Australian Text Analytics Platform"
 authors = ["Marius Mather <marius.mather@sydney.edu.au>"]
 license = "MIT"
25 changes: 15 additions & 10 deletions src/atap_widgets/concordance.py
Expand Up @@ -10,6 +10,7 @@
import pandas as pd
import spacy
from IPython.display import display
from spacy.lang.en import English
from textacy.extract import keyword_in_context
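Note: the sketch below is not part of the diff; it illustrates, under assumed usage, why the new English() default is cheaper than the previous "en_core_web_sm" default when only keyword-in-context matching is needed. spacy.lang.en.English is a blank, tokenizer-only pipeline, while "en_core_web_sm" is a full pretrained pipeline (tagger, parser, NER) that has to be downloaded and run over every text.

    from spacy.lang.en import English

    # Blank English pipeline: rule-based tokenizer only, no model download,
    # no statistical components to run on each text.
    nlp = English()

    # The previous default loaded a full pretrained pipeline instead:
    # import spacy; nlp = spacy.load("en_core_web_sm")

    doc = nlp("Keyword-in-context search only needs token boundaries.")
    print([token.text for token in doc])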


@@ -108,7 +109,7 @@ def prepare_text_df(
     df: pd.DataFrame,
     text_column: str = "text",
     id_column: str = None,
-    language_model: Union[str, spacy.language.Language] = "en_core_web_sm",
+    language_model: Union[str, spacy.language.Language] = English(),
 ) -> pd.DataFrame:
     """
     Our text processing functions expect a dataframe with
@@ -127,18 +128,20 @@
         id_column: The current column name of the unique identifier for each text
             in df. If not given, numeric IDs will be generated for each text.
         language_model: The name of a spacy model like "en_core_web_sm", or a
-            spacy language model instance.
+            spacy language model instance. Defaults to English()
     """
     output = df.copy()
     if id_column is None:
-        output["text_id"] = pd.Series(range(output.shape[0]), dtype=pd.Int64Dtype)
+        output["text_id"] = pd.Series(range(output.shape[0]), dtype=pd.Int64Dtype())
         id_column = "text_id"
     output = output.rename(columns={text_column: "text", id_column: "text_id"})
     output = output.set_index("text_id", drop=False)

     if isinstance(language_model, str):
         language_model = spacy.load(language_model)
-    output["spacy_doc"] = output["text"].map(language_model)  # Doc for each line
+    output["spacy_doc"] = [
+        d for d in language_model.pipe(output["text"])
+    ]  # Doc for each line

     return output
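Note: the sketch below is not part of the diff; it shows the batching pattern the hunk above adopts, using made-up example texts. Language.pipe streams the whole text column through the spaCy pipeline in one batched pass instead of invoking the pipeline once per row; the hunk also instantiates pd.Int64Dtype() rather than passing the class to pandas.

    import pandas as pd
    from spacy.lang.en import English

    nlp = English()
    texts = pd.Series(["first example text", "second example text", "third example text"])

    # Old pattern: one pipeline call per row.
    docs_per_row = texts.map(nlp)

    # New pattern: a single batched pass over the whole column.
    docs_batched = list(nlp.pipe(texts))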

@@ -275,9 +278,7 @@ def group_by_chunk(self, df):
         grouped = df.groupby(["chunk"])["text"].apply("".join).reset_index()
         return grouped

-    def show(
-        self, language_model: Union[str, spacy.language.Language] = "en_core_web_sm"
-    ):
+    def show(self, language_model: Union[str, spacy.language.Language] = English()):
         prepared_df = prepare_text_df(self.data, language_model=language_model)
         widget = ConcordanceLoaderWidget(
             prepared_df,
@@ -340,7 +341,9 @@ def display_results(page: int, **kwargs):
 
             display(ipywidgets.HTML(html))
 
-        keyword_input = ipywidgets.Text(description="Keyword(s):")
+        keyword_input = ipywidgets.Text(
+            description="Keyword(s):", continuous_update=False
+        )
         regex_toggle_input = ipywidgets.Checkbox(
             value=False,
             description="Enable regular expressions",
@@ -516,7 +519,7 @@ def __init__(
         stylingOn: bool = False,
         additional_info: str = None,
         tag_lines: bool = False,
-        language_model: str = "en_core_web_sm",
+        language_model: str = English(),
         sort: str = "text_id",
     ):
         self.df = df
@@ -948,7 +951,9 @@ def display_results(page: int, **kwargs):
             display(ipywidgets.HTML(html))
             return html
 
-        keyword_input = ipywidgets.Text(description="Keyword(s):")
+        keyword_input = ipywidgets.Text(
+            description="Keyword(s):", continuous_update=False
+        )
         regex_toggle_input = ipywidgets.Checkbox(
             value=False,
             description="Enable regular expressions",
