Merge pull request #219 from h-croser/concordance-performance
Concordance performance improvements
h-croser committed Sep 15, 2023
2 parents dcdac7c + 140f576 commit 1ab290b
Showing 2 changed files with 16 additions and 11 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "atap_widgets"
-version = "0.5.0"
+version = "0.5.1"
 description = "Interactive widgets used by the Australian Text Analytics Platform"
 authors = ["Marius Mather <marius.mather@sydney.edu.au>"]
 license = "MIT"
25 changes: 15 additions & 10 deletions src/atap_widgets/concordance.py
Expand Up @@ -10,6 +10,7 @@
import pandas as pd
import spacy
from IPython.display import display
from spacy.lang.en import English
from textacy.extract import keyword_in_context
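Note: the sketch below is not part of the diff; it illustrates, under assumed usage, why the new English() default is cheaper than the previous "en_core_web_sm" default when only keyword-in-context matching is needed. spacy.lang.en.English is a blank, tokenizer-only pipeline, while "en_core_web_sm" is a full pretrained pipeline (tagger, parser, NER) that has to be downloaded and run over every text.

    from spacy.lang.en import English

    # Blank English pipeline: rule-based tokenizer only, no model download,
    # no statistical components to run on each text.
    nlp = English()

    # The previous default loaded a full pretrained pipeline instead:
    # import spacy; nlp = spacy.load("en_core_web_sm")

    doc = nlp("Keyword-in-context search only needs token boundaries.")
    print([token.text for token in doc])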


@@ -108,7 +109,7 @@ def prepare_text_df(
     df: pd.DataFrame,
     text_column: str = "text",
     id_column: str = None,
-    language_model: Union[str, spacy.language.Language] = "en_core_web_sm",
+    language_model: Union[str, spacy.language.Language] = English(),
 ) -> pd.DataFrame:
     """
     Our text processing functions expect a dataframe with
@@ -127,18 +128,20 @@
         id_column: The current column name of the unique identifier for each text
             in df. If not given, numeric IDs will be generated for each text.
         language_model: The name of a spacy model like "en_core_web_sm", or a
-            spacy language model instance.
+            spacy language model instance. Defaults to English()
     """
     output = df.copy()
     if id_column is None:
-        output["text_id"] = pd.Series(range(output.shape[0]), dtype=pd.Int64Dtype)
+        output["text_id"] = pd.Series(range(output.shape[0]), dtype=pd.Int64Dtype())
         id_column = "text_id"
     output = output.rename(columns={text_column: "text", id_column: "text_id"})
     output = output.set_index("text_id", drop=False)

     if isinstance(language_model, str):
         language_model = spacy.load(language_model)
-    output["spacy_doc"] = output["text"].map(language_model)  # Doc for each line
+    output["spacy_doc"] = [
+        d for d in language_model.pipe(output["text"])
+    ]  # Doc for each line

     return output
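Note: the sketch below is not part of the diff; it shows the batching pattern the hunk above adopts, using made-up example texts. Language.pipe streams the whole text column through the spaCy pipeline in one batched pass instead of invoking the pipeline once per row; the hunk also instantiates pd.Int64Dtype() rather than passing the class to pandas.

    import pandas as pd
    from spacy.lang.en import English

    nlp = English()
    texts = pd.Series(["first example text", "second example text", "third example text"])

    # Old pattern: one pipeline call per row.
    docs_per_row = texts.map(nlp)

    # New pattern: a single batched pass over the whole column.
    docs_batched = list(nlp.pipe(texts))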

@@ -275,9 +278,7 @@ def group_by_chunk(self, df):
         grouped = df.groupby(["chunk"])["text"].apply("".join).reset_index()
         return grouped

-    def show(
-        self, language_model: Union[str, spacy.language.Language] = "en_core_web_sm"
-    ):
+    def show(self, language_model: Union[str, spacy.language.Language] = English()):
         prepared_df = prepare_text_df(self.data, language_model=language_model)
         widget = ConcordanceLoaderWidget(
             prepared_df,
@@ -340,7 +341,9 @@ def display_results(page: int, **kwargs):
 
             display(ipywidgets.HTML(html))
 
-        keyword_input = ipywidgets.Text(description="Keyword(s):")
+        keyword_input = ipywidgets.Text(
+            description="Keyword(s):", continuous_update=False
+        )
         regex_toggle_input = ipywidgets.Checkbox(
             value=False,
             description="Enable regular expressions",
@@ -516,7 +519,7 @@ def __init__(
         stylingOn: bool = False,
         additional_info: str = None,
         tag_lines: bool = False,
-        language_model: str = "en_core_web_sm",
+        language_model: str = English(),
         sort: str = "text_id",
     ):
         self.df = df
@@ -948,7 +951,9 @@ def display_results(page: int, **kwargs):
             display(ipywidgets.HTML(html))
             return html
 
-        keyword_input = ipywidgets.Text(description="Keyword(s):")
+        keyword_input = ipywidgets.Text(
+            description="Keyword(s):", continuous_update=False
+        )
         regex_toggle_input = ipywidgets.Checkbox(
             value=False,
             description="Enable regular expressions",
