# Text Search Pipeline for Latin Using Whoosh (Deprecated)

Query for documents or indexes of word occurrences inside documents.

## Initial Setup

Import widgets.

In [2]:
import ipywidgets as widgets
from IPython.display import display

Import other required modules.

In [6]:
import bs4, csv, json, multiprocessing as mp, numpy as np, os, pandas as pd, pickle, re, requests, tqdm
from inspect import signature
from dotenv import load_dotenv
from pprint import pprint
from termcolor import colored, cprint

Use caching.

In [11]:
import ipycache
from IPython.utils.traitlets import Unicode
%load_ext ipycache
from IPython.core

ModuleNotFoundError: No module named 'IPython.config'

Configurations.

In [12]:
# print coloring options
line_color = "green"
text_color = "magenta"
reference_color = "white"

Load environment variables from .env file.

In [19]:
load_dotenv()
metadata_path = os.getenv(key="METADATA_PATH")
metadata_df_path = os.getenv(key="METADATA_DF_PATH")
full_texts_path = os.getenv(key="FULL_TEXTS_PATH")

cprint(text="-" * 100, color=line_color)
cprint(text=f"Metadata path: {metadata_path}", color=text_color)
cprint(text=f"Metadata dataframe path: {metadata_df_path}", color=text_color)
cprint(text=f"Full texts path: {full_texts_path}", color=text_color)

[32m----------------------------------------------------------------------------------------------------[0m
[35mMetadata path: /mnt/d/share/AI-in-Classics/src/metadata.csv[0m
[35mMetadata dataframe path: /mnt/d/share/AI-in-Classics/src/metadata_df.csv[0m
[35mFull texts path: /mnt/d/share/AI-in-Classics/src/pipelines/data_collection/full_texts/[0m


## Load Metadata

Load metadata for First1KGreek project.

In [23]:
metadata_df = None

try:
    metadata_df = pickle.load(file=open(file="metadata_df.pkl", mode="rb"))[
        "metadata_df"
    ]

except:
    metadata_csv_reader = csv.reader(
        open(file=metadata_path, mode="r", encoding="utf-8")
    )
    columns = next(metadata_csv_reader)
    metadata_df = pd.DataFrame(
        data=np.asarray(a=list(metadata_csv_reader)), columns=columns
    )

cprint(text="-" * 100, color=line_color)
cprint(text="metadata_df:", color=text_color)
cprint(text="-" * 100, color=line_color)

[32m----------------------------------------------------------------------------------------------------[0m
[35mmetadata_df:[0m
[32m----------------------------------------------------------------------------------------------------[0m


In [24]:
metadata_df

Unnamed: 0,Unnamed: 1,Workgroup,Work,Language,Words,URN,URL
0,0,Anonymous,Anametresis Pontou,"grc,lat,deu",374,urn:cts:greekLit:ggm0001.ggm001.1st1K-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
1,1,,Isaias,"lat,eng",36226,urn:cts:hebrewlit:heb0001.heb010.1st1K-eng1,https://scaife.perseus.org/reader/urn:cts:hebr...
2,2,Pinytus,De Epistola Pinyti ad Dionysium,"grc,lat",162,urn:cts:greekLit:ogl0001.ogl001.1st1K-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
3,3,pseudo-Aristotle,De mundo,grc,6446,urn:cts:greekLit:stoa0033a.tlg028.1st1K-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
4,4,pseudo-Aristotle,De spiritu,grc,3460,urn:cts:greekLit:stoa0033a.tlg043.1st1K-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
...,...,...,...,...,...,...,...
974,974,Scholia in Sophoclem,Scholia in Sophoclem (scholia vetera),"grc,lat",89341,urn:cts:greekLit:tlg5037.tlg004.1st1K-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
975,975,Anonymi In Aristotelis Librum Alterum Analytic...,Anonymi in analyticorum posteriorum librum alt...,"grc,lat",25619,urn:cts:greekLit:tlg9004.tlg001.opp-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
976,976,,Libanius Opera,lat,7534,urn:cts:greekLit:tlg9006.tlg011.opp-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
977,977,Suda,Suidae lexicon,"grc,lat",821723,urn:cts:greekLit:tlg9010.tlg001.1st1K-grc1,https://scaife.perseus.org/reader/urn:cts:gree...


## Import Data

Import Latin text.

### *From TXT File*

In [25]:
def load_txt(filename: str):
    """
    Extract text from a .txt file.

    Parameters:
        filename (str): Path of file to load.

    Returns:
        str: Text loaded from file.
    """
    if not os.path.exists(path=filename):
        raise ValueError(f"The path {filename} does not exist.")

    text = open(file=filename, mode="r").read()
    return text

In [28]:
# Example
filename = "/mnt/c/share/AI-in-Classics/src/sample_text/latin/urn_cts_greekLit_stoa0146d.stoa001.opp-lat11.txt"
text = load_txt(filename=filename)
cprint(text="-" * 100, color=line_color)
cprint(text="Loading from TXT file:", color=text_color)
cprint(text=filename, color=reference_color)
cprint(text="-" * 100, color=line_color)
cprint(text=text, color=text_color)

[32m----------------------------------------------------------------------------------------------------[0m
[35mLoading from TXT file:[0m
[37m/mnt/c/share/AI-in-Classics/src/sample_text/latin/urn_cts_greekLit_stoa0146d.stoa001.opp-lat11.txt[0m
[32m----------------------------------------------------------------------------------------------------[0m
[35mACTA ARCHELAI. Thesaurus verus sive disputatio habita in Carcharis eiTitate Mesopotamiae
Archelai episeopi adversus Manen, Judicantibus Manippo et
Aegialeo et Clandio et Cleobolo. In qua urbe erat quidani vir Marcellus 
 nomine qui vita et studiis et genere, prudentia quoque et honestate
valde clanis habebatnr; faeultatibus etiam copiosus et quoii
omnium maximiim est, religiosiasime deum timens, et his quae de
Christo dicebantur semper cum timore auscultans, nec quicquam omnino
honi erat quod illi viro deesset; unde et honore plurimo ab nniversa 
 eivitate colebatnr plnrimisque ipse civitatem suam freqiienter largitionibus
remu

### *From URI*

In [32]:
def load_uri(uri: str):
    """
    Load text from URI.

    Parameters:
        uri (str): URI link to text online.

    Returns:
        str: Text loaded from URI.
    """
    req = requests.get(url=uri)
    return req.text

In [35]:
# Example
uri = "https://scaife.perseus.org/library/passage/urn:cts:greekLit:stoa0146d.stoa001.opp-lat1:1/text/"
text = load_uri(uri=uri)
cprint(text="-" * 100, color=line_color)
cprint(text="Loading from URI:", color=text_color)
cprint(text=uri, color=reference_color)
cprint(text="-" * 100, color=line_color)
cprint(text=text, color=text_color)

[32m----------------------------------------------------------------------------------------------------[0m
[35mLoading from URI:[0m
[37mhttps://scaife.perseus.org/library/passage/urn:cts:greekLit:stoa0146d.stoa001.opp-lat1:1/text/[0m
[32m----------------------------------------------------------------------------------------------------[0m
[35mACTA ARCHELAI. Thesaurus verus sive disputatio habita in Carcharis eiTitate Mesopotamiae
Archelai episeopi adversus Manen, Judicantibus Manippo et
Aegialeo et Clandio et Cleobolo. In qua urbe erat quidani vir Marcellus 
 nomine qui vita et studiis et genere, prudentia quoque et honestate
valde clanis habebatnr; faeultatibus etiam copiosus et quoii
omnium maximiim est, religiosiasime deum timens, et his quae de
Christo dicebantur semper cum timore auscultans, nec quicquam omnino
honi erat quod illi viro deesset; unde et honore plurimo ab nniversa 
 eivitate colebatnr plnrimisque ipse civitatem suam freqiienter largitionibus
remunerabatur

### *From URN*

In [36]:
def load_urn(urn: str):
    """
    Load text from URN.

    Parameters:
        urn (str): URN link that identifies specific work.

    Returns:
        str: Text loaded from URI.
    """
    idx = metadata_df.index[metadata_df["URN"] == urn][0]
    url = f'https://scaife.perseus.org/library/passage/{metadata_df.at[idx, "URL"].split("/")[-2]}/text/'
    req = requests.get(url=url)

    return req.text

In [37]:
# Example
urn = "urn:cts:greekLit:stoa0146d.stoa001.opp-lat1"
text = load_urn(urn=urn)
url = f'https://scaife.perseus.org/library/passage/{metadata_df.at[0, "URL"].split("/")[-2]}/text/'
cprint(text="-" * 100, color=line_color)
cprint(text="Loading from URN:", color=text_color)
cprint(text=urn, color=reference_color)
cprint(text=url, color=text_color)
cprint(text="-" * 100, color=line_color)
cprint(text=text, color=text_color)

[32m----------------------------------------------------------------------------------------------------[0m
[35mLoading from URN:[0m
[37murn:cts:greekLit:stoa0146d.stoa001.opp-lat1[0m
[35mhttps://scaife.perseus.org/library/passage/urn:cts:greekLit:ggm0001.ggm001.1st1K-grc1:1.1/text/[0m
[32m----------------------------------------------------------------------------------------------------[0m
[35mACTA ARCHELAI. Thesaurus verus sive disputatio habita in Carcharis eiTitate Mesopotamiae
Archelai episeopi adversus Manen, Judicantibus Manippo et
Aegialeo et Clandio et Cleobolo. In qua urbe erat quidani vir Marcellus 
 nomine qui vita et studiis et genere, prudentia quoque et honestate
valde clanis habebatnr; faeultatibus etiam copiosus et quoii
omnium maximiim est, religiosiasime deum timens, et his quae de
Christo dicebantur semper cum timore auscultans, nec quicquam omnino
honi erat quod illi viro deesset; unde et honore plurimo ab nniversa 
 eivitate colebatnr plnrimisque ipse 

Create analyzer for text content.

In [39]:
from whoosh.lang import has_stemmer, languages
from whoosh.analysis import SpaceSeparatedTokenizer, LowercaseFilter

In [40]:
# check for Latin availability for analyzer
cprint(text="-" * 100, color=line_color)
cprint(text="Available languages for LanguageAnalyzer:", color=text_color)
cprint(text="-" * 100, color=line_color)
pprint(languages)

cprint(text="-" * 100, color=line_color)
cprint(text=f'Latin is available? {has_stemmer(lang="lat")}', color=text_color)

[32m----------------------------------------------------------------------------------------------------[0m
[35mAvailable languages for LanguageAnalyzer:[0m
[32m----------------------------------------------------------------------------------------------------[0m
('ar',
 'da',
 'nl',
 'en',
 'fi',
 'fr',
 'de',
 'hu',
 'it',
 'no',
 'pt',
 'ro',
 'ru',
 'es',
 'sv',
 'tr')
[32m----------------------------------------------------------------------------------------------------[0m
[35mLatin is available? False[0m


In [45]:
analyzer = SpaceSeparatedTokenizer() | LowercaseFilter()
for token in analyzer(value=text):
    pprint(token)

Token(positions=False, chars=False, stopped=False, boost=1.0, removestops=True, mode='', text='acta')
Token(positions=False, chars=False, stopped=False, boost=1.0, removestops=True, mode='', text='archelai.')
Token(positions=False, chars=False, stopped=False, boost=1.0, removestops=True, mode='', text='thesaurus')
Token(positions=False, chars=False, stopped=False, boost=1.0, removestops=True, mode='', text='verus')
Token(positions=False, chars=False, stopped=False, boost=1.0, removestops=True, mode='', text='sive')
Token(positions=False, chars=False, stopped=False, boost=1.0, removestops=True, mode='', text='disputatio')
Token(positions=False, chars=False, stopped=False, boost=1.0, removestops=True, mode='', text='habita')
Token(positions=False, chars=False, stopped=False, boost=1.0, removestops=True, mode='', text='in')
Token(positions=False, chars=False, stopped=False, boost=1.0, removestops=True, mode='', text='carcharis')
Token(positions=False, chars=False, stopped=False, boost=1.0

Create schema.

In [49]:
from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, ID, KEYWORD, NUMERIC, TEXT
from whoosh.formats import Positions
from whoosh.qparser import QueryParser

In [50]:
schema_upload = widgets.FileUpload(
    accept=".json, .pkl, .txt",
    multiple=False,
)

display(schema_upload)

FileUpload(value=(), accept='.json, .pkl, .txt', description='Upload')

In [51]:
schema_bytestr = list(schema_upload.value.values())[0]["content"]
schema_dict = json.loads(s=schema_bytestr)
for key, val in schema_dict.items():
    schema_dict[key] = eval(val)

cprint(text="-" * 100, color=line_color)
cprint(text="Dictionary form of uploaded schema:", color=text_color)
cprint(text="-" * 100, color=line_color)
pprint(schema_dict)

AttributeError: 'tuple' object has no attribute 'values'

In [None]:
Schema(**schema_dict)

NameError: name 'schema_dict' is not defined

In [None]:
schema = Schema(
    index=NUMERIC(
        bits=64, numtype="int", signed=False, sortable=True, stored=True, unique=True
    ),
    #     url = ID(
    #         sortable=False,
    #         stored=True,
    #         unique=True
    #     ),
    #     title=TEXT(
    #         chars=True,
    #         phrase=True,
    #         sortable=True,
    #         stored=True
    #     ),
    #     author=TEXT(
    #         chars=True,
    #         phrase=True,
    #         sortable=True,
    #         stored=True
    #     ),
    #     languages=KEYWORD(
    #         commas=True,
    #         lowercase=True,
    #         scorable=False,
    #         sortable=False,
    #         stored=True,
    #         unique=False
    #     ),
    #     keywords=KEYWORD(
    #         commas=True,
    #         lowercase=True,
    #         scorable=True,
    #         sortable=True,
    #         stored=True,
    #         unique=False
    #     ),
    content=TEXT(
        analyzer=analyzer,
        chars=True,
        phrase=True,
        sortable=False,
        stored=False,
    ),
)

Create or load index.

In [52]:
dirname_dropdown = widgets.Dropdown(
    options=np.asarray(
        a=list(filter(lambda x: os.path.isdir(s=x), os.listdir(path=".")))
    ),
    description="dirname",
    disabled=False,
)

display(dirname_dropdown)

Dropdown(description='dirname', options=(), value=None)

In [None]:
writelock_files = list(
    filter(
        lambda x: re.match(pattern=".+_WRITELOCK", string=x),
        list([x.decode() for x in os.listdir(path=dirname_dropdown.value)]),
    )
)

indexname_options = np.asarray(
    a=list(map(lambda x: x[: -len("_WRITELOCK")], writelock_files))
)

indexname_dropdown = widgets.Dropdown(
    options=indexname_options, description="indexname", disabled=False
)

display(indexname_dropdown)

Dropdown(description='indexname', options=(), value=None)

In [None]:
ix = None

try:
    ix = open_dir(
        dirname=str(dirname_dropdown.value), indexname=indexname_dropdown.value
    )

except Exception:
    ix = create_in(dirname="indexes/", schema=schema, indexname="index")

Index documents.

In [55]:
cprint(text="-" * 100, color=line_color)
cprint(text="Enter number of processors for indexing:", color=text_color)
cprint(text="-" * 100, color=line_color)
procs_select = widgets.IntSlider(
    value=64,
    min=1,
    max=128,
    step=1,
    description="Number of processors to use during indexing.",
    disabled=False,
    continuous_update=True,
    orientation="horizontal",
    readout=True,
    readout_format="d",
)
display(procs_select)

[32m----------------------------------------------------------------------------------------------------[0m
[35mEnter number of processors for indexing:[0m
[32m----------------------------------------------------------------------------------------------------[0m


IntSlider(value=64, description='Number of processors to use during indexing.', max=128, min=1)

In [54]:
for idx, row in enumerate(
    tqdm.tqdm(list([row[1] for row in list(metadata_df.iterrows())]))
):
    writer = ix.writer()
    writer.add_document(
        index=idx,
        #         url = row['URL'],
        #         title = row['Work'],
        #         author = row['Workgroup'],
        #         languages = row['Language'],
        #         keywords = u'',
        content=load_urn(urn=row["URN"]),
    )

    writer.commit()

  0%|          | 0/979 [00:00<?, ?it/s]


NameError: name 'ix' is not defined

Documentation example.

In [None]:
queries = list([query.strip() for query in open(file="greek_queries.txt").readlines()])
pprint(queries)

['Ξεν',
 'Ξενιζ',
 'ξενισ',
 'Ξενικ-ος',
 'Βαρβαρ-ος',
 'Βαρβαριζ',
 'Ελληνιζ',
 'ελλην',
 'ελληνικ',
 'σολοικ',
 'αρχαιζ']


In [None]:
with ix.searcher() as searcher:
    query = QueryParser(fieldname="content", schema=ix.schema).parse(
        #         text=' '.join(latin_queries),
        text=" OR ".join(queries)
    )
    results = searcher.search(q=query)
    pprint(list(results))
#     pprint(signature(obj=query))

[<Hit {'index': 927}>]
