# Text Search Pipeline for Latin

Query for documents or indexes of word occurrences inside documents.

## Initial Setup

Import widgets.

In [None]:
import ipywidgets as widgets
from IPython.display import display

Import other required modules.

In [None]:
import bs4, csv, multiprocessing as mp, numpy as np, os, pandas as pd, pickle, requests, tqdm
from inspect import signature
from dotenv import load_dotenv
from pprint import pprint
from termcolor import colored, cprint

Use caching.

In [None]:
import ipycache
from IPython.utils.traitlets import Unicode
%load_ext ipycache

Configurations.

In [None]:
# Colors passed to cprint by the loading cells: separator rules, body text,
# and the reference (file path / URI / URN) being loaded.
line_color, text_color, reference_color = 'blue', 'magenta', 'white'

Load environment variables from .env file.

In [None]:
# Populate os.environ from the .env file, then read the three data paths.
load_dotenv()

metadata_path = os.getenv(key='metadata_path')
metadata_df_path = os.getenv(key='metadata_df_path')
corpus_path = os.getenv(key='corpus_path')

# Echo the resolved paths so a missing variable (printed as None) is visible.
cprint(text='-' * 100, color='green')
for path_report in (
    f'Metadata path: {metadata_path}',
    f'Metadata dataframe path: {metadata_df_path}',
    f'Corpus path: {corpus_path}',
):
    cprint(text=path_report, color='magenta')

In [None]:
# Render a text widget pre-filled with a placeholder value
# (presumably a smoke test that ipywidgets displays correctly -- confirm).
text = widgets.Text('Testing...')
display(text)

## Load Metadata

Load metadata for First1KGreek project.

In [None]:
metadata_df = None

try:
    # Fast path: reuse the pickle written by the `%%cache metadata_df.pkl` cell.
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # this notebook produced itself.
    with open(file='metadata_df.pkl', mode='rb') as cache_file:
        metadata_df = pickle.load(file=cache_file)['metadata_df']

except Exception:
    # Cache missing, unreadable or malformed: rebuild the frame from the CSV.
    # `Exception` (rather than a bare `except:`) keeps the best-effort fallback
    # but no longer swallows KeyboardInterrupt / SystemExit.
    # newline='' is what the csv module expects for files it reads.
    with open(file=metadata_path, mode='r', encoding='utf-8', newline='') as csv_file:
        metadata_csv_reader = csv.reader(csv_file)
        # The first CSV row holds the column names.
        columns = next(metadata_csv_reader)
        metadata_df = pd.DataFrame(
            data=np.asarray(a=list(metadata_csv_reader)),
            columns=columns
        )

cprint(text='-' * 100, color='green')
cprint(text='metadata_df:', color='magenta')
cprint(text='-' * 100, color='green')

In [None]:
metadata_df

In [None]:
%%cache metadata_df.pkl metadata_df --force
metadata_df

## Import Data

Import Latin text.

### *From TXT File*

In [None]:
def load_txt(filename: str, encoding: str = 'utf-8') -> str:
    """
    Extract text from a .txt file.

    Parameters:
        filename (str): Path of file to load.
        encoding (str): Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform locale.

    Returns:
        str: Text loaded from file.

    Raises:
        ValueError: If ``filename`` does not exist.
    """
    if not os.path.exists(path=filename):
        raise ValueError(f'The path {filename} does not exist.')

    # The context manager closes the handle even if reading fails.
    with open(file=filename, mode='r', encoding=encoding) as txt_file:
        return txt_file.read()

# Hard-coded absolute path to a sample Latin text file.
filename = '/mnt/d/share/Using-AI-to-Trace-the-History-of-Race-and-Inequality/src/sample_text/latin/urn_cts_greekLit_stoa0146d.stoa001.opp-lat11.txt'
text = load_txt(filename=filename)

# Print a ruled header naming the source, then the loaded text itself.
for banner_text, banner_color in (
    ('-' * 100, line_color),
    ('Loading from TXT file:', text_color),
    (filename, reference_color),
    ('-' * 100, line_color),
    (text, text_color),
):
    cprint(text=banner_text, color=banner_color)

### *From URI*

In [None]:
def load_uri(uri: str, timeout: float = 30.0) -> str:
    """
    Load text from URI.

    Parameters:
        uri (str): URI link to text online.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout, requests may block indefinitely.

    Returns:
        str: Text loaded from URI.

    Raises:
        requests.exceptions.HTTPError: If the server responds with a 4xx/5xx
            status code.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    req = requests.get(url=uri, timeout=timeout)
    # Fail loudly on an error status instead of returning the error page
    # as if it were the requested text.
    req.raise_for_status()
    return req.text

# Fetch one passage directly by its full text URI.
uri = 'https://scaife.perseus.org/library/passage/urn:cts:greekLit:stoa0146d.stoa001.opp-lat1:1/text/'
text = load_uri(uri=uri)

# Print a ruled header naming the source, then the downloaded text.
uri_report = [
    ('-' * 100, line_color),
    ('Loading from URI:', text_color),
    (uri, reference_color),
    ('-' * 100, line_color),
    (text, text_color),
]
for report_text, report_color in uri_report:
    cprint(text=report_text, color=report_color)

### *From URN*

In [None]:
def load_urn(urn: str) -> str:
    """
    Load text from URN.

    Parameters:
        urn (str): URN that identifies a specific work. It is looked up in
            the ``URN`` column of the module-level ``metadata_df``.

    Returns:
        str: Text loaded from the URL derived from the work's metadata.

    Raises:
        IndexError: If ``urn`` does not appear in ``metadata_df``.
    """
    matches = metadata_df.index[metadata_df['URN'] == urn]
    if len(matches) == 0:
        # Same exception type as the bare `[0]` lookup would raise, but with
        # a message that names the URN that could not be found.
        raise IndexError(f'The URN {urn} was not found in the metadata.')
    idx = matches[0]

    # Reuse the second-to-last path segment of the stored URL as the passage
    # component of the text endpoint.
    passage = metadata_df.at[idx, "URL"].split("/")[-2]
    url = f'https://scaife.perseus.org/library/passage/{passage}/text/'

    # Delegate the download so URN and URI loading share one code path.
    return load_uri(uri=url)

urn = 'urn:cts:greekLit:stoa0146d.stoa001.opp-lat1'
text = load_urn(urn=urn)

# Rebuild the URL from the metadata row that belongs to *this* URN (not a
# hard-coded row 0), so the printed URL always matches the text that was fetched.
urn_row = metadata_df.index[metadata_df['URN'] == urn][0]
url = f'https://scaife.perseus.org/library/passage/{metadata_df.at[urn_row, "URL"].split("/")[-2]}/text/'

cprint(text='-' * 100, color=line_color)
cprint(text='Loading from URN:', color=text_color)
cprint(text=urn, color=reference_color)
cprint(text=url, color=text_color)
cprint(text='-' * 100, color=line_color)
cprint(text=text, color=text_color)

Create analyzer for text content.

In [None]:
from whoosh.lang import has_stemmer, languages
from whoosh.analysis import SpaceSeparatedTokenizer, LowercaseFilter

In [None]:
# List the languages whoosh.lang reports, framed by ruled header lines.
for header_text, header_color in (
    ('-' * 100, 'green'),
    ('Available languages for LanguageAnalyzer:', 'magenta'),
    ('-' * 100, 'green'),
):
    cprint(text=header_text, color=header_color)
pprint(languages)

# Report whether a stemmer is registered under the code "lat".
cprint(text='-' * 100, color='green')
cprint(text=f'Latin is available? {has_stemmer(lang="lat")}', color='magenta')

In [None]:
# Analysis pipeline for the `content` field: a space-separated tokenizer
# followed by a lowercase filter. No other filter is chained in.
analyzer = SpaceSeparatedTokenizer() | LowercaseFilter()
# Preview the tokens the analyzer yields for whatever `text` currently holds;
# each token is printed as soon as it is produced.
for token in analyzer(value=text):
    pprint(token)

Create schema.

In [None]:
from whoosh.index import create_in
from whoosh.fields import Schema, ID, KEYWORD, NUMERIC, TEXT
from whoosh.qparser import QueryParser

In [None]:
schema = Schema(
    # Unsigned 64-bit integer identifier; stored, sortable and unique.
    index=NUMERIC(
        numtype='int', bits=64, signed=False,
        stored=True, sortable=True, unique=True,
    ),
    # Source URL kept as a single untokenized, unique term.
    url=ID(stored=True, unique=True, sortable=False),
    # Title and author are stored, sortable, phrase-searchable text with
    # character positions recorded.
    title=TEXT(stored=True, sortable=True, phrase=True, chars=True),
    author=TEXT(stored=True, sortable=True, phrase=True, chars=True),
    # Comma-separated, lowercased language tags; stored but not scored.
    languages=KEYWORD(
        stored=True, commas=True, lowercase=True,
        scorable=False, sortable=False, unique=False,
    ),
    # Comma-separated, lowercased keywords that do contribute to scoring.
    keywords=KEYWORD(
        stored=True, commas=True, lowercase=True,
        scorable=True, sortable=True, unique=False,
    ),
    # Full text: indexed through the custom analyzer but not stored.
    content=TEXT(
        analyzer=analyzer, stored=False, sortable=False,
        phrase=True, chars=True,
    ),
)

Create index.

In [None]:
# create_in() needs the target directory to exist, so create it first
# (a no-op when it is already there).
index_dir = 'indexes/'
os.makedirs(index_dir, exist_ok=True)

# NOTE(review): per the Whoosh docs, create_in() on a directory that already
# holds an index of this name clears it -- re-running this cell starts from
# an empty index.
ix = create_in(dirname=index_dir,
               schema=schema,
               indexname='latin_index')

Index documents.

Wrapping document indexing in our own multiprocessing code does not work.

Instead, documents are indexed with a conventional for-loop; any multiprocessing is handled internally by the writer.

In [None]:
from whoosh.writing import AsyncWriter, BufferedWriter

In [None]:
# Load the pickled corpus and close the file handle as soon as it is read.
# NOTE: pickle can execute arbitrary code while loading; only point
# corpus_path at files from a trusted source.
with open(file=corpus_path, mode='rb') as corpus_file:
    corpus = pickle.load(file=corpus_file)['corpus']

In [None]:
# Number of leading metadata rows / corpus entries to index.
n_docs = 10

# Take the same positional slice from both sources so every metadata row is
# paired with its own corpus entry (zip also stops at the shorter of the two).
metadata_rows = metadata_df.iloc[:n_docs].itertuples(index=False, name=None)
corpus_entries = corpus[:n_docs]

# The writer's context manager commits on success and cancels the write if
# an exception escapes, so the index lock is always released.
with ix.writer() as writer:
    for record, document_text in zip(metadata_rows, corpus_entries):
        # One add_document() call per document, with scalar values per field.
        # Columns are mapped by position: 0=index, 1=url, 2=title, 3=author,
        # 4=languages.
        # NOTE(review): assumes column 0 holds integer-like values and that
        # the column order matches the schema -- confirm against the CSV.
        writer.add_document(
            index=int(record[0]),
            url=str(record[1]),
            title=str(record[2]),
            author=str(record[3]),
            languages=str(record[4]),
            keywords=u'',
            content=str(document_text),
        )

Documentation example.

In [None]:
# Search the `author` field for the single term "Workgroup" and print the
# results object while the searcher is still open.
with ix.searcher() as searcher:
    author_parser = QueryParser(fieldname='author', schema=ix.schema)
    query = author_parser.parse(text='Workgroup')
    results = searcher.search(q=query)
    pprint(results)