# Text Analytics Pipeline for Latin

Analyzes text from Latin sources in more detail.

## Initial Setup

Frequently used Python modules.

In [None]:
import bidict, cltk, multiprocessing as mp, numpy as np, pandas as pd, pickle, os, requests, tqdm
from dotenv import load_dotenv
from pprint import pprint
from termcolor import colored, cprint
from typing import Iterable

Use caching.

In [None]:
import ipycache
from IPython.utils.traitlets import Unicode
%load_ext ipycache

Configurations.

In [None]:
# colour names passed to termcolor when printing cell output:
# separator rules, body text and source references, in that order
line_color, text_color, reference_color = 'blue', 'magenta', 'white'

In [None]:
# numpy: print 10 decimal places and summarise arrays with more than 20 elements
np.set_printoptions(threshold=20, precision=10)

# pandas: render every float with 10 decimal places
pd.set_option('display.float_format', '{:.10f}'.format)

Load environment variables from .env file.

In [None]:
# pull the variables defined in the .env file into the process environment
load_dotenv()

# paths of the metadata CSV and of the pickled metadata dataframe (None when unset)
metadata_path, metadata_df_path = (
    os.environ.get(variable) for variable in ('metadata_path', 'metadata_df_path')
)

# echo the resolved paths below a separator rule
cprint('-' * 100, 'green')
cprint(f'Metadata path: {metadata_path}', 'magenta')
cprint(f'Metadata dataframe path: {metadata_df_path}', 'magenta')

## Load Metadata

Load metadata for First1KGreek project.

In [None]:
# csv is required by the fallback branch below and is not imported with the
# other modules at the top of the notebook
import csv

metadata_df = None

try:
    # fast path: reuse the dataframe pickled by an earlier run
    # NOTE(review): unpickling can execute arbitrary code; only point
    # metadata_df_path at files produced by this notebook
    with open(file=metadata_df_path, mode='rb') as cache_file:
        metadata_df = pickle.load(file=cache_file)['metadata_df']

except Exception:
    # the cache is best-effort: an unset or missing path, an unreadable pickle
    # or a missing key all fall back to parsing the metadata CSV
    # (newline='' is what the csv module expects for files it reads)
    with open(file=metadata_path, mode='r', encoding='utf-8', newline='') as csv_file:
        metadata_csv_reader = csv.reader(csv_file)
        # first row holds the column names, remaining rows are the records
        columns = next(metadata_csv_reader)
        metadata_df = pd.DataFrame(
            data=np.asarray(a=list(metadata_csv_reader)),
            columns=columns
        )

cprint(text='-' * 100, color='green')
cprint(text='metadata_df:', color='magenta')
cprint(text='-' * 100, color='green')

In [None]:
# show the metadata dataframe as the cell output
metadata_df

In [None]:
%%cache metadata_df.pkl metadata_df --force
# ipycache cell magic: saves the variable `metadata_df` to metadata_df.pkl
# NOTE(review): --force presumably re-runs the cell and overwrites an existing
# cache file instead of loading it — confirm against the ipycache documentation
metadata_df

## Import Data

Import Latin text.

### *From TXT Files*

In [None]:
def load_txt(filename: str, encoding: str = 'utf-8') -> str:
    """
    Extract text from a .txt file.

    Parameters:
        filename (str): Path of file to load.
        encoding (str): Encoding used to decode the file. Defaults to 'utf-8'
            so the result does not depend on the platform locale.

    Returns:
        str: Text loaded from file.

    Raises:
        ValueError: If `filename` does not exist.
    """
    if not os.path.exists(filename):
        raise ValueError(f'The path {filename} does not exist.')

    # the context manager closes the handle even when reading fails
    with open(file=filename, mode='r', encoding=encoding) as txt_file:
        return txt_file.read()

# sample text file, given as an absolute path on the local machine
filename = '/mnt/d/share/Using-AI-to-Trace-the-History-of-Race-and-Inequality/src/sample_text/latin/urn_cts_greekLit_stoa0146d.stoa001.opp-lat11.txt'
text = load_txt(filename=filename)

# heading, source path and loaded text, each framed by separator rules
for _message, _color in (
    ('-' * 100, line_color),
    ('Loading from TXT file:', text_color),
    (filename, reference_color),
    ('-' * 100, line_color),
    (text, text_color),
):
    cprint(text=_message, color=_color)

### *From URI*

In [None]:
def load_uri(uri: str):
    """
    Download the resource behind a URI and return its body as text.

    The HTTP status is not inspected, so the body of an error response is
    returned like any other text.

    Parameters:
        uri (str): URI of the text to fetch.

    Returns:
        str: Body of the response, decoded as text.
    """
    return requests.get(url=uri).text

# passage 1 of the sample work on the Scaife viewer
uri = 'https://scaife.perseus.org/library/passage/urn:cts:greekLit:stoa0146d.stoa001.opp-lat1:1/text/'
text = load_uri(uri=uri)

# heading, source URI and downloaded text, each framed by separator rules
for _message, _color in (
    ('-' * 100, line_color),
    ('Loading from URI:', text_color),
    (uri, reference_color),
    ('-' * 100, line_color),
    (text, text_color),
):
    cprint(text=_message, color=_color)

### *From URN*

In [None]:
def load_urn(urn: str):
    """
    Download the text of the work identified by a URN.

    The URN is looked up in the global `metadata_df`. The second-to-last
    '/'-separated segment of the first matching row's URL is used as the
    passage identifier in the Scaife request.

    Parameters:
        urn (str): URN that identifies a specific work.

    Returns:
        str: Body of the Scaife response, decoded as text.
    """
    # row labels of every metadata entry with this URN; only the first is used
    matching_rows = metadata_df.index[metadata_df['URN'] == urn]
    first_match = matching_rows[0]

    passage = metadata_df.at[first_match, 'URL'].split('/')[-2]
    response = requests.get(url=f'https://scaife.perseus.org/library/passage/{passage}/text/')
    return response.text

urn = 'urn:cts:greekLit:stoa0146d.stoa001.opp-lat1'
text = load_urn(urn=urn)

# rebuild the request URL for display from the metadata row that matches `urn`;
# using row 0 here is only correct when the URN happens to be the first row
urn_row = metadata_df.index[metadata_df['URN'] == urn][0]
passage = metadata_df.at[urn_row, 'URL'].split('/')[-2]
url = f'https://scaife.perseus.org/library/passage/{passage}/text/'

cprint(text='-' * 100, color=line_color)
cprint(text='Loading from URN:', color=text_color)
cprint(text=urn, color=reference_color)
cprint(text=url, color=text_color)
cprint(text='-' * 100, color=line_color)
cprint(text=text, color=text_color)

## Term Extraction

Detect and extract potentially important terms from text.

In [None]:
# initialize NLP model
nlp_model = cltk.NLP(language='lat')
doc = nlp_model.analyze(text=text)
pprint(doc)

cprint(text='-' * 100, color=line_color)
cprint(text='NLP model:', color=text_color)
cprint(text='-' * 100, color=line_color)
cprint(text=nlp_model, color=text_color)
cprint(text='-' * 100, color=line_color)
cprint(text='Document', color=text_color)
cprint(text='-' * 100, color=line_color)
cprint(text=doc, color=text_color)

Generate text corpus from URNs.

In [None]:
corpus = None

try:
    # reuse the corpus pickled by an earlier run, if there is one
    # NOTE(review): unpickling can execute arbitrary code; corpus.pkl should
    # only ever be a file written by this notebook
    with open(file='corpus.pkl', mode='rb') as cache_file:
        corpus = pickle.load(file=cache_file)['corpus']

except Exception:
    # no usable cache: download every work listed in the metadata in parallel.
    # the context manager shuts the worker processes down once starmap has
    # returned all results, so no pool is left running
    with mp.Pool(processes=mp.cpu_count()) as pool:
        corpus = np.asarray(
            a=pool.starmap(
                func=load_urn,
                # starmap unpacks each item as an argument tuple, hence the 1-tuples
                iterable=tqdm.tqdm([(urn,) for urn in metadata_df['URN']])
            )
        )

cprint(text='-' * 100, color='green')
cprint(text='Corpus:', color='magenta')
cprint(text='-' * 100, color='green')
pprint(corpus)

In [None]:
%%cache corpus.pkl corpus --force
# ipycache cell magic: saves the variable `corpus` to corpus.pkl
# NOTE(review): --force presumably re-runs the cell and overwrites an existing
# cache file instead of loading it — confirm against the ipycache documentation
corpus

Create count vectorizer.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# NOTE(review): this cell used to raise unconditionally before reading the
# pickles, so the caches were never used. That behaviour is kept, but as an
# explicit switch: set it to True to reuse vectorizer.pkl, vector.pkl and
# doc_term_matrix.pkl when all three load cleanly. Pickles written before the
# column-label fix below carry mislabelled columns and should be regenerated.
use_cached_vectorizer = False

vectorizer = None
vector = None
doc_term_matrix = None

if use_cached_vectorizer:
    try:
        with open(file='vectorizer.pkl', mode='rb') as cache_file:
            vectorizer = pickle.load(file=cache_file)['vectorizer']
        with open(file='vector.pkl', mode='rb') as cache_file:
            vector = pickle.load(file=cache_file)['vector']
        with open(file='doc_term_matrix.pkl', mode='rb') as cache_file:
            doc_term_matrix = pickle.load(file=cache_file)['doc_term_matrix']
    except Exception:
        # a partial or unreadable cache is discarded and everything is rebuilt
        vectorizer = vector = doc_term_matrix = None

if doc_term_matrix is None:
    vectorizer = CountVectorizer(input='content')
    vector = vectorizer.fit_transform(raw_documents=corpus)
    # vocabulary_ maps each term to its column index, but the dict's key order
    # is not the column order, so terms are sorted by index before being used
    # as column labels
    doc_term_matrix = pd.DataFrame(
        data=vector.toarray(),
        columns=sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
    )

cprint(text='-' * 100, color='green')
cprint(text='Vocabulary:', color='magenta')
cprint(text='-' * 100, color='green')
pprint(vectorizer.vocabulary_)
cprint(text='-' * 100, color='green')
cprint(text='Document term matrix:', color='magenta')
cprint(text='-' * 100, color='green')
pprint(doc_term_matrix)

In [None]:
%%cache vectorizer.pkl vectorizer --force
# ipycache cell magic: saves the variable `vectorizer` to vectorizer.pkl
# NOTE(review): --force presumably re-runs the cell and overwrites an existing
# cache file instead of loading it — confirm against the ipycache documentation
vectorizer

In [None]:
 %%cache vector.pkl vector --force
vector

In [None]:
%%cache doc_term_matrix.pkl doc_term_matrix --force
# ipycache cell magic: saves the variable `doc_term_matrix` to doc_term_matrix.pkl
# NOTE(review): --force presumably re-runs the cell and overwrites an existing
# cache file instead of loading it — confirm against the ipycache documentation
doc_term_matrix

# Search

Document identification via keywords present in content.

In [None]:
def docs_from_keywords(keywords: Iterable[str]):
    """
    Find documents that contain at least one (1) of any provided keywords in content.

    Keywords are compared exactly against the column labels of the global
    `doc_term_matrix`; keywords without a matching column are ignored. When
    no keyword matches a column, an empty subset is returned.

    Parameters:
        keywords (Iterable[str]): Keywords to search for to locate relevant documents.

    Returns:
        pd.DataFrame: Subset of metadata dataframe for only documents containing any keyword.
    """
    # pandas does not accept a set as a column indexer, so duplicates are
    # dropped into a list instead (first-seen order is kept)
    known_keywords = list(dict.fromkeys(
        keyword for keyword in keywords if keyword in doc_term_matrix.columns
    ))

    # a document matches when any of the selected term counts is non-zero
    has_keyword = doc_term_matrix[known_keywords].any(axis=1)

    # NOTE(review): assumes the rows of doc_term_matrix carry the same labels
    # as the rows of metadata_df — confirm
    return metadata_df.loc[doc_term_matrix.index[has_keyword], :]

In [None]:
# Greek keyword stems to look up in the document-term matrix
# NOTE(review): docs_from_keywords compares these exactly against the matrix
# column labels. With CountVectorizer's default settings tokens are lowercased
# and split at non-word characters, so the capitalised and hyphenated entries
# (and bare stems) probably never match a column — confirm
keywords = np.array([
    'Ξεν',
    'Ξενιζ',
    'ξενισ',
    'Ξενικ-ος',
    'Βαρβαρ-ος',
    'Βαρβαριζ',
    'Ελληνιζ',
    'ελλην',
    'ελληνικ',
    'σολοικ',
    'αρχαιζ',
])

# the matching metadata rows are shown as the cell output
docs_from_keywords(keywords=keywords)