# Data Collection Pipeline

Seek out as many books, chapters, sections, sentences, etc. as possible from each URN.

## Initial Setup

Import other required modules.

In [None]:
import csv
import os
import pickle
import re
import string
from itertools import count
from pprint import pprint

import multiprocess as mp
import numpy as np
import pandas as pd
import requests
import tqdm
from dotenv import load_dotenv
from termcolor import colored, cprint

Use caching.

In [None]:
import ipycache
from IPython.utils.traitlets import Unicode
%load_ext ipycache

Configurations.

In [None]:
# Colors handed to cprint: separator lines, body text, and references.
line_color, text_color, reference_color = 'green', 'magenta', 'white'

Load environment variables from .env file.

In [None]:
# Read the .env file, then pick up the four data locations it defines.
# Each value is None when the corresponding variable is not set.
load_dotenv()
metadata_path = os.environ.get('metadata_path')
metadata_df_path = os.environ.get('metadata_df_path')
corpus_path = os.environ.get('corpus_path')
corpus_normalized_path = os.environ.get('corpus_normalized_path')

# Echo the resolved paths so a misconfigured .env is visible immediately.
cprint('-' * 100, color=line_color)
cprint(f'Metadata path: {metadata_path}', color=text_color)
cprint(f'Metadata dataframe path: {metadata_df_path}', color=text_color)
cprint(f'Corpus path: {corpus_path}', color=text_color)
cprint(f'Corpus normalized path: {corpus_normalized_path}', color=text_color)

## Load Metadata

Load metadata for First1KGreek project.

In [None]:
metadata_df = None

try:
    # Prefer the pickled dataframe when a usable cache exists.
    # NOTE(security): unpickling can execute arbitrary code; metadata_df_path
    # must only ever point at a cache file written by this project.
    with open(file=metadata_df_path, mode='rb') as metadata_df_file:
        metadata_df = pickle.load(metadata_df_file)['metadata_df']

except (OSError, TypeError, KeyError, EOFError, AttributeError, ImportError, pickle.UnpicklingError):
    # Cache missing (OSError), path unset (TypeError), or cache unreadable /
    # not in the expected {'metadata_df': ...} layout: rebuild from the raw CSV.
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(file=metadata_path, mode='r', encoding='utf-8', newline='') as metadata_file:
        metadata_csv_reader = csv.reader(metadata_file)
        columns = next(metadata_csv_reader)  # first row is the header
        metadata_df = pd.DataFrame(
            data=list(metadata_csv_reader),
            columns=columns
        )

cprint(text='-' * 100, color=line_color)
cprint(text='metadata_df:', color=text_color)
cprint(text='-' * 100, color=line_color)

In [None]:
# Show the loaded metadata table as this cell's output.
metadata_df

## Import Data

Import text.

### *From URN*

In [None]:
def load_urn(urn: str, timeout: float = None):
    """
    Load text from URN.

    When the URN does not already end in a numeric passage reference
    (such as ``:1`` or ``:2.3``), ``:1`` is appended before the request
    is made.

    Parameters:
        urn (str): URN link that identifies specific work.
        timeout (float): Seconds to wait for the server, passed straight to
            ``requests.get``. Defaults to None, which waits indefinitely.

    Returns:
        str: Text loaded from URI.
    """
    # Raw string: \d and \. must reach the regex engine untouched instead of
    # being parsed (and warned about) as string escape sequences.
    if not re.search(pattern=r'.+:\d+(\.\d+)*$', string=urn):
        urn += ':1'

    uri = f'https://scaife.perseus.org/library/passage/{urn}/text/'
    req = requests.get(url=uri, timeout=timeout)
    return req.text

# Demonstration: fetch a single work by its URN and print the response body.
urn = 'urn:cts:greekLit:stoa0146d.stoa001.opp-lat1'
text = load_urn(urn=urn)
# Passage URL rebuilt from the second-to-last path segment of the first
# metadata row's "URL" value; it is only printed below, never passed to load_urn.
url = f'https://scaife.perseus.org/library/passage/{metadata_df.at[0, "URL"].split("/")[-2]}/text/'
cprint(text='-' * 100, color=line_color)
cprint(text='Loading from URN:', color=text_color)
cprint(text=urn, color=reference_color)
cprint(text=url, color=text_color)
cprint(text='-' * 100, color=line_color)
cprint(text=text, color=text_color)

Finding potential sections of the same work by guessing URLs to individual sections.

In [None]:
def get_sections_from_urn(urn: str, limit: int=None, timeout: float=None):
    """
    Get all first-level sections of a text identified by the URN.

    Sections are probed one at a time, starting at 1, by requesting the
    passage URL for ``{urn}:{section}``. Probing stops at the first request
    that gets redirected to a different URL, or once ``limit`` sections have
    been collected.

    Parameters:
        urn (str): URN link that identifies specific work.
        limit (int): The number of sections to extract from URN. Defaults to None. If None, all available sections are extracted.
        timeout (float): Seconds to wait for each request, passed straight to
            ``requests.get``. Defaults to None, which waits indefinitely.

    Returns:
        numpy.ndarray: Object array with one ``(section number, URN, URL)``
        row per section found; an empty array when no section was found.
    """
    pprint(urn)

    section_data = list()
    for section in count(start=1, step=1):
        # Check the limit first so no request is spent on a section that
        # would be discarded anyway.
        if limit is not None and section > limit:
            break

        potential_uri = f'https://scaife.perseus.org/library/passage/{urn}:{section}/text/'
        redirected_uri = requests.get(url=potential_uri, timeout=timeout).url

        # A redirect away from the guessed URL is taken to mean the section
        # does not exist, which ends the scan.
        if potential_uri != redirected_uri:
            break

        section_url = f'https://scaife.perseus.org/reader/{urn}:{section}/'

        # Record against the `urn` argument itself rather than any variable
        # of the calling cell, so the function works outside that loop too.
        section_data.append((section, urn, section_url,))

        pprint(section)
    return np.asarray(a=section_data, dtype=object)

In [None]:
# Probe every work listed in the metadata, capped at 5 sections per work.
# Each entry appended is the array returned for one metadata row; a work with
# no discoverable sections contributes an empty array.
data_by_urn = list()
for tup in tqdm.tqdm(metadata_df.itertuples()):
    data_by_urn.append(get_sections_from_urn(urn=tup.URN, limit=5))

Removing any URN groups that have no numbered sections in their URLs.

In [None]:
# Drop URN groups for which no section was found and, inside each remaining
# group, keep only complete three-field rows.
section_groups = [
    np.asarray(a=[section_group for section_group in urn_group if len(section_group) == 3], dtype=object)
    for urn_group in data_by_urn if len(urn_group) > 0
]

# Groups can hold different numbers of sections. Handing such a ragged list
# straight to np.asarray raises ValueError on current NumPy, so a 1-D object
# array is filled slot by slot with one 2-D array per group.
data_by_urn = np.empty(shape=len(section_groups), dtype=object)
for group_index, group_rows in enumerate(section_groups):
    data_by_urn[group_index] = group_rows
len(data_by_urn)

In [None]:
# Stack the rows of every URN group into a single table, one row per section.
columns = np.asarray(a=['Section', 'URN', 'URL'])
data = np.vstack(tup=data_by_urn)
section_df = pd.DataFrame(data=data, columns=columns)

In [None]:
# Show the per-section table (Section, URN, URL) as this cell's output.
section_df