# Data Collection Pipeline

Collect as many books, chapters, sections, sentences, etc. as possible from each URN.

## Initial Setup

Import other required modules.

In [1]:
import csv, multiprocessing as mp, numpy as np, os, pandas as pd, re, requests, string, tqdm
import pickle
from dotenv import load_dotenv
from itertools import count
from pprint import pprint
from termcolor import colored, cprint

Use caching.

In [2]:
import ipycache
from IPython.utils.traitlets import Unicode
%load_ext ipycache

ModuleNotFoundError: No module named 'IPython.config'

Configurations.

In [3]:
# Colors handed to cprint(): separator rules, body text and reference values, in that order.
line_color, text_color, reference_color = 'green', 'magenta', 'white'

Load environment variables from .env file.

In [4]:
# Read the variables declared in the .env file into the process environment.
load_dotenv()

# Resolve every configured path; os.getenv yields None for a variable that is not set.
metadata_path = os.getenv('METADATA_PATH')
metadata_df_path = os.getenv('METADATA_DF_PATH')
full_texts_path = os.getenv('FULL_TEXTS_PATH')
full_texts_normalized_path = os.getenv('FULL_TEXTS_NORMALIZED_PATH')

# Echo the resolved configuration beneath a separator rule.
cprint('-' * 100, line_color)
cprint(f'Metadata path: {metadata_path}', text_color)
cprint(f'Metadata dataframe path: {metadata_df_path}', text_color)
cprint(f'Full texts path: {full_texts_path}', text_color)
cprint(f'Full texts normalized path: {full_texts_normalized_path}', text_color)

[32m----------------------------------------------------------------------------------------------------[0m
[35mMetadata path: /mnt/c/share/AI-in-Classics/src/metadata.csv[0m
[35mMetadata dataframe path: /mnt/c/share/AI-in-Classics/src/metadata_df.csv[0m
[35mFull texts path: /mnt/c/share/AI-in-Classics/src/pipelines/data_collection/full_texts/[0m
[35mFull texts normalized path: None[0m


## Load Metadata

Load metadata for the First1KGreek project.

In [5]:
# Prefer a previously pickled dataframe; otherwise rebuild it from the raw metadata CSV.
metadata_df = None

try:
    # NOTE: unpickling can execute arbitrary code. Only point METADATA_DF_PATH at
    # files produced by this project.
    with open(file=metadata_df_path, mode='rb') as metadata_df_file:
        metadata_df = pickle.load(file=metadata_df_file)['metadata_df']

except Exception:
    # The pickle is a best-effort cache: an unset or missing path, a file that is not
    # a pickle, or a payload without the 'metadata_df' key all fall back to the CSV.
    # Catching Exception (rather than a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate.
    # newline='' is what the csv module expects so that quoted fields containing
    # line breaks are parsed correctly.
    with open(file=metadata_path, mode='r', encoding='utf-8', newline='') as metadata_file:
        metadata_csv_reader = csv.reader(metadata_file)
        # the first row holds the column names; the remaining rows are the records
        columns = next(metadata_csv_reader)
        metadata_df = pd.DataFrame(
            data=np.asarray(a=list(metadata_csv_reader)),
            columns=columns
        )

cprint(text='-' * 100, color=line_color)
cprint(text='metadata_df:', color=text_color)
cprint(text='-' * 100, color=line_color)

[32m----------------------------------------------------------------------------------------------------[0m
[35mmetadata_df:[0m
[32m----------------------------------------------------------------------------------------------------[0m


In [6]:
metadata_df

Unnamed: 0,Unnamed: 1,Workgroup,Work,Language,Words,URN,URL
0,0,Anonymous,Anametresis Pontou,"grc,lat,deu",374,urn:cts:greekLit:ggm0001.ggm001.1st1K-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
1,1,,Isaias,"lat,eng",36226,urn:cts:hebrewlit:heb0001.heb010.1st1K-eng1,https://scaife.perseus.org/reader/urn:cts:hebr...
2,2,Pinytus,De Epistola Pinyti ad Dionysium,"grc,lat",162,urn:cts:greekLit:ogl0001.ogl001.1st1K-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
3,3,pseudo-Aristotle,De mundo,grc,6446,urn:cts:greekLit:stoa0033a.tlg028.1st1K-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
4,4,pseudo-Aristotle,De spiritu,grc,3460,urn:cts:greekLit:stoa0033a.tlg043.1st1K-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
...,...,...,...,...,...,...,...
974,974,Scholia in Sophoclem,Scholia in Sophoclem (scholia vetera),"grc,lat",89341,urn:cts:greekLit:tlg5037.tlg004.1st1K-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
975,975,Anonymi In Aristotelis Librum Alterum Analytic...,Anonymi in analyticorum posteriorum librum alt...,"grc,lat",25619,urn:cts:greekLit:tlg9004.tlg001.opp-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
976,976,,Libanius Opera,lat,7534,urn:cts:greekLit:tlg9006.tlg011.opp-grc1,https://scaife.perseus.org/reader/urn:cts:gree...
977,977,Suda,Suidae lexicon,"grc,lat",821723,urn:cts:greekLit:tlg9010.tlg001.1st1K-grc1,https://scaife.perseus.org/reader/urn:cts:gree...


## Import Data

Import text.

### *From URN*

In [7]:
def load_urn(urn: str, timeout: float=30.0):
    """
    Load text from URN.

    Parameters:
        urn (str): URN that identifies a specific work. It may end in a numeric
            passage reference such as ':1' or ':1.2'; when it does not, ':1' is
            appended before the request is made.
        timeout (float): Seconds to wait for the server before requests gives up
            and raises a timeout error. Defaults to 30.0.

    Returns:
        str: Body of the HTTP response returned by the passage's text endpoint.
    """
    # raw string, so that `\d` and `\.` reach the regex engine instead of being
    # treated as (invalid) string escapes
    if not re.search(pattern=r'.+:\d+(\.\d+)*$', string=urn):
        urn += ':1'

    uri = f'https://scaife.perseus.org/library/passage/{urn}/text/'
    # without an explicit timeout requests waits indefinitely on a stalled server
    req = requests.get(url=uri, timeout=timeout)
    return req.text

In [8]:
# Example
urn = 'urn:cts:greekLit:stoa0146d.stoa001.opp-lat1'
text = load_urn(urn=urn)
url = f'https://scaife.perseus.org/library/passage/{metadata_df.at[0, "URL"].split("/")[-2]}/text/'
cprint(text='-' * 100, color=line_color)
cprint(text='Loading from URN:', color=text_color)
cprint(text=urn, color=reference_color)
cprint(text=url, color=text_color)
cprint(text='-' * 100, color=line_color)
cprint(text=text, color=text_color)

[32m----------------------------------------------------------------------------------------------------[0m
[35mLoading from URN:[0m
[37murn:cts:greekLit:stoa0146d.stoa001.opp-lat1[0m
[35mhttps://scaife.perseus.org/library/passage/urn:cts:greekLit:ggm0001.ggm001.1st1K-grc1:1.1/text/[0m
[32m----------------------------------------------------------------------------------------------------[0m
[35mACTA ARCHELAI. Thesaurus verus sive disputatio habita in Carcharis eiTitate Mesopotamiae
Archelai episeopi adversus Manen, Judicantibus Manippo et
Aegialeo et Clandio et Cleobolo. In qua urbe erat quidani vir Marcellus 
 nomine qui vita et studiis et genere, prudentia quoque et honestate
valde clanis habebatnr; faeultatibus etiam copiosus et quoii
omnium maximiim est, religiosiasime deum timens, et his quae de
Christo dicebantur semper cum timore auscultans, nec quicquam omnino
honi erat quod illi viro deesset; unde et honore plurimo ab nniversa 
 eivitate colebatnr plnrimisque ipse 

Find potential sections of the same work by guessing the URLs of individual sections.

In [9]:
def get_sections_from_urn(urn: str, limit: int=None, timeout: float=30.0):
    """
    Get all first-level sections of a text identified by the URN.

    Sections are probed one at a time, starting at 1: the text endpoint for
    '<urn>:<section>' is requested and the section is accepted only when the
    final response URL equals the requested one. The first section whose
    request ends up at a different URL stops the search.

    Parameters:
        urn (str): URN that identifies a specific work.
        limit (int): The number of sections to extract from URN. Defaults to None. If None, all available sections are extracted.
        timeout (float): Seconds to wait for each HTTP response before requests
            raises a timeout error. Defaults to 30.0.

    Returns:
        numpy.ndarray: Object array with one row per section found, holding the
            section number, the URN, and the reader URL of that section. The
            array is empty (length 0) when no section is found.
    """
    pprint(urn)

    section_data = list()
    for section in count(start=1, step=1):
        # check the limit first so that no request is spent on a section
        # that would be thrown away
        if limit is not None and section > limit:
            break

        potential_uri = f'https://scaife.perseus.org/library/passage/{urn}:{section}/text/'
        redirected_uri = requests.get(url=potential_uri, timeout=timeout).url

        # a response URL that differs from the requested one is taken to mean
        # that the section does not exist
        if potential_uri != redirected_uri:
            break

        section_url = f'https://scaife.perseus.org/reader/{urn}:{section}/'

        # record the URN passed to this function, not a variable leaked from
        # whatever loop happens to be calling it
        section_data.append((section, urn, section_url,))

        pprint(section)
    return np.asarray(a=section_data, dtype=object)

In [10]:
# Collect, for every work in the metadata, the array of sections found for its URN
# (at most 5 sections per work).
data_by_urn = list()
# itertuples() is a generator without a length, so the row count is passed explicitly
# to let tqdm show a total and a time estimate
for tup in tqdm.tqdm(metadata_df.itertuples(), total=len(metadata_df)):
    data_by_urn.append(get_sections_from_urn(urn=tup.URN, limit=5))

0it [00:00, ?it/s]

'urn:cts:greekLit:ggm0001.ggm001.1st1K-grc1'
1
2


1it [00:02,  2.71s/it]

'urn:cts:hebrewlit:heb0001.heb010.1st1K-eng1'
1
2
3
4
5


2it [00:07,  4.03s/it]

'urn:cts:greekLit:ogl0001.ogl001.1st1K-grc1'
1


3it [00:09,  3.01s/it]

'urn:cts:greekLit:stoa0033a.tlg028.1st1K-grc1'
1
2
3
4
5


4it [00:14,  3.87s/it]

'urn:cts:greekLit:stoa0033a.tlg043.1st1K-grc1'
1
2
3
4
5


5it [00:20,  4.62s/it]

'urn:cts:greekLit:stoa0121.stoa001.opp-grc1'
1


5it [00:21,  4.37s/it]


KeyboardInterrupt: 

Remove any URN groups that have no numerically indexed sections in their URLs.

In [None]:
# Drop works for which no section was found and keep only well-formed
# (section, URN, URL) rows.
# The groups stay in a plain list: they hold different numbers of sections, and
# wrapping such ragged groups in one outer np.asarray is rejected by current NumPy.
# A list is all that len() and np.vstack() need.
data_by_urn = [
    np.asarray(a=[section_group for section_group in urn_group if len(section_group) == 3])
    for urn_group in data_by_urn
    if len(urn_group) > 0
]
len(data_by_urn)

In [None]:
# Stack the per-work section arrays into a single table with one row per section.
data = np.vstack(data_by_urn)
columns = np.array(['Section', 'URN', 'URL'])
section_df = pd.DataFrame(data, columns=columns)

In [None]:
section_df