In [1]:
import pandas as pd # for working with dataframes
from os import listdir # for retrieving files from directory
from os.path import isfile, join # for retrieving files from directory
from pathlib import Path # for retrieving files from directory

In [2]:
data_dir = "./modified_texts/" # directory holding one text file per subchapter

# Sorted names of the regular files in data_dir. The sort fixes the index
# order shared by file_names, subchapter_names and texts. macOS's .DS_Store
# metadata file is skipped when present (it does not exist on other systems).
file_names = sorted(
    f for f in listdir(data_dir)
    if isfile(join(data_dir, f)) and f != ".DS_Store"
)

# subchapter name = file name without its ".txt" extension
subchapter_names = [f[:-4] if f.endswith(".txt") else f for f in file_names]

# full text of every file, with newlines flattened to spaces
# (UTF-8 is stated explicitly so the result does not depend on the OS locale)
texts = [
    Path(data_dir, f).read_text(encoding="utf-8").replace("\n", " ")
    for f in file_names
]

In [3]:
# Load the table that assigns entries to subchapters
df = pd.read_csv("./verify_subchapter.csv")

# One column per ";"-separated piece of the subchapter field; positions a row
# does not use (None/NaN after the split) are filled with 0
subchapter = (
    df["subchapter"]
    .str.split(";", expand=True)
    .fillna(0)
)

# Columns read by the matching step further down.
# NOTE: `type` shadows the Python builtin of the same name; the name is kept
# because the following cell reads it.
nahuatl, ID, type = (df[column] for column in ("nahuatl", "ID", "type"))

In [4]:
# Collect, for every subchapter text, the entries of the subchapter table that
# refer to it. The four lists are parallel and follow subchapter_names order.
chapter_list = []  # chapter label of each text (one leading "0" dropped)
ID_list = []       # per text: the ID of every matched entry
type_list = []     # per text: the type of every matched entry
name_list = []     # per text: the unique search names of every matched entry


def _split_names(raw_name):
    """Return the lowercase names held in one nahuatl cell as a list.

    Multiple names are separated by ";" (with or without a following space).
    A missing cell (NaN/None) yields an empty list instead of failing on
    ``.lower()``.
    """
    if not isinstance(raw_name, str):
        return []
    pieces = (piece.strip() for piece in raw_name.lower().split(";"))
    return [piece for piece in pieces if piece]


def _search_names(cell_text, nahuatl_names):
    """Return the unique names to search for, for one matched subchapter cell.

    When the cell contains whitespace, everything before its last
    whitespace-separated token is used as an alternate name; otherwise the
    nahuatl names stand in for it. The nahuatl names are always appended.
    """
    parts = cell_text.strip().rsplit(maxsplit=1)
    # NOTE(review): alternate names keep their original case while nahuatl
    # names are lowercased, and the later text search is case-sensitive --
    # confirm this is intended.
    alternates = parts[:1] if len(parts) == 2 else nahuatl_names
    # dict.fromkeys drops duplicates but, unlike set(), keeps a stable order,
    # so the exported name lists are identical from run to run
    return list(dict.fromkeys(alternates + nahuatl_names))


# Flatten the split subchapter table once, row by row (the same order as
# scanning it with .iloc[row, column]), instead of re-reading every cell with
# .iloc for every text. Non-string cells are the 0 padding and never name a
# subchapter, so they are left out.
subchapter_cells = [
    (row_pos, cell)
    for row_pos, row in enumerate(subchapter.to_numpy())
    for cell in row
    if isinstance(cell, str)
]

for file_stem in subchapter_names: # for each subchapter text

    # drop a single leading zero from the file-derived chapter label
    chapter = file_stem[1:] if file_stem.startswith("0") else file_stem

    # A cell matches when it contains the chapter label as a substring, unless
    # it also contains "1" + label (label "5" must not pick up "15").
    # NOTE(review): other substring collisions (label "5" inside "25" or "51")
    # are not filtered -- confirm against the data whether an exact comparison
    # with the cell's last token is wanted.
    excluded = "1" + chapter

    IDs = []          # ID of each matched entry
    types = []        # type of each matched entry
    unique_names = [] # unique search names of each matched entry
    for row_pos, cell_text in subchapter_cells:
        if chapter not in cell_text or excluded in cell_text:
            continue
        IDs.append(ID.iloc[row_pos])
        types.append(type.iloc[row_pos])
        unique_names.append(
            _search_names(cell_text, _split_names(nahuatl.iloc[row_pos]))
        )

    # store information to lists
    chapter_list.append(chapter)
    ID_list.append(IDs)
    type_list.append(types)
    name_list.append(unique_names)



In [5]:
# Sort every (chapter, entry) pair by whether at least one of the entry's
# names occurs (case-sensitive substring test) in that chapter's text.
present_list = [] # [chapter, ID, names] of entries found in their text
absent_list = []  # [chapter, ID, names] of entries not found in their text

# the per-text lists are matched up by position, so they must line up
assert len(chapter_list) == len(texts) == len(name_list) == len(ID_list), \
    "per-text lists have different lengths"

for curr_chapter, curr_txt, curr_names, curr_IDs in zip(
    chapter_list, texts, name_list, ID_list
):
    for entry_ID, entry_names in zip(curr_IDs, curr_names):
        record = [curr_chapter, entry_ID, entry_names]
        # any() stops at the first name that is found in the text
        if any(candidate in curr_txt for candidate in entry_names):
            present_list.append(record)
        else:
            absent_list.append(record)

In [6]:
# Export the entries that were not found in their text; the default integer
# row index is written as the first CSV column
(
    pd.DataFrame(absent_list, columns=["chapter", "ID", "names"])
    .to_csv("absent_list.csv")
)