In [1]:
import pandas as pd # for working with dataframes
from os import listdir # for retrieving files from directory
from os.path import isfile, join # for retrieving files from directory

In [2]:
data_dir = "./modified_texts/" # set data directory
file_names = [f for f in listdir(data_dir) if isfile(join(data_dir, f))] # create a list of file names
file_names.remove('.DS_Store') # remove .DS_Store file
file_names.sort() # sort the list of file names
file_names # check list of file names

subchapter_names = [] # get subchapter names without ".txt"
for i in range(len(file_names)):
    subchapter_names.append(file_names[i][:-4])

In [3]:
df = pd.read_csv("./verify_subchapter.csv") # read in subchapter data
subchapter = df["subchapter"].str.split(";", expand=True) # split subchapter data
subchapter.fillna(0, inplace=True) # replace None and NaN with 0
nahuatl = df["nahuatl"] # isolate the nahuatl column
ID = df["ID"] # isolate the ID column
type = df["type"] # isolate the type column

In [4]:
# store information for search in lists
chapter_list = []
ID_list = []
type_list = []
name_list = []

for i in range(len(subchapter_names)): # for each subchapter

    chapter = subchapter_names[i] # get the current chapter name
    
    if chapter[0]=="0": # remove leading zeros from chapter name
        chapter = chapter[1:]
    
    matches = [] # save matches to chapter
    rows = [] # save rows/index in df to matches
    
    for j in range(len(subchapter)): # for each row 
        for k in range(len(subchapter.columns)): # for each column
    
            if chapter in str(subchapter.iloc[j,k]): # if subchapter found in entry, save to list
                matches.append(subchapter.iloc[j,k])
                rows.append(j)
    
    for l in range(len(matches)): # remove leading and trailing zeros from matches
        if matches[l][0] == " ":
            matches[l] = matches[l][1:]
        if matches[l][-1] == " ":
            matches[l] = matches[l][:-1]
    
    nahuatl_names = [] # find nahuatl names for each matched index
    types = [] # find type for each matched index
    IDs = [] # find IDs for each matched index
    for m in rows:
        nahuatl_names.append(nahuatl.iloc[m])
        types.append(type.iloc[m])
        IDs.append(ID.iloc[m])
    
    nahuatl_names = [x.lower() for x in nahuatl_names] # make all names lowercase
    
    for n in range(len(nahuatl_names)): # split by "; " if multiple names
        if ";" in nahuatl_names[n]:
            nahuatl_names[n] = nahuatl_names[n].split("; ")
    
    for o in range(len(matches)): # find alternate names in matches
        if " " in matches[o]:
            matches[o] = matches[o][:matches[o].rfind(" ")]
        else: # if no alternate, replace with nahuatl name
            matches[o] = nahuatl_names[o]
    
    names_list = [] # combine the names in matches and nahuatl_names together for each entry
    for p in range(len(matches)):
        
        temp_list = []
        if isinstance(matches[p], list):
            for q in matches[p]:
                temp_list.append(q)
        else:
            temp_list.append(matches[p])
    
        if isinstance(nahuatl_names[p], list):
            for q in nahuatl_names[p]:
                temp_list.append(q)
        else:
            temp_list.append(nahuatl_names[p])
    
        names_list.append(temp_list)
    
    unique_names = [] # find only unique names for each entry
    for r in range(len(names_list)):
        unique_names.append(list(set(names_list[r])))

    # store information to lists
    chapter_list.append(chapter)
    ID_list.append(IDs)
    type_list.append(types)
    name_list.append(unique_names)



In [5]:
print(len(chapter_list))
chapter_list

98


['1a',
 '1b',
 '1c',
 '1d',
 '1e',
 '1f',
 '2a',
 '2b',
 '2c',
 '2d',
 '2e',
 '2f',
 '2g',
 '2h',
 '3a',
 '4a',
 '4b',
 '4c',
 '5a',
 '5b',
 '5c',
 '5d',
 '5e',
 '5f',
 '5g',
 '5h',
 '5i',
 '6a',
 '6b',
 '6c',
 '6d',
 '6e',
 '6f',
 '6g',
 '6h',
 '6i',
 '7a',
 '7b',
 '7c',
 '7d',
 '7e',
 '7f',
 '7g',
 '7h',
 '7i',
 '7j',
 '7k',
 '7l',
 '7m',
 '7n',
 '8a',
 '8b',
 '8c',
 '8d',
 '8e',
 '8f',
 '8g',
 '8h',
 '8i',
 '8j',
 '8k',
 '8l',
 '9a',
 '9b',
 '9c',
 '9d',
 '9e',
 '9f',
 '9g',
 '9h',
 '9i',
 '9j',
 '9k',
 '9l',
 '9m',
 '9n',
 '9o',
 '9p',
 '9q',
 '10a',
 '10b',
 '10c',
 '10d',
 '10e',
 '10f',
 '10g',
 '10h',
 '10i',
 '10j',
 '10k',
 '11a',
 '11b',
 '11c',
 '11d',
 '11e',
 '12a',
 '12b',
 '13a']

In [6]:
print(len(ID_list))
ID_list

98


[[28,
  49,
  70,
  70,
  70,
  123,
  134,
  143,
  150,
  208,
  240,
  266,
  267,
  268,
  268,
  269,
  309,
  310,
  310,
  310,
  310],
 [9, 92, 198, 215, 220, 268],
 [25, 31, 36, 68, 107, 134, 195, 245, 266, 268, 268, 268, 270],
 [16, 32, 56, 83, 103, 132, 163, 173, 176, 188],
 [12, 90, 114, 177, 271],
 [272],
 [28, 81, 105, 117, 198, 202, 215, 220, 265, 279],
 [48, 94, 110, 110, 118, 150, 194, 272, 273, 274],
 [],
 [],
 [107],
 [164, 172],
 [76, 77, 221],
 [],
 [24,
  112,
  178,
  179,
  213,
  215,
  252,
  257,
  266,
  268,
  268,
  272,
  272,
  272,
  275,
  295,
  298,
  300,
  301],
 [15, 236],
 [39],
 [228],
 [],
 [],
 [124, 276, 277, 278],
 [150, 180, 208, 268],
 [43, 94, 150, 240, 279],
 [41, 208],
 [160, 171],
 [28, 100, 100, 107, 196, 247, 277, 280],
 [208, 215, 256],
 [56, 116, 128, 215, 278],
 [196, 237],
 [122, 154, 168, 214],
 [14, 28, 204],
 [28, 170, 202],
 [178, 257, 259],
 [178, 257],
 [235],
 [54, 148, 252],
 [214],
 [19, 41, 45, 94, 128, 217, 233, 260, 2

In [7]:
print(len(type_list))
type_list

98


[['plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'stone',
  'stone',
  'stone',
  'stone',
  'stone',
  'other',
  'animal',
  'animal',
  'animal',
  'animal'],
 ['plant', 'plant', 'plant', 'plant', 'plant', 'stone'],
 ['plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'stone',
  'stone',
  'stone',
  'stone',
  'animal'],
 ['plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant'],
 ['plant', 'plant', 'plant', 'plant', 'animal'],
 ['stone'],
 ['plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'stone'],
 ['plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'stone',
  'other',
  'bird'],
 [],
 [],
 ['plant'],
 ['plant', 'plant'],
 ['plant', 'plant', 'plant'],
 [],
 ['plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  'plant',
  '

In [8]:
print(len(name_list))
name_list

98


[[['eca-patli'],
  ['iztac oco-xochitl'],
  ['cihua-patli'],
  ['cihua-patli'],
  ['cihua-patli'],
  ['nochtli'],
  ['quauh-alahuac'],
  ['a-huexotl', 'quetzal-ahuexotl'],
  ['te-amoxtli'],
  ['tlanextia', 'tlanextia xiuhtontli'],
  ['xal-tomatl'],
  ['tetlahuitl'],
  ['iztac tlalli'],
  ['eztetl'],
  ['eztetl'],
  ['te-mamatlatzin'],
  ['octli'],
  ['tlaquatzin'],
  ['tlaquatzin'],
  ['tlaquatzin'],
  ['tlaquatzin']],
 [['ahuiyac-xihuitl'],
  ['chipauac', 'chipauac xihuitl'],
  ['tlal-ahuehuetl'],
  ['tlatlanquaye'],
  ['tla-yapaloni'],
  ['eztetl']],
 [['ayo-nelhuatl'],
  ['xiuh-elo-quilitl', 'xiuh-eloquilitl'],
  ['iztac huitz-quahuitl'],
  ['centzon-xochitl'],
  ['malinalli'],
  ['quauh-alahuac'],
  ['tlaco-popotli', 'tlaco-popotl'],
  ['xiuh-patli'],
  ['tetlahuitl'],
  ['eztetl'],
  ['eztetl'],
  ['eztetl'],
  ['atzitzicuilotl']],
 [['atoya-xocotl'],
  ['elo-zacatl'],
  ['quauh-yyauhtli'],
  ['copal-quahuitl'],
  ['huitz-quilitl'],
  ['pozahualiz-xiuhtontli'],
  ['tequam-maiti'],