In [None]:
#===========================================================================
#PRELIMINARIES

#This code allows you to download data from the Open Richly Annotated 
#Cuneiform Corpus (ORACC).

#This project is code that I have remixed from Niek Veldhuis' Computational 
#Assyriology (Compass) project (https://github.com/niekveldhuis/compass). I 
#have not written the original code, but I did put two part of his code 
#together and add filters in order to produce a dataset from ORACC I required 
#for my project.

#In the same folder you download this file, you need a folder called "output".
#This folder will be where your saved data will be saved to.

#You are free to use, reuse, and remix this code, but please credit myself 
#(Ellie Bennett) and Niek Veldhuis.
#CC BY-SA 4.0
#===========================================================================

In [1]:
#===========================================================================
#IMPORTS PACKAGES
#This bit of code imports the packages needed to make the rest of the code 
#work. 
#===========================================================================
import pandas as pd
import zipfile
import json
#from tqdm._tqdm_notebook import tqdm_notebook
from tqdm.auto import tqdm
import os
import sys


Project(s): rinap/rinap1, rinap/rinap2, rinap/rinap3, rinap/rinap4, rinap/rinap5
Saving http://oracc.museum.upenn.edu/rinap/rinap4/json/rinap-rinap4.zip as jsonzip/rinap-rinap4.zip


HBox(children=(IntProgress(value=1, bar_style='info', description='rinap/rinap4', max=1, style=ProgressStyle(d…


Saving http://oracc.museum.upenn.edu/rinap/rinap5/json/rinap-rinap5.zip as jsonzip/rinap-rinap5.zip


HBox(children=(IntProgress(value=1, bar_style='info', description='rinap/rinap5', max=1, style=ProgressStyle(d…


Saving http://oracc.museum.upenn.edu/rinap/rinap1/json/rinap-rinap1.zip as jsonzip/rinap-rinap1.zip


HBox(children=(IntProgress(value=1, bar_style='info', description='rinap/rinap1', max=1, style=ProgressStyle(d…


Saving http://oracc.museum.upenn.edu/rinap/rinap2/json/rinap-rinap2.zip as jsonzip/rinap-rinap2.zip


HBox(children=(IntProgress(value=1, bar_style='info', description='rinap/rinap2', max=1, style=ProgressStyle(d…


Saving http://oracc.museum.upenn.edu/rinap/rinap3/json/rinap-rinap3.zip as jsonzip/rinap-rinap3.zip


HBox(children=(IntProgress(value=1, bar_style='info', description='rinap/rinap3', max=1, style=ProgressStyle(d…




In [2]:
#===========================================================================
#WHAT SUB-PROJECTS?
#This will produce a text box for you to input the sub-projects on ORACC you 
#want data from. You need it to be in the format of: project/sub-project. For 
#example, If I only want data from RINAP 4, I would type rinap/rinap4. If I 
#want data from several projects, for example, all of RINAP, I would type in 
#all the projects separated by a comma and space. 

#E.g.: rinap/rinap1, rinap/rinap2, rinap/rinap3, rinap/rinap4, rinap/rinap5

#I recommend you make a list in a .txt file of all the sub-projects you want 
#to use, so you can easily repeat this step.

#When you have typed in the project names, press ENTER, and the code will 
#create a .zip folder with all the JSON files from those projects in the same 
#folder as the code.
#===========================================================================
util_dir = os.path.abspath('../utils')
sys.path.append(util_dir)
from utils import *

directories = ['jsonzip', 'output']
make_dirs(directories)

projects = input('Project(s): ').lower()
p = format_project_list(projects)

p = oracc_download(p)

Project(s): rinap/rinap1, rinap/rinap2, rinap/rinap3, rinap/rinap4, rinap/rinap5
Saving http://oracc.museum.upenn.edu/rinap/rinap4/json/rinap-rinap4.zip as jsonzip/rinap-rinap4.zip


HBox(children=(IntProgress(value=1, bar_style='info', description='rinap/rinap4', max=1, style=ProgressStyle(d…


Saving http://oracc.museum.upenn.edu/rinap/rinap5/json/rinap-rinap5.zip as jsonzip/rinap-rinap5.zip


HBox(children=(IntProgress(value=1, bar_style='info', description='rinap/rinap5', max=1, style=ProgressStyle(d…


Saving http://oracc.museum.upenn.edu/rinap/rinap1/json/rinap-rinap1.zip as jsonzip/rinap-rinap1.zip


HBox(children=(IntProgress(value=1, bar_style='info', description='rinap/rinap1', max=1, style=ProgressStyle(d…


Saving http://oracc.museum.upenn.edu/rinap/rinap2/json/rinap-rinap2.zip as jsonzip/rinap-rinap2.zip


HBox(children=(IntProgress(value=1, bar_style='info', description='rinap/rinap2', max=1, style=ProgressStyle(d…


Saving http://oracc.museum.upenn.edu/rinap/rinap3/json/rinap-rinap3.zip as jsonzip/rinap-rinap3.zip


HBox(children=(IntProgress(value=1, bar_style='info', description='rinap/rinap3', max=1, style=ProgressStyle(d…




In [4]:
#===========================================================================
#LEMMA TABLE
#This is the first part of Niek's code, and produces a table (or dataframe) 
#that includes the CDLI ID number (or P number) of the text, and all of the 
#words in the text.

#I have provided comments in the sections where you can adjust the code for 
#your own project.
#===========================================================================
def parsejson(text):
    for JSONobject in text["cdl"]:
        if "cdl" in JSONobject: 
            parsejson(JSONobject)
        if "f" in JSONobject:
            lemm = JSONobject["f"]
            lemm["id_text"] = id_text
            lemm_l.append(lemm)
    return

# initiate the list that will hold all the lemmatization data of all texts in all requested projects
lemm_l = [] 
for project in p:
    # print("Parsing " + project)
    file = "jsonzip/" + project.replace("/", "-") + ".zip"
    try:
        z = zipfile.ZipFile(file)       # create a Zipfile object
    except:
        print(file + " does not exist or is not a proper ZIP file")
        continue
    
    files = z.namelist()     # list of all the files in the ZIP
    files = [name for name in files if "corpusjson" in name and name[-5:] == '.json']                                                                                                  #that holds all the P, Q, and X numbers.
    for filename in tqdm(files, desc=project):                            #iterate over the file names
        #id_text = project + filename[-13:-5] commented out to make index match other df # id_text is, for instance, blms/P414332
        id_text = filename[-12:-5]
        try:
            text = z.read(filename).decode('utf-8')         #read and decode the json file of one particular text
            data_json = json.loads(text)                # make it into a json object (essentially a dictionary)
            parsejson(data_json)               # and send to the parsejson() function
        except:
            tqdm.write(id_text + ' is not available or not complete')
    z.close()
    
word_df = pd.DataFrame(lemm_l)

# replace NaN (Not a Number) with empty string
word_df = word_df.fillna('')
word_df

findreplace = {' ' : '-', ',' : ''}
word_df = word_df.replace({'gw' : findreplace, 'sense' : findreplace}, regex = True)

#This is where you can change how a word is displayed. 
#Now it will have a word as: lemma[guideword]POS. 
#ORACC has explanations for what each of these terms mean: http://oracc.museum.upenn.edu/doc/help/lemmatising/primer/index.html
word_df["lemma"] = word_df["cf"] + '[' + word_df["gw"] + ']' + word_df["pos"] 

#This changes a word that would appear as 'x[NA]NA' to an underscore.
word_df.loc[word_df["cf"] == "" , 'lemma'] = word_df['cf'] + '_' 

#I only want the IDs and the lemma. 
#You can just show the lemma as well by deleting the 'id_text', bit in square brackets.
word_df[['id_text', 'lemma']]

#Here the code displays a dataframe as each row is a different CDLI ID, followed by the lemma.
doc_df = word_df.groupby("id_text").agg({"lemma": ' '.join})
doc_df

HBox(children=(IntProgress(value=0, description='rinap/rinap4', max=183, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='rinap/rinap5', max=314, style=ProgressStyle(description_width…

Q008342 is not available or not complete
Q008347 is not available or not complete



HBox(children=(IntProgress(value=0, description='rinap/rinap1', max=96, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='rinap/rinap2', max=151, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='rinap/rinap3', max=261, style=ProgressStyle(description_width…




Unnamed: 0_level_0,lemma
id_text,Unnamed: 1_level_1
Q003230,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...
Q003231,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...
Q003232,_ _ _ _ _ _ _ _ _ qabû[say]V _ _ alāku[go]V _ ...
Q003233,_ kamîš[in-a-bound-state]AV u[and]CNJ Teušpa[C...
Q003234,ša[that]REL ahu[brother]N rabû[great]AJ ahu[br...
...,...
Q008355,ana[to]PRP Sutiti[1]DN bēltu[lady]N Aššur-bani...
Q008356,ana[to]PRP Sutiti[1]DN bēltu[lady]N Aššur-bani...
Q008357,_ Aššur-bani-apli[Ashurbanipal-king-of-Assyria...
Q008358,ēkallu[palace]N Aššur-bani-apli[Ashurbanipal-k...


In [6]:
#===========================================================================
#METADATA TABLE
#This is the second bit of Niek's code. It produces a table of all the 
#metadata associated with a specific text.

#This code produces the table with all the metadata. I recommend making this 
#and saving the dataframe, and then inspecting the full list of different metadata. 
#Then you can choose which ones are most applicable to your project, and 
#continue in this process.
#===========================================================================

dfcat = pd.DataFrame() 
for project in p:
    file = "jsonzip/" + project.replace("/", "-") + ".zip"
    try:
        z = zipfile.ZipFile(file)      
    except:
        print(file + " does not exist or is not a proper ZIP file")
        continue
    try:
        st = z.read(project + '/catalogue.json').decode('utf-8')  #read and decode the catalogue.json file of one project
                                                                
    except:
        print(project + '/catalogue.json' + ' is not available or not complete')
        continue
    cat = json.loads(st)
    cat = cat['members']  # select the 'members' node 
    for item in cat.values():
        item['project'] = project # add project name as separate field
    cat_df = pd.DataFrame(cat).T
    dfcat = pd.concat([dfcat, cat_df], sort=True)  # sort=True is necessary in case catalogs have a different set of fields
dfcat

Unnamed: 0,Q_designation,ancient_date,ancient_day,ancient_month,ancient_year,cdli_id,century,collection,credits,date_of_origin,...,ruler,script,script_remarks,script_type,seal_id,subgenre,subproject,supergenre,trans,year_name_eponym
Q003230,,,,,,P462851,,"British Museum, London, UK; Oriental Institute...","Created by Erle Leichty, and the Royal Inscrip...","673, 672",...,Esarhaddon,Neo-Assyrian,inscribed,Cuneiform,,Esarhaddon,,LIT,[en],"Atar-ili, Nabû-bēlī-uṣur"
Q003231,,,,,,P462852,,"National Museum of Iraq, Baghdad, Iraq; Britis...","Created by Erle Leichty, Jamie Novotny, and th...",676,...,Esarhaddon,Neo-Assyrian,inscribed,Cuneiform,,Esarhaddon,,LIT,[en],Banbâ
Q003232,,,,,,P462853,,"British Museum, London, UK","Created by Erle Leichty, Jamie Novotny, and th...",676-669,...,Esarhaddon,Neo-Assyrian,inscribed,Cuneiform,,Esarhaddon,,LIT,[en],
Q003233,,,,,,P462854,,"National Museum of Iraq, Baghdad, Iraq","Created by Erle Leichty, Jamie Novotny, and th...",676-669,...,Esarhaddon,Neo-Assyrian,inscribed,Cuneiform,,Esarhaddon,,LIT,[en],
Q003234,,,,,,P462855,,"British Museum, London, UK","Created by Erle Leichty, Jamie Novotny, and th...",671-670,...,Esarhaddon,Neo-Assyrian,inscribed,Cuneiform,,Esarhaddon,,LIT,[en],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q004080,,,,,,P467271,,"British Museum, London, UK","Created by A. Kirk Grayson, Jamie Novotny, and...",704-681,...,Sennacherib,Neo-Assyrian,inscribed,Cuneiform,,Sennacherib,,LIT,[en],
Q004081,,,,,,P467272,,"British Museum, London, UK","Created by A. Kirk Grayson, Jamie Novotny, and...",704-681,...,Sennacherib,Neo-Assyrian,inscribed,Cuneiform,,Sennacherib,,LIT,[en],
Q004082,,,,,,P467273,,"British Museum, London, UK","Created by A. Kirk Grayson, Jamie Novotny, and...",704-681,...,Sennacherib,Neo-Assyrian,inscribed,Cuneiform,,Sennacherib,,LIT,[en],
Q004088,,,,,,P467274,,"Vorderasiatisches Museum, Berlin, Germany","Created by A. Kirk Grayson, Jamie Novotny, and...",704-681,...,Sennacherib,Neo-Assyrian,inscribed,Cuneiform,,Sennacherib,,LIT,[en],


In [None]:
#===========================================================================
#SAVE THE FILE
#You can choose to save this table at this point so you can inspect all the 
#different types of metadata available to you.

#This will save as a .csv file to your output folder.
#===========================================================================

savefile =  'metadata.csv' #Choose the name of the file for your table.
with open('output/' + savefile, 'w', encoding="utf-8") as w:
    dfcat.to_csv(w, index=True) #make sure the beginning of this line (before .to_csv) matches your table name.

In [8]:
#===========================================================================
#CLEAN YOUR DATA
#This bit of code will make a table only showing the metadata you want for 
#the texts.
#===========================================================================

#This bit of code fills fields that are empty with an empty string, not NaN.
dfcatblank = dfcat.fillna('')

#This is where you choose the columns of your desired metadata. 
#In this case, I wanted the columns 'id_text', 'period', 'language', and 'genre'.
dfmeta = dfcatblank[['id_text', 'period', 'language', 'genre']] 
dfmeta

Unnamed: 0,id_text,period,language,genre
Q003230,,Neo-Assyrian,Akkadian,Royal Inscription
Q003231,,Neo-Assyrian,Akkadian,Royal Inscription
Q003232,,Neo-Assyrian,Akkadian,Royal Inscription
Q003233,,Neo-Assyrian,Akkadian,Royal Inscription
Q003234,,Neo-Assyrian,Akkadian,Royal Inscription
...,...,...,...,...
Q004080,,Neo-Assyrian,Akkadian,Royal Inscription
Q004081,,Neo-Assyrian,Akkadian,Royal Inscription
Q004082,,Neo-Assyrian,Akkadian,Royal Inscription
Q004088,,Neo-Assyrian,Akkadian,Royal Inscription


In [10]:
#===========================================================================
#JOINING THE TABLES
#So now you have two tables: one with the lemma, and one with the metadata.
#This bit of code joins the two together based on the index - here called 
#id_index.
#===========================================================================

dflemmameta = doc_df.join(dfmeta)
dflemmameta

Unnamed: 0_level_0,lemma,id_text,period,language,genre
id_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Q003230,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...,,Neo-Assyrian,Akkadian,Royal Inscription
Q003231,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...,,Neo-Assyrian,Akkadian,Royal Inscription
Q003232,_ _ _ _ _ _ _ _ _ qabû[say]V _ _ alāku[go]V _ ...,,Neo-Assyrian,Akkadian,Royal Inscription
Q003233,_ kamîš[in-a-bound-state]AV u[and]CNJ Teušpa[C...,,Neo-Assyrian,Akkadian,Royal Inscription
Q003234,ša[that]REL ahu[brother]N rabû[great]AJ ahu[br...,,Neo-Assyrian,Akkadian,Royal Inscription
...,...,...,...,...,...
Q008355,ana[to]PRP Sutiti[1]DN bēltu[lady]N Aššur-bani...,,Neo-Assyrian,Akkadian,Royal Inscription
Q008356,ana[to]PRP Sutiti[1]DN bēltu[lady]N Aššur-bani...,,Neo-Assyrian,Akkadian,Royal Inscription
Q008357,_ Aššur-bani-apli[Ashurbanipal-king-of-Assyria...,,Neo-Assyrian,Akkadian,Royal Inscription
Q008358,ēkallu[palace]N Aššur-bani-apli[Ashurbanipal-k...,,Neo-Assyrian,Akkadian,Royal Inscription


In [13]:
#===========================================================================
#CHANGING COLUMN ORDER
#You now have everything in your table, but to make it easier to look at, 
#we'll re-order the columns so lemma are at the end.

#Make a note of how many rows there are - Jupyter Notebooks will automatically 
#tell you.
#===========================================================================

dfmetalemma = dflemmameta[['id_text', 'period', 'language', 'genre', 'lemma']]
dfmetalemma

Unnamed: 0_level_0,id_text,period,language,genre,lemma
id_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Q003230,,Neo-Assyrian,Akkadian,Royal Inscription,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...
Q003231,,Neo-Assyrian,Akkadian,Royal Inscription,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...
Q003232,,Neo-Assyrian,Akkadian,Royal Inscription,_ _ _ _ _ _ _ _ _ qabû[say]V _ _ alāku[go]V _ ...
Q003233,,Neo-Assyrian,Akkadian,Royal Inscription,_ kamîš[in-a-bound-state]AV u[and]CNJ Teušpa[C...
Q003234,,Neo-Assyrian,Akkadian,Royal Inscription,ša[that]REL ahu[brother]N rabû[great]AJ ahu[br...
...,...,...,...,...,...
Q008355,,Neo-Assyrian,Akkadian,Royal Inscription,ana[to]PRP Sutiti[1]DN bēltu[lady]N Aššur-bani...
Q008356,,Neo-Assyrian,Akkadian,Royal Inscription,ana[to]PRP Sutiti[1]DN bēltu[lady]N Aššur-bani...
Q008357,,Neo-Assyrian,Akkadian,Royal Inscription,_ Aššur-bani-apli[Ashurbanipal-king-of-Assyria...
Q008358,,Neo-Assyrian,Akkadian,Royal Inscription,ēkallu[palace]N Aššur-bani-apli[Ashurbanipal-k...


In [15]:
#===========================================================================
#FILTERING ACCORDING TO GENRE

#Here you can filter the dataframe according to a column. In this case, it 
#is filtering the 'genre' column according to 'Royal Inscription'.
#===========================================================================

dfgenre = dfmetalemma[dfmetalemma['genre']=='Royal Inscription']
dfgenre

Unnamed: 0_level_0,id_text,period,language,genre,lemma
id_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Q003230,,Neo-Assyrian,Akkadian,Royal Inscription,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...
Q003231,,Neo-Assyrian,Akkadian,Royal Inscription,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...
Q003232,,Neo-Assyrian,Akkadian,Royal Inscription,_ _ _ _ _ _ _ _ _ qabû[say]V _ _ alāku[go]V _ ...
Q003233,,Neo-Assyrian,Akkadian,Royal Inscription,_ kamîš[in-a-bound-state]AV u[and]CNJ Teušpa[C...
Q003234,,Neo-Assyrian,Akkadian,Royal Inscription,ša[that]REL ahu[brother]N rabû[great]AJ ahu[br...
...,...,...,...,...,...
Q008355,,Neo-Assyrian,Akkadian,Royal Inscription,ana[to]PRP Sutiti[1]DN bēltu[lady]N Aššur-bani...
Q008356,,Neo-Assyrian,Akkadian,Royal Inscription,ana[to]PRP Sutiti[1]DN bēltu[lady]N Aššur-bani...
Q008357,,Neo-Assyrian,Akkadian,Royal Inscription,_ Aššur-bani-apli[Ashurbanipal-king-of-Assyria...
Q008358,,Neo-Assyrian,Akkadian,Royal Inscription,ēkallu[palace]N Aššur-bani-apli[Ashurbanipal-k...


In [7]:
#===========================================================================
#SAVE THE FILE
#You can choose to save this table at this point.

#This will save as a .csv file to your output folder.
#===========================================================================
savefile =  'Royal_Inscriptions.csv'#Choose the name of the file for your table.
with open('output/' + savefile, 'w', encoding="utf-8") as w:
    dfgenre.to_csv(w, index=True) #make sure the beginning of this line (before .to_csv) matches your table name.

In [16]:
#===========================================================================
#FILTERING ACCORDING TO lANGUAGE

#Here you can filter the dataframe according to a column. In this case, it 
#is filtering the 'language' column according to 'Akkadian'.
#===========================================================================
neoasslemma = dfgenre[dfgenre['language']=='Akkadian']
neoasslemma

Unnamed: 0_level_0,id_text,period,language,genre,lemma
id_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Q003230,,Neo-Assyrian,Akkadian,Royal Inscription,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...
Q003231,,Neo-Assyrian,Akkadian,Royal Inscription,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...
Q003232,,Neo-Assyrian,Akkadian,Royal Inscription,_ _ _ _ _ _ _ _ _ qabû[say]V _ _ alāku[go]V _ ...
Q003233,,Neo-Assyrian,Akkadian,Royal Inscription,_ kamîš[in-a-bound-state]AV u[and]CNJ Teušpa[C...
Q003234,,Neo-Assyrian,Akkadian,Royal Inscription,ša[that]REL ahu[brother]N rabû[great]AJ ahu[br...
...,...,...,...,...,...
Q008355,,Neo-Assyrian,Akkadian,Royal Inscription,ana[to]PRP Sutiti[1]DN bēltu[lady]N Aššur-bani...
Q008356,,Neo-Assyrian,Akkadian,Royal Inscription,ana[to]PRP Sutiti[1]DN bēltu[lady]N Aššur-bani...
Q008357,,Neo-Assyrian,Akkadian,Royal Inscription,_ Aššur-bani-apli[Ashurbanipal-king-of-Assyria...
Q008358,,Neo-Assyrian,Akkadian,Royal Inscription,ēkallu[palace]N Aššur-bani-apli[Ashurbanipal-k...


In [10]:
#===========================================================================
#SAVE THE FILE
#You can choose to save this table at this point.

#This will save as a .csv file to your output folder.
#===========================================================================
savefile =  'Royal_Inscriptions_Akkadian.csv'#Choose the name of the file for your table.
with open('output/' + savefile, 'w', encoding="utf-8") as w:
    neoasslemma.to_csv(w, index=True) #make sure the beginning of this line (before .to_csv) matches your table name.

In [17]:
#===========================================================================
#ONLY SHOW THE TEXTS CONTENTS
#Now you've filtered your data according to your parameters, you can now 
#choose to only view the lemma. The following code only displays the 'lemma' 
#column.
#===========================================================================

Cleanperiodlemma = neoasslemma[['lemma']]
Cleanperiodlemma

Unnamed: 0_level_0,lemma
id_text,Unnamed: 1_level_1
Q003230,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...
Q003231,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...
Q003232,_ _ _ _ _ _ _ _ _ qabû[say]V _ _ alāku[go]V _ ...
Q003233,_ kamîš[in-a-bound-state]AV u[and]CNJ Teušpa[C...
Q003234,ša[that]REL ahu[brother]N rabû[great]AJ ahu[br...
...,...
Q008355,ana[to]PRP Sutiti[1]DN bēltu[lady]N Aššur-bani...
Q008356,ana[to]PRP Sutiti[1]DN bēltu[lady]N Aššur-bani...
Q008357,_ Aššur-bani-apli[Ashurbanipal-king-of-Assyria...
Q008358,ēkallu[palace]N Aššur-bani-apli[Ashurbanipal-k...


In [18]:
#===========================================================================
#REMOVE DUPLICATE TEXTS
#ORACC has texts that appear in multiple sub-projects. This can skew any 
#statistical analysis. To avoid this, you remove the duplicates according to 
#their P number.
#===========================================================================

NeoAsslemmanoduplicates = Cleanperiodlemma.drop_duplicates(subset=['lemma'], keep='last') #The last bit of code says that if there are duplicates, keep the last instance.
NeoAsslemmanoduplicates

Unnamed: 0_level_0,lemma
id_text,Unnamed: 1_level_1
Q003230,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...
Q003231,ēkallu[palace]N Aššur-ahu-iddina[Esarhaddon-ki...
Q003232,_ _ _ _ _ _ _ _ _ qabû[say]V _ _ alāku[go]V _ ...
Q003233,_ kamîš[in-a-bound-state]AV u[and]CNJ Teušpa[C...
Q003234,ša[that]REL ahu[brother]N rabû[great]AJ ahu[br...
...,...
Q008354,Edimgalkalama[1]TN ša[that]REL qerbu[centre]N ...
Q008356,ana[to]PRP Sutiti[1]DN bēltu[lady]N Aššur-bani...
Q008357,_ Aššur-bani-apli[Ashurbanipal-king-of-Assyria...
Q008358,ēkallu[palace]N Aššur-bani-apli[Ashurbanipal-k...


In [13]:
#===========================================================================
#SAVE FINAL TABLE
#You should now have a table that is ready for your digital project. 
#This will save as a .csv file to your output folder.
#===========================================================================
savefile =  'Final_Table.csv'#Choose the name of the file for your table.
with open('output/' + savefile, 'w', encoding="utf-8") as w:
    NeoAsslemmanoduplicates.to_csv(w, index=True) #make sure the beginning of this line (before .to_csv) matches your table name.