This notebook contains code in sections to:
1. scrape WordSmyth as a set of html webpages from the internet
2. load and parse text from the downloaded set of webpages into a plain text file
3. organise text file into a dictionary dataframe
4. merge or otherwise modify glosses in dictionary dataframe to take into account cross-references among glosses
5. tally the number of senses and meanings in the dataframe

NOTES: 
1. assumes file structure where script and scraped WordSmyth html files are in the same folder, with subfolder for html files
2. interim dataframe can be saved at various steps to allow for manual checking and modification before next step -- checks are recommended for first-pass due to idiosyncrasies in a small percentage of words/ glosses
3. there are sections which may do more than is needed -- can be skipped or substituted with modified input at next step
4. by default has a filter by existence of covariates (SUBTLwf, nLet, nSyll, nPhon, coltNOrth, coltNPhon, old20, pld20, posBigram, isHomograph, isHomophone) taken from http://blairarmstrong.net/tools/Union_Subtl_CMUPron/index.html which requires a separate "fullCov.csv" file which is included in the same folder

#### Set filepaths and parameters in the following cell:

In [None]:
#parameter setting for: 

##1. subfolder name for data output/ html downloads
outFolderName = "/Oct2019" # html files originally scraped (and downloaded) in Oct 2019

##2. whether to first conduct scrape from internet (onlineScrape = True) to download html files into zipped folder
##   or parse directly from downloaded html files in zipped folder (onlineScrape = False)
### online: set WordSmyth page range (by default from 1 to 61495, which covers all valid pages as of Oct 2019)
### offline: set folder to find downloaded html files -- by default same as outFolderName; 
###          set identifying tag for parsing files (from scrape to dataframe) -- by default day of parse
from datetime import date

onlineScrape = False

#page range is only read by the online scrape; it is defined unconditionally so that
#toggling onlineScrape never leaves a later cell looking up an undefined name
minpage = 1
maxpage = 61495

htmlFolderName = outFolderName

#parsetag is part of every output file name (including the online scrape's exception log),
#so it must exist whichever way onlineScrape is set
parsetag = str(date.today())

##3. whether to use full word list from the dictionary or only a partial/filtered list
### set function to filter, currently by column in dataframe (NOTE: dictionary merged with filter list in dataframe if toFilter)

### currently filtered by presence of a subset of covariate measures; other filtering list in .csv format can be substituted

toFilter = False

filterFileName = "/fullCov.csv"

#filetag distinguishes filtered from unfiltered runs in the output file names
filetag = "filtered" if toFilter == True else "allwords"

    
def filterFnc(parseDF, filterDFName, requiredCol='old20', basePath=None):
    """Merge the flagged dictionary with a filter table and keep only covered words.

    parseDF      -- parsed and flagged dictionary dataframe (must have a 'word' column)
    filterDFName -- file name of the filter .csv, appended to basePath as-is
                    (e.g. "/fullCov.csv"); must have a 'word' column and requiredCol
    requiredCol  -- column of the filter file that has to be non-null for a row
                    to be kept (default 'old20', the original hard-coded choice)
    basePath     -- folder the filter file lives in; defaults to the module-level
                    `folderpath` set in the file-path cell

    Returns a new dataframe (index reset) with the filter file's columns merged in
    and every row lacking a value in requiredCol removed.
    """
    if basePath is None:
        basePath = folderpath

    filterDF = pd.read_csv(basePath + filterDFName)

    #left merge keeps every dictionary row; rows without a match get NaN covariates
    merged = parseDF.merge(filterDF, how='left', on='word')

    hasCovariate = merged[requiredCol].notnull()

    return merged[hasCovariate].copy(deep=True).reset_index(drop=True)


In [None]:
#get file path for current folder (assumes wordsmyth files are saved in same folder as script, in the subfolder named by outFolderName)

import os
folderpath = os.path.abspath('')

#deprecated:
#folderpath = os.path.dirname(os.path.abspath(__file__))

print(folderpath)

#exist_ok=True creates the folder only when missing, without a separate
#exists-check that could race with another process creating it
os.makedirs(folderpath + outFolderName, exist_ok=True)

#the uncompressed html folder is only written to by the online scrape
if onlineScrape == True:
    os.makedirs(folderpath + outFolderName + "/wordsmyth_html", exist_ok=True)

In [None]:
#WordSmyth Scraper (and download) saves dictionary text (word, part-of-speech, definition) to a .txt file

import re
import multiprocessing
import collections
import csv
import os
import ssl
import sys
import time
import uuid
#import urllib2 #python 2
import urllib.request
import urllib.error
import wget
import requests
import zipfile
import shutil
#from importlib import reload
from datetime import date
from itertools import compress
from nltk.tokenize import RegexpTokenizer
#from nltk.stemmer import PorterStemmer
from bs4 import BeautifulSoup
#from multiprocessing import Pool
#python 2 only: force utf-8 as the default string encoding (python 3 already uses utf-8 by default)
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")

#function to parse html page (same regardless of whether online or offline)
##takes in html page and log file to write to; does not return variable
##NOTE: some formatting has changed in html post original scrape/html files from Oct 2019


##########get pages from website##########

#downloads every WordSmyth page id in [minpage, maxpage] into the wordsmyth_html
#subfolder, then zips that folder; skipped entirely when onlineScrape is False
if onlineScrape == True:
    #context manager closes the exception log even if a download raises
    with open(folderpath + outFolderName + "/ScrapeExceptions" + parsetag + ".txt", "w+") as logf:

        #maxpage is inclusive: the documented default range is "from 1 to 61495"
        for i in range(minpage, maxpage + 1):

            url = "https://www.wordsmyth.net/?level=3&ent=&rid=" + str(i) + "&f_div=lc_div_alpha"

            while True:
                try:
                    page = urllib.request.urlopen(url, timeout=30)
                    break
                except urllib.error.HTTPError as detail:
                    #the HTTP status lives in .code (.errno is not the status code)
                    if detail.code == 500:
                        #server-side error: wait and retry the same page
                        time.sleep(1)
                        continue
                    logf.write(str(i)+' error\n')
                    #NOTE(review): the original raised MyException, which is not defined
                    #anywhere in the visible notebook; RuntimeError keeps the abort-on-error
                    #behaviour without a NameError masking the real cause
                    raise RuntimeError("Word ID " + str(i) + " failed") from detail

            #save the raw response bytes; both the response and the file are closed afterwards
            with page, open(folderpath + outFolderName + "/wordsmyth_html/WS" + str(i) + ".html", "wb") as fid:
                shutil.copyfileobj(page, fid)

    #zip from uncompressed folder
    #archive names are kept relative to the output folder (wordsmyth_html/WS*.html) so that
    #extracting the zip into the output folder recreates the wordsmyth_html subfolder
    #that the parsing cell reads from
    with zipfile.ZipFile(folderpath + outFolderName + '/wordsmyth_html.zip', 'w') as zip_object:
        for folder_name, sub_folders, file_names in os.walk(folderpath + outFolderName + '/wordsmyth_html'):
            for filename in file_names:
                file_path = os.path.join(folder_name, filename)
                zip_object.write(file_path, os.path.relpath(file_path, folderpath + outFolderName))

In [None]:
##########get pages from saved html##########
#parses every downloaded WordSmyth page and writes one plain-text record per entry:
#  "#headword", then per part of speech "@pos" and "infn:[...]", then "defnN:gloss" lines

#extraction from zip folder
with zipfile.ZipFile(folderpath + outFolderName + "/wordsmyth_html.zip", 'r') as zip_ref:
    zip_ref.extractall(folderpath + outFolderName)

WSpages = [file for file in os.listdir(folderpath + outFolderName + "/wordsmyth_html/") if file.endswith('.html')]

#open a file for main scrape and a file to log exceptions
#(context managers close both files even when a page fails to parse)
with open(folderpath + outFolderName + "/WordSmythScrape" + parsetag + ".txt","w+", encoding="utf-8", errors="surrogateescape") as f:
    with open(folderpath + outFolderName + "/ScrapeExceptions" + parsetag + ".txt", "a+", encoding="utf-8", errors="surrogateescape") as logf:

        for fname in WSpages:
            print(fname)
            #read the page as bytes so BeautifulSoup works out the encoding itself instead of
            #decoding with the platform default; the handle is closed as soon as it is parsed
            with open(folderpath + outFolderName + "/wordsmyth_html/" + fname, "rb") as htmlfile:
                soup = BeautifulSoup(htmlfile, "html.parser")

############################parsing pages

            #skip if id has no content
            if len(soup.find_all('div', {'class':'notfound'})) != 0:
                logf.write(fname+' missing \n') #from file
                continue

            #NOTE: headword is in 'h1' (not 'h3') in scrapes after Oct 2019
            headwords = soup.find_all('h3', {'class':"headword"})
            if not headwords:
                #log and move on rather than aborting the whole parse on one odd page
                logf.write(fname+' no headword \n')
                continue
            word = headwords[-1].get_text()

            #collect part-of-speech labels ('pos=' prefixed) and inflection strings in page order
            inflections = []
            for td in soup.find_all('td', {"class":"title"}):
                if len(td.find_all('a', {'title':"Click to learn more about parts of speech"})) > 0 :
                    inflections.append('pos=' + td.next_sibling.next_sibling.getText())
                if len(td.find_all('a', {"title":"Click to learn more about inflections"})) > 0 :
                    inflections.append(td.next_sibling.next_sibling.getText())

            #an inflection string belongs to the part of speech immediately before it
            inflections_paired = [(current[4:], following)
                                  for current, following in zip(inflections, inflections[1:])
                                  if current[0:4] == 'pos=' and following[0:4] != 'pos=']

            defn = soup.find_all('tr', {'class':['postitle','definition']})

            g=0 #running definition number within this headword

            f.write("#" + word + '\n')

            for d in defn:
                #find_all always returns a list, so no None check is needed
                for b in d.find_all('dl'):
                    b.extract()
                if d['class'][0]=='postitle':
                    pos = d.get_text().split('\n')[2]
                    f.write('@'+pos+'\n')
                    f.write('infn:' + str([infl for inflpos, infl in inflections_paired if inflpos == pos]) + '\n')
                elif d['class'][0]=='definition':
                    g=g+1

                    f.write("defn"+str(g)+":")
                    #drop links and emphasised text so only the gloss itself is written
                    for l in d.find_all("a"):
                        l.extract()
                    for e in d.find_all("em"):
                        e.extract()
                    f.write(d.get_text().strip()+'\n')
                         

In [None]:
#get basic dictionary (as is) as dataframe from text

import re
import pandas as pd

def readfile(fname):
    """Return every line of the UTF-8 text file *fname* as a list (line endings kept)."""
    with open(fname, "r", encoding='utf-8') as handle:
        return list(handle)

#how text file from scrape (above) is written: lemmas start with #, pos start with @, 
##inflections start with infn:, definitions start with defn*:
#to populate dataframe, when # is encountered, lemma is set till next #; when @ is encountered, pos is set till next @, 
##one definition per line, inheriting current lemma and pos
#once dataframe is populated, separate number in lemma from string: 
##if there are repeated string entries in dictionary, leave it
##if there are no repeated string entries in dictionary, delete entry from dataframe (not proper word)


#THE FOLLOWING CAN BE SUBSTITUTED FOR OTHER SCRAPED FILES
#text = readfile('C:\\Users\\modi1\\Dropbox\\PhD\\vectors code\\dictionary\\Oct2019\\WordSmythScrape2019-10-29.txt')
import ast #only this cell needs it: parses the list literal written on each "infn:" line

text = readfile(folderpath + outFolderName + "/WordSmythScrape" + parsetag + ".txt")

#parser state, carried from line to line (placeholders apply until the first "#" line)
lemma = "*"
word = "*"
wordN = ""
pos = "^"
infln = ""
meanings = [] #one row per definition line, in the column order below
for line in text:
    if line.startswith("#"):
        #new headword: reset part of speech and inflections
        pos = "^"
        infln = ""
        word = line[1:].strip()
        #split e.g. "bank2" into lemma "bank" and homograph number "2";
        #a headword with no non-digit character is kept whole instead of raising
        numbered = re.match(r'^(.*[^0-9])([0-9]*)$', word)
        if numbered:
            lemma, wordN = numbered.groups()
        else:
            lemma, wordN = word, ""
    elif line.startswith("infn"):
        #the scrape wrote str(list_of_strings); parse it back as a literal rather than
        #slicing/stripping characters, which ate a leading or trailing "u" (e.g. "used" -> "sed")
        rawList = line[line.index(":")+1:].strip()
        try:
            forms = ast.literal_eval(rawList)
        except (ValueError, SyntaxError):
            forms = None
        if isinstance(forms, (list, tuple)):
            infln = ", ".join(str(form) for form in forms)
        else:
            #hand-edited line that is no longer a valid literal: drop brackets and quotes only
            infln = rawList.strip("[]").strip("'\"")
    elif line.startswith("@"):
        pos = line[1:]
    elif line.startswith("defn"):
        #"defn<N>:<gloss>" -- one output row per definition, inheriting current headword and pos
        defnN, _, defn = line[4:].partition(":")
        meanings.append([word, lemma, wordN, infln, pos.strip(), defn.strip(), defnN])


dic = pd.DataFrame(meanings, columns=['wordOG', 'word', 'meaningNo', 'infln', 'pos', 'gloss', 'glossNo'])
#output basic dictionary as csv
dic.to_csv(folderpath + outFolderName + '/dictionary' + parsetag + '.csv', index = False, encoding='utf-8')
#pickle file for python use (reading csv results in evaluation of protected words like "null")
dic.to_pickle(folderpath + outFolderName + '/dictionary' + parsetag + '.pkl') 

In [None]:
#take basic dictionary dataframe and flag for things to exclude/ merge/ check
##NEW: search for words with alternative spellings 
## -- check if they all lead back to the same glosses anyway or have different entries
##remove lemmas with more than one word (search for space)
##remove glosses where pos are abbreviations
##check if word is inflected form -- not counted if word same as its own inflections

#recommendation for manual check (some are not uniform indicators; remove anything that is not proper word):
## definitions that contain the string "abbr." 
## referents which are not found for manual check (printed as cell output)


#THE FOLLOWING CAN BE SUBSTITUTED FOR OTHER PICKLED DICTIONARY FILES
#dic = pd.read_pickle('C:\\Users\\modi1\\Dropbox\\PhD\\vectors code\\dictionary\\Oct2019\\dictionary.pkl')
dic = pd.read_pickle(folderpath + outFolderName + '/dictionary' + parsetag + '.pkl')

dic_flagged = dic.copy(deep=True).reset_index(drop=True)

dic_flagged["isPhrase"] = 0

dic_flagged["isAbbr"] = 0

dic_flagged["isInfln"] = 0

#create the reference columns up front so they exist even when nothing gets flagged;
#'ref' must be object dtype because it stores either a string or a list of (word, pos) tuples
dic_flagged["hasRef"] = 0.0
dic_flagged["ref"] = pd.Series(float('nan'), index=dic_flagged.index, dtype=object)

#for each gloss that is phrase/abbreviation, get word
#(a pos mentioning both counts as phrase only)
isPhrasePos = dic_flagged['pos'].str.contains('phrase', regex=False, na=False)
isAbbrPos = dic_flagged['pos'].str.contains('abbr', regex=False, na=False) & ~isPhrasePos
phraseList = dic_flagged.loc[isPhrasePos, 'word'].tolist()
abbrList = dic_flagged.loc[isAbbrPos, 'word'].tolist()

#for each word that has pos with phrase or abbreviation, flag in dataframe        
dic_flagged.loc[dic_flagged['word'].isin(phraseList),'isPhrase'] = 1
dic_flagged.loc[dic_flagged['word'].isin(abbrList),'isAbbr'] = 1


#get list of inflections        
#NOTE(review): this is the list of whole inflection strings, so only words equal to a complete
#'infln' entry are flagged below (a form inside "walked, walking, walks" is not); the earlier
#version split each entry on ", " -- confirm which is intended
inflnList = list(pd.unique(dic['infln']))

#for each word that is an inflection of another word, flag in dataframe
##note that some words are exactly the same as their corresponding inflection; unflag these
dic_flagged.loc[dic_flagged['word'].isin(inflnList),'isInfln'] = 1

isOwnInflection = [(not pd.isna(forms)) and (headword in forms.split(', '))
                   for headword, forms in zip(dic_flagged['word'], dic_flagged['infln'])]
dic_flagged.loc[isOwnInflection, 'isInfln'] = 0

#flag glosses that refers to other glosses, what gloss are they refering to? ALSO CHECK THIS MANUALLY
possibleRef=('variant of ', 'pl. of', 'plural of', 'past participle of', 'past tense of', 'present participle of')

#the (word, pos) pairs listing a form as an inflection depend only on the form itself,
#so look them up once per word instead of rescanning the dataframe for every row
inflnRefCache = {}
for i in dic_flagged.index:
    gloss = dic_flagged.at[i, 'gloss']
    if dic_flagged.at[i, 'isInfln'] == 1: 
        form = dic_flagged.at[i, 'word']
        if form not in inflnRefCache:
            #escape the word: headwords may contain regex metacharacters
            escaped = re.escape(form)
            pattern = ', ' + escaped + '$| ' + escaped + ',|^' + escaped + '$|^' + escaped + ','
            listsForm = dic_flagged['infln'].str.contains(pattern, na=False)
            inflnRefCache[form] = list(dic_flagged.loc[listsForm, ['word', 'pos']].drop_duplicates().itertuples(index=False, name=None))
        dic_flagged.at[i, 'hasRef'] = 1
        dic_flagged.at[i, 'ref'] = list(inflnRefCache[form])
    if gloss.startswith('see '): 
        #referent is everything after "see ", minus the final character
        dic_flagged.at[i, 'hasRef'] = 1
        dic_flagged.at[i, 'ref'] = gloss[4:-1]
    if any(s in gloss for s in possibleRef):
        #referent is everything after the last " of ", minus the final character;
        #glosses without a spaced " of " are left untouched instead of raising
        ofAt = gloss.rfind(' of ')
        if ofAt != -1:
            dic_flagged.at[i, 'hasRef'] = 1
            dic_flagged.at[i, 'ref'] = gloss[ofAt+4:-1]

dic_flagged['meaningNo']=dic_flagged['meaningNo'].fillna(value = 1)


#merge with filtering file (default is a dataframe of words with full set of covariates + wordnet senses)
if toFilter == True:
    dic_flagged = filterFnc(dic_flagged, filterFileName)


#output dictionary dataframe flagged for special cases and merged covariates
dic_flagged.to_csv(folderpath + outFolderName + '/dic_flagged_' + filetag + parsetag + '.csv', index = False, encoding='utf-8')
dic_flagged.to_pickle(folderpath + outFolderName + '/dic_flagged_' + filetag + parsetag + '.pkl') 
    

### Recommended to do manual checking of the dic_flagged output here for accuracy of filters

In [None]:
## Append/ replace any gloss with external referents as flagged above

import regex
import re
import numpy as np
import regex
import pandas as pd


#import MANUALLY CHECKED LIST (take only rows with covariates)
#dic_flagged = pd.read_csv("C:\\Users\\modi1\\Dropbox\\PhD\\vectors code\\dictionary\\Oct2019\\dicflagged_trimmed.csv") 


#OR use unfiltered variable created from previous section
#dic_flagged.merge(fullCov) #if using unfiltered variable created from previous section
dic_flagged = pd.read_pickle(folderpath + outFolderName + '/dic_flagged_' + filetag + parsetag + '.pkl')

#intiate new merged_dictionary (empty frame that fixes the output column order)
merged_dic = pd.DataFrame(columns=list(dic_flagged.columns) + ['mergedSense', 'mergedGlossNo'])

possibleRef=('variant of ', 'pl. of', 'plural of', 'past participle of', 'past tense of', 'present participle of')

#membership sets for the "referent not found" check
knownWordOG = set(dic_flagged['wordOG'])
knownWord = set(dic_flagged['word'])


def _refTuples(ref):
    """Return the (word, pos) tuples held in a 'ref' cell.

    A cell is either a real list of tuples (pickle input) or text (gloss-derived
    referents, or a list that went through csv). In text, the outermost parentheses
    delimit a tuple (parentheses may be nested) and repr quotes are removed.
    """
    if isinstance(ref, list):
        return [tuple(item) for item in ref if isinstance(item, (list, tuple))]
    return [tuple(part.strip().strip('\'"') for part in found.group(0)[1:-1].split(', '))
            for found in regex.finditer(r'\(([^()]|(?R))*\)', str(ref))]


def _referentSenses(referent):
    """Return the dic_flagged rows a referent points to.

    A string referent is matched on 'wordOG'. A (word, pos) tuple is matched on
    'wordOG' when the word ends in a digit (numbered homograph) and on 'word'
    otherwise, restricted to rows whose pos matches the tuple's pos as a regex.
    """
    if isinstance(referent, str):
        return dic_flagged.loc[dic_flagged['wordOG'] == referent]
    refword, refpos = referent
    if refpos == 'transitive verb':
        refpos = '(?<!in)transitive verb' #do not pick up "intransitive verb"
    keycol = 'wordOG' if refword[-1].isdigit() else 'word'
    return dic_flagged.loc[(dic_flagged[keycol] == refword) & (dic_flagged['pos'].str.contains(refpos, na=False))]


mergedPieces = [] #per-word frames, concatenated once after the loop
dropsense = False

#print to file cases where referent marked as existent but is not found -- needs manual check
with open(folderpath + outFolderName + "/checkreferent_" + filetag + parsetag + ".txt","w+", encoding="utf-8") as reflog:

    #for each word (not stripped for meaningNo), get all senses as separate df, in order of first appearance
    #ANY FURTHER FILTERING FUNCTION TO BE ADDED HERE
    for w, wordrows in dic_flagged.groupby('wordOG', sort=False):
        sensedf = wordrows.copy(deep=True).reset_index(drop=True)
        sensedf2 = sensedf.copy(deep=True) #to refer back to for creating sensedf_add (in case sensedf has no more rows after edit)

        ##word has no referent: mark mergedSense as 0 for all rows and go to next word
        if sum(sensedf['hasRef']) == 0:
            sensedf['mergedSense'] = 0
            sensedf['mergedGlossNo'] = sensedf['glossNo']
            mergedPieces.append(sensedf)
            continue

        ###word has referent: go through line by line to see what kind of reference it is
        reflist = []
        for i in sensedf.index.values:
            dropsense = False
            ref = sensedf.at[i, 'ref']
            ####ignore null reference (NaN)
            if isinstance(ref, float):
                continue
            ####extract all tuples as tuples
            reflist.extend(_refTuples(ref))
            ####check what is left, if any
            sense_i = re.sub(r'\([^>]+\)', '', str(ref).strip('[]'))
            if not any(ch.isalnum() for ch in sense_i):
                continue

            gloss = sensedf.at[i, 'gloss']
            #referent words named in the text; stripped so stray spaces cannot block a match
            tokens = [tok.strip() for tok in sense_i.split(', ') if any(ch.isalnum() for ch in tok)]

            ####if gloss refers to referent directly delete original gloss
            ####Note: for sense inflections, take only verb senses; for plurals, take only noun senses 
            if any(s in gloss for s in ('pl. of', 'plural of')):
                dropsense = True
                reflist.extend((tok, '(?<!pro)noun') for tok in tokens)

            if any(s in gloss for s in ('past tense of', 'past participle of', 'present participle of')):
                dropsense = True
                reflist.extend((tok, '(?<!ad)verb') for tok in tokens)

            if ('variant of' in gloss) or gloss.startswith('see '):
                dropsense = True
                reflist.extend(tokens)

            if dropsense == False:
                reflist.extend(tokens)
            else:
                sensedf.drop([i], inplace = True)

        sensedf['mergedSense'] = 0

        ###collect the senses of EVERY referent, starting afresh for each word
        ###(results of pd.concat must be kept: it returns a new frame)
        referredRows = []
        for referent in reflist:
            if isinstance(referent, tuple) and (len(referent) != 2 or not referent[0]):
                #parenthesised text that is not a (word, pos) pair cannot be resolved automatically
                reflog.write("referent not parsed: word = " + w + ", referent = " + str(referent) + "\n")
                continue
            refword = referent if isinstance(referent, str) else referent[0]
            if isinstance(referent, str) or refword[-1].isdigit():
                known = refword in knownWordOG
            else:
                known = refword in knownWord
            if not known:
                reflog.write("referent not found: word = " + w + ", referent = " + refword + "\n")
            referredRows.append(_referentSenses(referent))

        if referredRows:
            refsenses = pd.concat(referredRows, axis=0).reset_index(drop=True)
        else:
            refsenses = dic_flagged.iloc[0:0]

        # copy all columns for current word as scaffolding for referred senses
        sensedf_add = pd.DataFrame(np.repeat(sensedf2.iloc[[0]].values,len(refsenses),axis=0),columns=sensedf2.columns)
        sensedf_add['pos'] = refsenses['pos'].values
        sensedf_add['gloss'] = refsenses['gloss'].values
        sensedf_add['glossNo'] = refsenses['glossNo'].values
        sensedf_add['mergedSense'] = 1

        #append UNIQUE referent senses to word: drop duplicate rows
        sensedf = pd.concat([sensedf, sensedf_add], axis=0).drop_duplicates(subset = ['pos', 'gloss', 'glossNo']).reset_index()
        #glossNo is now a mixture of original glossNo (mergedSense == 0) and reference glossNo (mergedSense == 1)
        #set new column with new glossNos
        sensedf['mergedGlossNo'] = sensedf.index + 1

        mergedPieces.append(sensedf)

#append checked sense dfs to merged_dictionary in one go
merged_dic = pd.concat([merged_dic] + mergedPieces, axis=0)

##output dataframe with inflections/ derivations merged in
#the row index only repeats per-word positions, so it is dropped rather than saved as an extra column
merged_dic.reset_index(drop=True).to_pickle(folderpath + outFolderName + '/dicmerged_' + filetag + parsetag + '.pkl')


### Recommended to do manual check of referent print-out here for any additional substitutions to merged_dic

In [None]:
#do tally of NOS and NOM (NOTE: NOM only counted when NOM >1)

## THE FOLLOWING CAN BE SUBSTITUTED FOR OTHER DICTIONARY FILES
#dic_df = pd.read_pickle(folderpath + outFolderName + '/dicmerged1811.pkl')
dic_df = pd.read_pickle(folderpath + outFolderName + '/dicmerged_' + filetag + parsetag + '.pkl')

#one row per word: number of merged-in senses, number of glosses after merging, highest meaning number
tally_dic =  pd.pivot_table(data = dic_df, 
                            index = ['word'],
                            values=["mergedSense", "mergedGlossNo", "meaningNo"],
                            aggfunc={"mergedSense": "sum", "mergedGlossNo": len, "meaningNo": "max"},
                            #if using dictionary without merged senses:
                            #values=["glossNo", "meaningNo"],
                            #aggfunc={"glossNo":max, "meaningNo": max}
                           )
tally_dic = tally_dic.rename(columns={"meaningNo": "NOM",
                          "mergedSense": "mergedSenseCount", 
                          "mergedGlossNo": "NOS_merged"})
#tally_dic = tally_dic.rename(columns={"meaningNo": "NOM",
#                          "glossNo": "NOS_original"})

#turn the 'word' index back into an ordinary column so it can be used as the merge key
tally_dic = tally_dic.reset_index()

#output tallied file -- also includes all the covariate, merging and flagging information
#merge onto the dataframe the tallies were computed from, so a substituted dictionary
#file is not combined with a leftover in-memory merged_dic
dic_df.merge(tally_dic, on='word').to_csv(folderpath + outFolderName + '/dictally_' + filetag + parsetag + '.csv', 
                                          index = False, encoding='utf-8')