First: Text Parsing!
====================
This is where the information that we want is extracted from the text we have.

In [16]:
import re, requests, json, datetime

In [17]:
import os
def Ensure_dir(d):
    """Make sure directory 'd' exists, creating it (and any parents) if needed.

    d: path of the directory to ensure.

    Returns 0, as before, so existing callers that display or ignore the
    value keep working.

    Raises FileExistsError if 'd' exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory between check and creation.
    os.makedirs(d, exist_ok=True)
    return 0

In [18]:
def open_texts(file_start_str,first,last):
    """Read a numbered run of .htm files into a dictionary.

    Every file is expected to be named '<file_start_str><page number>.htm'.
    Pages 'first' through 'last' (inclusive) are read as UTF-8.

    Returns a dict mapping each page number to the full text of its file.
    """
    pages = {}
    page_numbers = range(first, last + 1)
    for page_no in page_numbers:
        path = "{}{}.htm".format(file_start_str, page_no)
        with open(path, encoding="utf-8") as page_file:
            contents = page_file.read()
        pages[page_no] = contents
    return pages

Creating a dictionary called `market_text` and filling it out with key/value pairs corresponding to page numbers and page texts.

In [19]:
# Load pages 10-99, then add pages 102-133; keys are page numbers, values are page HTML.
market_text = open_texts("html/slovar",10,99)
market_text.update(open_texts("html/slovar",102,133)) # skipping pages 100-101, no text present

Cleanup cell. Currently fixing some common capitalization errors, and replacing nbsp's with normal spaces.

In [20]:
# Substitutions applied to every page, in this order.
cleanup_rules = [
    ("&nbsp;"," "),                      # nbsp to normal space
    # Capitalization fixes
    (r"([> ])ВЪ",r"\1Въ"),               # "ВЪ" after '>' or a space -> "Въ"
    (r"([а-яіѣѳ]+)(Ѣ)",r"\1ѣ"),          # capital yat after lowercase letters
    (r"(Ѣ)([а-яіѣѳ]+)",r"ѣ\2"),          # capital yat before lowercase letters
    (r"([а-яіѣѳ]+)(Ъ)",r"\1ъ"),          # capital hard sign after lowercase letters
]
for page_no, page_html in market_text.items():
    for pattern, fixed in cleanup_rules:
        page_html = re.sub(pattern, fixed, page_html)
    # Only values of existing keys are reassigned, so iterating items() is safe.
    market_text[page_no] = page_html

In [21]:
def extract_entries(text_dict,entrymatch,tailmatch,entry_name='name_old_orth',entry_text='description_old_orth'):
    """
    Extracts entries from text dict. Takes dict with format {pageNumber: pageText...}
    and returns a dict keyed by a running entry counter (starting at 1) whose values
    hold the entry name, the entry text and the page number the entry was found on.

    text_dict: dict with format {pageNumber: text...} created with previous function open_texts.
    Pages are processed in sorted key order.

    entrymatch: regex string to match entry name and text. It must have two capture
    groups: the FIRST (outer) group is the entry text, and the SECOND (nested) group
    is the entry name, e.g. r">(([А-ЯІѢѲ .-]+)[<, ].*)</p>".

    tailmatch: regex string to match the tail of the previous page's last entry at the
    beginning of a page. Its first match is appended to that entry's text unless the
    match itself looks like a new entry (i.e. entrymatch matches inside it).

    entry_name: string to be used as key for entry name.

    entry_text: string to be used as key for entry text.
    """
    out_dict = {}
    last_entry = None  # counter key of the most recent entry, None until one exists
    counter = 1
    for p in sorted(text_dict):
        page_text = text_dict[p]
        # finds trailing descriptions at beginning of page, appends to last entry on previous page.
        trail = re.findall(tailmatch, page_text)
        # The last_entry guard skips a tail that appears before any entry has been
        # seen (e.g. on the first page); previously this raised KeyError.
        if trail and not re.findall(entrymatch, trail[0]) and last_entry in out_dict:
            out_dict[last_entry][entry_text] += trail[0]
        # finds all place names on page, captures descriptions as well.
        for v in re.findall(entrymatch, page_text):
            placename = v[1].strip()
            out_dict[counter] = {entry_name: placename, entry_text: v[0], 'page': p}
            last_entry = counter
            counter += 1
    return out_dict

In [22]:
# Smoke test of the entry regex on a single entry. The sample string has no closing
# "</p>", which the pattern requires, so findall returns an empty list here.
test = ">ОМЕЛЬНИКЪ, </span><span class=\"font0\" style=\"font-style:italic;\">мѣстечко</span><span class=\"font0\"> Кіевскаго Намѣстничества въ Миргородскомъ уѣздѣ. Въ ономъ бываетъ въ году двѣ ярмарки : одна Іюля 20, а другая Октябру 14; торгуютъ на оныхъ пріѣзжіе изъ уѣзду жители разными мѣлочными товарами, тамъ родящимися и дѣлающимися, какъ - то хлѣбомъ , горячимъ виномъ, разиымъ скотомъ, овчинами , кожами и проч. иногда пріѣзжаютъ и Великороссійскіе купцы съ краснымъ товаромъ."
re.findall(r">(([А-ЯІѢѲ .-]+)[<, ].*)</p>",test)

[]

In [23]:
# Entry regex: outer group = whole entry text, inner group = run of capital letters (the name).
# Tail regex: first paragraph after <body>, treated as a continuation of the previous page's last entry.
market_towns = extract_entries(market_text,r">(([А-ЯІѢѲ .-]+)[<, ].*)</p>",r"<body>\s+(<p><span class=\"font0\">.*</span></p>)") # creating dict market_towns to hold individual place info.
market_towns.pop(1,None) #Known extraneous hit at beginning of doc, removed.
# As the last expression of the cell, the popped entry is also displayed as the cell output.
market_towns.pop(2,None) #Known extraneous hit at beginning of doc, removed.

{'description_old_orth': 'ЯРМАРОКЪ и ТОРГОВЪ.</span>',
 'name_old_orth': 'ЯРМАРОКЪ',
 'page': 10}

In this cell, we're looking for town types, which will be in italics. We're also looking for administrative divisions based on their titles, both Namiestnichestva and uezd.

In [24]:
from lxml import html
def strip_html(text):
    """Parse 'text' as HTML with lxml and return only its text content (markup removed)."""
    parsed = html.fromstring(text)
    return parsed.text_content()

In [25]:
def extract_value(entrydict,val_name,val_regex,entry_text_key='description_old_orth',strip_val=" ",strip_html=False):
    """
    Extracts a value from the text stored in entrydict[entry_text_key] and stores it
    under entrydict[val_name]. entrydict is modified in place and also returned.

    entrydict: dict with some entry that has text from which individual values can be extracted.

    val_name: name of the value to be extracted. This will be used as the key to modify entrydict.

    val_regex: a regular expression string or a list of regular expression strings. These will be
    used to extract the desired value from the text in entrydict. If passing a list, the first
    regular expression to find a match will be used to create the value k:v pair. If no regular
    expression matches, entrydict is returned without modification. Each expression should have
    at most one capture group, because the first re.findall result is treated as a string.

    entry_text_key: key which corresponds to the text with values to be extracted in entrydict

    strip_val: characters passed to str.strip() on the extracted value. Defaults to " ", which
    strips spaces only (unlike a bare str.strip(), which strips all whitespace).

    strip_html: whether html tags should be removed before matching. If set to true, html tags
    will be stripped from the text. Defaults to False.
    """
    entry_text = entrydict[entry_text_key]
    if strip_html:
        # The flag shadows the module-level strip_html() inside this function,
        # so lxml is called directly here.
        entry_text = html.fromstring(entry_text).text_content()
    if isinstance(val_regex, str):
        val_regex = [val_regex]
    for pattern in val_regex:
        # Run each pattern once and reuse the result.
        hits = re.findall(pattern, entry_text)
        if hits:
            entrydict[val_name] = hits[0].strip(strip_val)
            break
    return entrydict

In [26]:
# Province (admin1): text ending in a spelling of "Намѣстничество" (optionally followed by
# " сего имени"), otherwise one or two words followed by "Губерніи".
prov_regex = [r">?([А-ЯІѢѲа-яѣі, -]*На(?:мѣ){0,1}стнич[аел]ств[аѣ](?: сего имени){0,1})",r">[ ]*([А-ЯІѢѲа-яѣі,.]+(?: [А-ЯІѢѲа-яѣі,]+){0,1}\s+Губерніи)"]
# District (admin2): the town named in "въ уѣздѣ города X", otherwise a word followed by "уѣзду"/"уѣздѣ".
uezd_regex = [r"[Вв][Ъъ] уѣзд[уѣ] города ([А-ЯІѢѲа-яѣі-]+)",r"([А-ЯІѢѲа-яѣі]+\s+уѣзд[уѣ])"]
# Feature type: contents of an italic span. The character class previously began with a
# stray "[", which made it accept a literal "[" and triggered a "possible nested set" warning.
type_regex = r"italic;\">([А-ЯІѢѲа-яіѣѳ, ]+)</span>"
# Alternate name: ", NAME тожь", "или NAME" or "правильнѣе NAME".
alt_name_regex = [r",([А-ЯІѢѲ -]+) тожь",r"или ([А-ЯІѢѲ]+)",r"правильнѣе ([А-ЯІѢѲ]+)"]

In [27]:
# Pull feature type, alternate name, province and district out of every entry's text.
for t in market_towns:
    entry = market_towns[t]
    # extract_value modifies the dict in place, so no .update() with its return value is needed.
    extract_value(entry,'featuretype_old_orth',type_regex,strip_val=", ")
    extract_value(entry,'alt_name_old_orth',alt_name_regex,strip_html=True)
    extract_value(entry,'admin1_old_orth',prov_regex)
    extract_value(entry,'admin2_old_orth',uezd_regex,strip_html=True)
    # Entries named "ЧАСОВНЯ ..." (chapel) get their feature type from the name itself.
    if "ЧАСОВНЯ" in entry['name_old_orth']:
        # Spelling corrected from 'чацовня'.
        # NOTE(review): confirm nothing downstream matches on the old misspelling.
        entry['featuretype_old_orth'] = 'часовня'

Validation
==========
The next two code blocks find entries that may need to be separated in the OCR text. They do this by looking for sequences of capitalized letters and comparing them to the names found previously. If there's a sequence of capitalized letters that isn't a name or an alternate name, it will be printed below the second code block. Look up the place name in the pandas dataframe a few cells down to see whether two entries have been unintentionally merged. If so, check the OCR text and fix it, then re-run to make sure the fix worked. Sometimes there will be capitalized words that are not part of another entry, so watch out for those.  
  
This is a very important process to get right. If entries are merged, this can mess up the indexing of text entries, and that can sever the link between entries in the text and entries that have been geocoded, making it much harder to update geocoded entries with changes from the OCR text.

In [28]:
# Flag entries whose description contains runs of capital letters that are not
# explained by the entry's own name or alternate name (possible merged entries).
for t in market_towns:
    entry = market_towns[t]
    # Drop any flag left over from an earlier run of this cell so re-running is idempotent.
    entry.pop('extra_caps', None)
    caps = re.findall(r"([А-ЯІѢѲ]+)[<, .]",entry['description_old_orth'])
    caps = [c.strip() for c in caps]
    if len(caps)==0:
        print("This is confusing, entry has no strings of capital letters: {}".format(entry['description_old_orth']))
    elif len(caps)>1:
        # Remove the first occurrence of the name, then of the alternate name.
        if entry['name_old_orth'] in caps:
            caps.remove(entry['name_old_orth'])
        if 'alt_name_old_orth' in entry and entry['alt_name_old_orth'] in caps:
            caps.remove(entry['alt_name_old_orth'])
        # Discard single capital letters. This used to pop from the list while
        # iterating over it, which skipped the element after each removal.
        caps = [c for c in caps if len(c)!=1]
        # Multi-word names show up as several runs; accept them when they rebuild the name.
        if " ".join(caps)==entry['name_old_orth']:
            caps = []
        if " - ".join(caps)==entry['name_old_orth']:
            caps = []
        if len(caps)!=0:
            entry['extra_caps'] = caps

In [29]:
# Report every flagged entry: the unexplained capitalized runs, the place name and,
# when present, the alternate name, followed by a blank line.
for entry in market_towns.values():
    if 'extra_caps' not in entry:
        continue
    report = [
        "{}".format(entry['extra_caps']),
        "Place name: {}".format(entry['name_old_orth']),
    ]
    if 'alt_name_old_orth' in entry:
        report.append("Alt name: {}".format(entry['alt_name_old_orth']))
    report.append("")
    print("\n".join(report))

['ВИЛЮЙСКОЕ', 'ВЕРХНЕЕ', 'СРЕДНЕЕ']
Place name: ВИЛЮЙСКОЕ ВЕРХНЕЕ

['КОЛПИНѢ']
Place name: ПЯТНИЦКОЕ

['БЕРЕНДѢЕВѢ']
Place name: ПЯТНИЦКОЕ

['ВЫЧЕГОДСКЪ']
Place name: СОЛЬ-ВЫЧЕГОДСКЪ

['ХОЛМУ']
Place name: СПАСЪ

['ЯМЩИКАХЪ']
Place name: СПАСЪ

['ХОЛМЪ', 'СПАСА']
Place name: ХОЛМЪ У СПАСА

['ЧАСОВНЯ', 'СВ', 'ДУХА']
Place name: ЧАСОВНЯ СВ. ДУХА



In [30]:
# Compare freshly parsed entries with the previously saved dataset and timestamp
# every entry whose shared fields differ.
with open("output/enhanced_dataset.json","r",encoding="utf-8") as fp:
    original_dataset = json.load(fp)
for k,v in market_towns.items():
    # JSON object keys are always strings while market_towns is keyed by int, so the
    # lookup must use str(k); the old int lookup never matched.
    # NOTE(review): assumes the JSON file is an object keyed by entry number - confirm.
    key = str(k)
    if key not in original_dataset:
        continue
    previous = original_dataset[key]
    # Iterate over items(); iterating the dict directly yields keys only and broke the unpacking.
    changed = any(field in previous and value != previous[field] for field, value in v.items())
    if changed:
        v['query_datetime'] = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

Pandas!
=======
Everything has now been put into a pandas dataframe, so no more dicts!
Still doing some text parsing and cleanup, but based on the data that we've collected, not the text.

In [31]:
import pandas as pd
import numpy as np

In [32]:
# creation of pandas dataframe based on previously created dict.
# orient="index" makes each market_towns key (the entry counter) a row label.
market_df = pd.DataFrame.from_dict(market_towns,orient="index")
# Keep only these extracted fields, in this column order; any other keys are dropped here.
market_df = market_df[['name_old_orth','alt_name_old_orth','featuretype_old_orth','admin1_old_orth','admin2_old_orth','description_old_orth','page']]
market_df.head()

Unnamed: 0,name_old_orth,alt_name_old_orth,featuretype_old_orth,admin1_old_orth,admin2_old_orth,description_old_orth,page
3,АВДУЛОВО,ИВАНОВСКОЕ,село,Рязанскаго Намѣстничества,Донковскомъ уѣздѣ,"АВДУЛОВО, ИВАНОВСКОЕ тожь, </span><span class=...",10
4,АВЖЕНСКАЯ,,волость,Вологодскаго Намѣстничества,и уѣзду,"АВЖЕНСКАЯ , </span><span class=""font0"" style=""...",10
5,АВРАМІЕВЪ,,монастырь,Костромскаго Намѣстничества,,"АВРАМІЕВЪ</span><span class=""font0"" style=""fon...",10
6,АДОВАТОВО,,село,Нижегородскаго Намѣстничества,Арзамаскомъ уѣздѣ,"АДОВАТОВО, </span><span class=""font0"" style=""f...",10
7,АКСЕЛЬ,,село,Тамбовскаго Намѣстничества,Темниковскомъ уѣздѣ,"АКСЕЛЬ, </span><span class=""font0"" style=""font...",10


In [33]:
# Spot check: display the row(s) for a single place name.
market_df[market_df.name_old_orth=='ВОЛЬНАЯ']

Unnamed: 0,name_old_orth,alt_name_old_orth,featuretype_old_orth,admin1_old_orth,admin2_old_orth,description_old_orth,page
122,ВОЛЬНАЯ,,слобода,Харьковскаго Намѣстничества,Бѣлогородскомъ уѣздѣ,"ВОЛЬНАЯ, </span><span class=""font0"" style=""fon...",29


In [34]:
#pd.options.display.max_colwidth = 2000 #sets column width to 2000, should display entire description in df display when run.

In [35]:
#These are false positives for uyezdi, they mention counties elsewhere in the description
# With regex=True the matched text is substituted by "not_in_text".
market_df.admin2_old_orth = market_df.admin2_old_orth.replace(r'[вВ]ъ уѣздѣ',"not_in_text",regex=True)
market_df.admin2_old_orth = market_df.admin2_old_orth.replace(r'изъ уѣзду',"not_in_text",regex=True)

Adding URLs to link to Hathi Trust

In [36]:
# Build a per-row HathiTrust page URL; the viewer's seq parameter is set to page - 1.
market_df['SOURCE'] = market_df.page.apply(lambda x: "http://babel.hathitrust.org/cgi/pt?id=uc1.b4583201;view=1up;seq={}".format(x-1))

Some admin cleanup
==================
Here's a bit more housekeeping, replacing parts of administrative unit names that don't carry meaning, dealing with variant spellings, and getting rid of html in text descriptions

In [37]:
# Replacing parts of names that don't carry meaning
# Each pattern is removed from admin1_old_orth, in this order (regex substitution).
admin1_noise_patterns = [
    'Графа Воронцова ',        #a count of an area
    'Графа Шереметева, ',      #a count of an area
    'Графа Гендрикова въ ',    #a count of an area
    'Неплюевыхъ, ',            #a count of an area
    'разныхъ Помѣщиковъ ',     #different landlords
    'разныхъ Господъ ',        #various gentlemen
    'Меньшикова въ ',
    'Неплюе-выхъ, ',
    'уральскихъ Козаковъ въ ', #of the Ural Cossacks
    'урочищѣ Собора Николая Чудотворца Гостунскаго, ', #In the tract of the Cathedral of St. Nicholas Gostunskii
    ' же',
    r'^[Вв]ъ ',
]
for noise in admin1_noise_patterns:
    market_df.admin1_old_orth = market_df.admin1_old_orth.replace(noise,'',regex=True)

In [38]:
#market_df.ix[market_df.admin1_old_orth=='Неплюевыхъ, Черниговскаго Намѣстничества']

In [39]:
#market_df.admin1_old_orth.unique()
#Use this output to spot check Namiestnichestva for errors

In [40]:
#market_df.admin2_old_orth.unique()
#Use this output to spot check uezd for errors

In [41]:
# Replacing variant spellings of Namiestnichestva and uyezd with a single spelling.
# NOTE(review): this pattern allows only "е"/"л" before "ств", while prov_regex also allows "а" - confirm.
market_df.admin1_old_orth = market_df.admin1_old_orth.replace('На(?:мѣ){0,1}стнич[ел]ств[аѣ]','Намѣстничества',regex=True)
# NOTE(review): the uyezd spelling is normalized in admin1_old_orth, not admin2_old_orth;
# this looks like a copy/paste slip - confirm which column was intended.
market_df.admin1_old_orth = market_df.admin1_old_orth.replace('уѣзд[уѣ]','уѣздѣ',regex=True)

When is a name not a name?
==========================
When it's actually a reference to something else in the text!  
These next few cells replace phrases that mean "the same thing as..." and replace them with what they're talking about.  
Anything that uses the `replace_sameas` function is replacing "horizontally", so when a city is also a seat of an administrative unit, or a town is in an uyezd that is also a Namiestnichestva.  
Anything that has a `replacer` is replacing "vertically", when the text is referring to a previous entry.

In [42]:
def replace_sameas(df, original, replacement, match_list, kind="Намѣстничества"):
    """
    df: dataframe to be modified
    original: column name in pandas data frame
    replacement: column name in same pandas data frame
    match_list: list of strings which, in original, indicate that they should be replaced by the corresponding values in replacement
    kind: place type appended (after a space) to the replacement value

    For each string in match_list, rows whose value in original equals that string get the
    value of replacement from the same row, converted to title case, followed by kind.
    Place types are added after place names for consistent further processing. Type defaults to Намѣстничества.
    This is an in place transformation in the dataframe. Returns 0.
    """
    for match in match_list:
        cond = df[original] == match
        if not cond.any():
            continue
        # Write through .loc: chained indexing (df[col][cond] = ...) assigns to a
        # temporary and is not guaranteed to modify df.
        df.loc[cond, original] = df.loc[cond, replacement].str.title() + " {}".format(kind)
    return 0

In [43]:
# Rows that say "of the same Namiestnichestvo and uyezd": copy both units from the previous row.
# The uyezd test must look at admin2_old_orth; the old condition compared admin1_old_orth
# against three different strings at once and therefore never selected any row.
same_prov = market_df.admin1_old_orth == "того Намѣстничества"
same_uezd = market_df.admin2_old_orth.isin(["и уѣзду","и уѣздѣ"])
# .loc replaces .ix, which no longer exists in current pandas.
replacer = market_df.loc[same_prov & same_uezd].index.tolist()
for x in replacer:
    # Assumes the row labelled x-1 exists (consecutive integer index).
    market_df.loc[x,'admin2_old_orth'] = market_df.loc[x-1,'admin2_old_orth']
    market_df.loc[x,'admin1_old_orth'] = market_df.loc[x-1,'admin1_old_orth']

In [44]:
# Rows that say "of the same Namiestnichestvo": copy admin1 from the previous row.
# .loc replaces .ix, which no longer exists in current pandas.
replacer = market_df.loc[market_df.admin1_old_orth == "того Намѣстничества"].index.tolist()
for x in replacer:
    # Assumes the row labelled x-1 exists (consecutive integer index).
    market_df.loc[x,'admin1_old_orth'] = market_df.loc[x-1,'admin1_old_orth']

In [45]:
# Rows that say "of the same Guberniya": copy admin1 from the previous row.
# .loc replaces .ix, which no longer exists in current pandas.
replacer = market_df.loc[market_df.admin1_old_orth == "той Губерніи"].index.tolist()
for x in replacer:
    # Assumes the row labelled x-1 exists (consecutive integer index).
    market_df.loc[x,'admin1_old_orth'] = market_df.loc[x-1,'admin1_old_orth']

In [46]:
# Replacing "same Namiestnichestva" statements with appropriate Namiestnichestva name
# i.e. admin1 becomes the entry's own name (title-cased) followed by "Намѣстничества".
match_list = ["Намѣстничества сего имени"]
replace_sameas(market_df,'admin1_old_orth','name_old_orth',match_list)

0

In [47]:
# Replacing "and uyezd" statements with a uyezd name derived from the row's Namiestnichestva name
match_list = ["и уѣзду","и уѣздѣ"]
replace_sameas(market_df,'admin2_old_orth','admin1_old_orth',match_list,kind="уѣздѣ")
# The copied admin1 value still contains the word for Namiestnichestvo; remove it from admin2.
market_df.admin2_old_orth = market_df.admin2_old_orth.replace(r' На(?:мѣ){0,1}стнич[ел]ств[аѣ]','',regex=True)

Stripping html from text, preserving html text in case it's needed

In [48]:
from lxml import html
# NOTE(review): this repeats the strip_html definition from earlier in the notebook;
# the later definition silently replaces the earlier one.
def strip_html(text):
    """
    Parses 'text' as HTML with lxml and returns only its text content,
    i.e. the string with the HTML markup removed.
    """
    return html.fromstring(text).text_content()

In [49]:
# Keep a copy of the description with its HTML markup before the markup is stripped.
market_df['html_text'] = market_df.description_old_orth

In [50]:
# Replace each description by its plain-text content.
market_df.description_old_orth = market_df.description_old_orth.apply(strip_html)

Orthography Modernization
=========================

In [51]:
import re,json
from lxml import html

# lookup_dict is consulted by ortho_rules: a word found as a key is replaced by its value
# instead of going through the general spelling rules.
with open("resources/special_cases.json",'r',encoding='utf-8') as fp:
    lookup_dict = json.load(fp)

def _match_case(template, replacement):
    """Upper-case each character of 'replacement' whose counterpart in 'template' is upper-case."""
    return ''.join(new.upper() if old.isupper() else new
                   for old, new in zip(template, replacement))


# (old ending, modern ending) pairs; each pair has equal length so case can be carried over.
_ENDING_RULES = [("аго$","ого"),("яго$","его"),("ыя$","ые"),("iя$","ие")]

# Prefix ending in "з" directly before a voiceless consonant; the "з" becomes "с".
_PREFIX_Z = re.compile(r"^([че]{0,2}[ч]?[вбнр]?[веоаи])(з)([ПФКТШСЧпфктшсч])", re.IGNORECASE)


def ortho_rules(text,lookup_dict):
    """
    Modernizes the orthography of a text string.

    text: string to convert. Anything that is not a string (e.g. NaN) yields None.
    lookup_dict: dict of special cases; a word found as a key is replaced by its value
    and skips the general rules.

    Removes any lingering html tags, splits the string on single spaces, converts each
    word, and returns the re-assembled text. One trailing punctuation mark (, . ; :) per
    word is preserved; other leading/trailing marks from that set are dropped.
    """
    if not isinstance(text, str):
        return None
    text = html.fromstring(text).text_content()
    out_list = []
    for word in text.split(' '):
        # Free-standing punctuation is passed through untouched.
        if word in [',','.',';',':']:
            out_list.append(word)
            continue
        trailing = re.findall(r"([,.;:])$",word)
        punct = trailing[0] if trailing else ""
        word = word.strip(',.;:')
        if word in lookup_dict:
            word = lookup_dict[word]
        else:
            word = re.sub("[Ъъ]$",'',word)
            # The flag must be passed by keyword: the fourth positional argument of
            # re.sub is 'count', so re.IGNORECASE was previously read as count=2 and
            # matching stayed case-sensitive. Replacements copy the case of the text
            # they replace so that all-caps names stay all-caps.
            word = _PREFIX_Z.sub(lambda m: m.group(1) + _match_case(m.group(2), "с") + m.group(3), word)
            for old_ending, new_ending in _ENDING_RULES:
                word = re.sub(old_ending,
                              lambda m, new_ending=new_ending: _match_case(m.group(0), new_ending),
                              word, flags=re.IGNORECASE)
            word = word.replace('i','и') #english
            word = word.replace('I','И') #english
            word = word.replace('і','и') #cyrillic
            word = word.replace('І','И') #cyrillic
            word = word.replace('Ѣ','Е')
            word = word.replace('ѣ','е')
            word = word.replace('Ѳ','Ф')
            word = word.replace('ѳ','ф')
        # Re-attach punctuation for looked-up words too; it used to be dropped for them.
        out_list.append(word + punct)
    return ' '.join(out_list)

In [52]:
# Example: modernize one full entry; the converted string is displayed as the cell output.
text = "ВЕРХОВАЖСКОЙ, посадъ Вологодскаго Намѣстничества. Здѣсь бываетъ ярмарка около 15 числа Марша мѣсяца потри дни. Останавливаются здѣсь купцы съ товарами ѣдущіе съ Благовѣщенской Важенской ярмарки; также съѣзжается туда и крестьянство съ своими для продажи пріуготовленіями."
ortho_rules(text,lookup_dict)

'ВЕРХОВАЖСКОЙ, посад Вологодского Наместничества. Здесь бывает ярмарка около 15 числа Марша месяца потри дни. Останавливаются здесь купцы с товарами едущие с Благовещенской Важенской ярмарки; также съезжается туда и крестьянство с своими для продажи приуготовлениями.'

In [53]:
# Modern-orthography version of the place name; lookup_dict is passed through as the second argument.
market_df['name_new_orth'] = market_df.name_old_orth.apply(ortho_rules,args=[lookup_dict])

In [54]:
# Display the old-orthography names for a visual check.
market_df.name_old_orth

3                 АВДУЛОВО
4                АВЖЕНСКАЯ
5                АВРАМІЕВЪ
6                АДОВАТОВО
7                   АКСЕЛЬ
8                  АЛАТЫРЬ
9     АЛЕКСАНДРОВА ПУСТЫНЯ
10         АЛЕКСАНДРОВСКОЕ
11                АЛЕКСИНЪ
12              АЛЕКСѢЕВКА
13            АЛЕКСѢЕВСКАЯ
14                  АЛЕШНЯ
15                  АНАЕВО
16               АНДРЕЕВКА
17               АНТОНОВКА
...
790        ЮРЬЕВЪ ПОЛЬСКІЙ
791    ЮРЬЕВЕЦЪ ПОВОЛГСКІЙ
792               ЯБЛУНОВЪ
793                ЯГОТИНЪ
794                 ЯДРИНЪ
795                ЯЗЫКОВО
796                ЯКУТСКЪ
797                 ЯМПОЛЬ
798                 ЯМПОЛЬ
799                 ЯМСКАЯ
800                ЯРАНСКЪ
801                ЯРЕНСКЪ
802                 ЯРЕСКИ
803              ЯРОСЛАВЛЬ
804                    ЯШЪ
Name: name_old_orth, Length: 802, dtype: object

In [55]:
# Modern-orthography versions of the alternate name and both administrative units.
# ortho_rules returns None for non-string values, so missing values stay missing.
market_df['alt_name_new_orth'] = market_df.alt_name_old_orth.apply(ortho_rules,args=[lookup_dict])
market_df['admin1_new_orth'] = market_df.admin1_old_orth.apply(ortho_rules,args=[lookup_dict])
market_df['admin2_new_orth'] = market_df.admin2_old_orth.apply(ortho_rules,args=[lookup_dict])
#market_df.head()

Lemmatization
=============
Here's where we start lemmatizing some of the text. `ru_stemmer` below uses the nltk snowball stemmer to generalize terms.

In [56]:
import nltk

In [57]:
# Russian Snowball stemmer from nltk; its .stem method is applied to the admin names below.
ru_stemmer = nltk.stem.SnowballStemmer('russian')

In [58]:
# Stem the administrative names. Missing values are left out of the stemming and come
# back as NaN when the result is aligned with the frame.
# admin1: drop the final word (the unit type) before stemming.
market_df['admin1_stem'] = (
    market_df.admin1_new_orth
    .dropna()
    .replace(' [А-Яа-я]+$','',regex=True)
    .apply(ru_stemmer.stem)
)
# admin2: drop a trailing " уезде" before stemming.
market_df['admin2_stem'] = (
    market_df.admin2_new_orth
    .dropna()
    .replace(' уезде$','',regex=True)
    .apply(ru_stemmer.stem)
)

In [59]:
#standardizing admin1 names from stems
# (stem as produced, standard stem) pairs, applied in this order. Each replacement
# matches whole cell values only (no regex).
admin1_stem_fixes = [
    ('владимерк','владимерск'),
    ('володимерск','владимерск'),
    ('воронеж','воронежск'),
    ('выборгск','выборг'),
    ('костромск','кострома'),
    ('костром','кострома'),
    ('гороховск','гороховецск'),
    ('новогородского - северск','новогород - северск'),
    ('новогородск','новгородск'),
    ('олонец','олонецк'),
    ('пенз','пензенск'),
    ('перм','пермск'),
    ('псков','псковск'),
    ('риг','рижск'),
    ('рязан','рязанск'),
    ('сарат','саратовск'),
    ('тамб','тамбовск'),
    ('твер','тверск'),
    ('тул','тульск'),
    ('тульскаг','тульск'),
    ('черниг','черниговск'),
    ('ярославл','ярославск'),
    ('харьк','харьковск'),
    ('с. петербургск','с петербургск'),
    ('симбирск','синбирск'),
    ('новогородскаго-северск','новогород - северск'),
]
for variant_stem, standard_stem in admin1_stem_fixes:
    market_df.admin1_stem = market_df.admin1_stem.replace(variant_stem,standard_stem)

In [60]:
#standardizing admin2 names from stems
# Each replacement matches whole cell values only (no regex).
market_df.admin2_stem = market_df.admin2_stem.replace('гороховск','гороховецк')
market_df.admin2_stem = market_df.admin2_stem.replace('алатырск','алатырьск')
market_df.admin2_stem = market_df.admin2_stem.replace('харьков','харьковск')

In [61]:
# For entries whose feature type is 'городъ' (city), use the lower-cased modern name as admin2_stem.
# .loc replaces .ix, which no longer exists in current pandas.
market_df.loc[market_df.featuretype_old_orth=='городъ','admin2_stem'] = market_df.name_new_orth.str.lower()

In [62]:
# List the distinct admin1 stems to check that the standardization above left no variants.
market_df.admin1_stem.unique()

array(['рязанск', 'вологодск', 'кострома', 'нижегородск', 'тамбовск',
       'синбирск', 'ярославск', 'пензенск', 'тульск', 'воронежск',
       'екатеринославск', 'курск', 'харьковск', 'могилевск', 'орловск',
       'архангельск', 'кавказск', 'саратовск', nan, 'киевск',
       'новогород - северск', 'черниговск', 'тобольск', 'московск',
       'казанск', 'полотск', 'пермск', 'уфимск', 'владимерск', 'тверск',
       'новгородск', 'калужск', 'смоленск', 'рижск', 'иркутск', 'выборг',
       'псковск', 'олонецк', 'ревельск', 'вятск', 'с петербургск'], dtype=object)

In [63]:
# Share of entries for which an admin2 (uyezd) value was extracted.
# .loc replaces .ix, which no longer exists in current pandas.
len(market_df.loc[market_df.admin2_old_orth.isnull()==False])/len(market_df)

0.5399002493765586

In [64]:
# Entries with an uyezd but no Namiestnichestvo.
# .loc replaces .ix, which no longer exists in current pandas.
market_df.loc[np.logical_and(market_df.admin1_old_orth.isnull(),market_df.admin2_old_orth.isnull()==False)][['name_old_orth','admin1_old_orth','description_old_orth']]
#Spot check to make sure that Namiestnichestva isn't in text, but uezd is.

Unnamed: 0,name_old_orth,admin1_old_orth,description_old_orth
77,БУРЕЦКОЙ,,"БУРЕЦКОЙ, погостъ въ Новгородскомъ уѣздѣ. Здѣс..."
121,ВОЛЫНЦЫ,,"ВОЛЫНЦЫ, мѣстечко въ уѣздѣ города Полотска. Зд..."
383,ЛИПЦЫ,,"ЛИПЦЫ, слобода въ уѣздѣ города Харькова. Здѣсь..."
384,ЛИСКОЙ,,"ЛИСКОЙ , волокъ , что на Сингѣ, въ Вологодском..."
386,ЛОПОТОВЪ,,"ЛОПОТОВЪ, монастырь въ Вологодскомъ уѣздѣ. Здѣ..."
492,ОЛЫШАНАЯ,,"ОЛЫШАНАЯ, слобода въ уѣздѣ города Харькова. Зд..."
519,ПЕРЕКОПЕЦЪ,,"ПЕРЕКОПЕЦЪ , мѣстечко въ уѣздѣ города Харькова..."
611,СВИРСКОЙ АЛЕКСАНДРОВСКОЙ,,"СВИРСКОЙ АЛЕКСАНДРОВСКОЙ , монастырь въ Олонец..."
677,ТАРНЯНСКОЙ,,"ТАРНЯНСКОЙ, городокъ Кокшайской четверти въ уѣ..."
694,ТОМАРОВКА,,"ТОМАРОВКА, слобода въ уѣздѣ города Карпова. Въ..."


In [65]:
# Mark missing stems explicitly. The result is assigned back to the column: calling
# fillna(inplace=True) on an attribute-accessed column is a chained operation and is
# not guaranteed to modify market_df.
market_df['admin2_stem'] = market_df.admin2_stem.fillna(value='not_in_text')
market_df['admin1_stem'] = market_df.admin1_stem.fillna(value='not_in_text')

Adding partof IDs
=================

In [66]:
# adj_dict is used below to replace values in admin2_stem (stem -> adjectival stem).
with open("resources/place_adj.json","r",encoding='utf-8') as fp:
    adj_dict = json.load(fp)

In [67]:
def dedupe_dict(to_dedupe):
    """
    Deletes, in place, every entry of 'to_dedupe' whose key equals its value.
    Returns None.
    """
    # Collect the keys first: a dict must not change size while it is being iterated.
    identical = [key for key, value in to_dedupe.items() if key == value]
    for key in identical:
        del to_dedupe[key]

In [68]:
# Drop entries that map a stem to itself.
dedupe_dict(adj_dict)

In [69]:
# replaces non-adjectival stems with adjectival stems
# Assigned back to the column: replace(inplace=True) on an attribute-accessed column is a
# chained operation and is not guaranteed to modify market_df.
market_df['admin2_stem'] = market_df.admin2_stem.replace(adj_dict)

In [70]:
# admin_hierarchy is "<admin1_stem>|<admin2_stem>" wherever admin2_stem is present,
# otherwise just admin1_stem.
market_df['admin_hierarchy'] = market_df.admin1_stem
cond = market_df.admin2_stem.notnull()
# .loc instead of chained indexing (market_df.admin_hierarchy[cond] = ...), which
# assigns to a temporary and is not guaranteed to modify market_df.
market_df.loc[cond,'admin_hierarchy'] = market_df.admin_hierarchy + "|" + market_df.admin2_stem

In [71]:
# new_lookup is the default lookup table of partof_prov_lookup, which reads it as
# lookup[admin1_stem]['id'] and lookup[admin1_stem][admin2_stem]['id'].
with open("resources/admin_lookup.json","r",encoding='utf-8') as fp:
    new_lookup = json.load(fp)

In [72]:
# Collects admin2 stems that partof_prov_lookup could not find under their admin1.
errlist = []

In [73]:
def partof_prov_lookup(admin_hierarchy,lookup=new_lookup):
    """
    Resolves an "admin1_stem|admin2_stem" string to a pair of ids.

    admin_hierarchy: string of the form "<admin1_stem>|<admin2_stem>", where either part
    may be 'not_in_text'. A string without "|" is treated as having no admin2.

    lookup: nested dict read as lookup[admin1_stem]['id'] for the province id and
    lookup[admin1_stem][admin2_stem]['id'] for the district id.

    Returns a tuple (admin1_id, admin2_id); each element is None when it cannot be resolved.
    - admin1 known: admin2 is looked up inside it. A miss is printed and the stem is
      appended to the module-level errlist.
    - admin1 'not_in_text': the first province in lookup that contains admin2_stem
      supplies both ids.
    - admin1 given but unknown: a message is printed and (None, None) is returned.
    """
    parts = admin_hierarchy.split('|')
    admin1_stem = parts[0]
    # Tolerate a hierarchy without "|" instead of raising IndexError.
    admin2_stem = parts[1] if len(parts) > 1 else 'not_in_text'

    if admin1_stem == 'not_in_text':
        # Province not named in the text: infer it from the district.
        for p in lookup:
            if admin2_stem in lookup[p]:
                return lookup[p]['id'], lookup[p][admin2_stem]['id']
        return None, None

    if admin1_stem not in lookup:
        print("{} was not found in lookup as admin1.".format(admin1_stem))
        return None, None

    province = lookup[admin1_stem]
    admin1 = province['id']
    admin2 = None
    if admin2_stem != 'not_in_text':
        if admin2_stem in province:
            admin2 = province[admin2_stem]['id']
        else:
            print("{} was not found in lookup under admin1: {}".format(admin2_stem,admin1_stem))
            errlist.append(admin2_stem)
    return admin1, admin2

In [74]:
# Create both id columns as empty strings; the next cell overwrites them with looked-up ids.
market_df['admin1_partofID'], market_df['admin2_partofID'] = '', ''

In [75]:
# partof_prov_lookup returns an (admin1, admin2) tuple per row; zip(*...) splits the
# tuples into one sequence per column.
market_df['admin1_partofID'], market_df['admin2_partofID'] = zip(*market_df.admin_hierarchy.apply(partof_prov_lookup))

белополье was not found in lookup under admin1: харьковск
венденск was not found in lookup under admin1: рижск
волск was not found in lookup under admin1: саратовск
мглинск was not found in lookup under admin1: черниговск
глухов was not found in lookup under admin1: новогород - северск
грязовецк was not found in lookup under admin1: вологодск
енисейск was not found in lookup under admin1: тобольск
золочев was not found in lookup under admin1: харьковск
клинск was not found in lookup under admin1: московск
конотоп was not found in lookup under admin1: новогород - северск
короп was not found in lookup under admin1: новогород - северск
корсунь was not found in lookup under admin1: синбирск
кошира was not found in lookup under admin1: тульск
кременчуг was not found in lookup under admin1: екатеринославск
крестцы was not found in lookup under admin1: новгородск
кролевец was not found in lookup under admin1: новогород - северск
кунгур was not found in lookup under admin1: пермск
ладога новая

In [76]:
# Standard province names, looked up by province id.
with open("resources/partof_prov_bounds.json","r",encoding='utf-8') as fp:
    popb = json.load(fp)
# Ids are converted with str(float(x)) before the lookup, so the JSON keys presumably look
# like "12.0" - TODO confirm against the resource file. Rows without an id get None.
market_df['admin1_std_name'] = market_df.admin1_partofID.apply(lambda x: popb[str(float(x))]['name'] if x is not None else None)

In [77]:
# Reduce the unresolved admin2 stems to a sorted list of distinct values and show how many there are.
errlist = sorted(set(errlist))
len(errlist)

46

In [78]:
#market_df[market_df.featuretype_old_orth=="городъ"].admin2_stem.unique()

In [79]:
# Treat an id of 0 as missing.
market_df.admin1_partofID = market_df.admin1_partofID.replace(0,np.nan)
market_df.admin2_partofID = market_df.admin2_partofID.replace(0,np.nan)

In [80]:
# Start with an all-missing partof_id column; it is filled in by the next cell.
market_df['partof_id'] = np.nan

In [81]:
# partof_id is the most specific unit available: the admin2 id when present, otherwise the admin1 id.
# .loc replaces .ix, which no longer exists in current pandas.
market_df.loc[market_df.admin2_partofID.isnull()==False,'partof_id'] = market_df.admin2_partofID
market_df.loc[market_df.admin2_partofID.isnull(),'partof_id'] = market_df.admin1_partofID

This cell is useful for making sure that things that don't have partof IDs really are the odd ducks that they seem like they should be, and not the result of an OCR error.

In [82]:
#market_df[market_df.partof_id.isnull()][['name_old_orth','description_old_orth']]

In [83]:
# Convert the names and alternate names (both orthographies) to title case.
market_df.name_old_orth = market_df.name_old_orth.str.title()
market_df.name_new_orth = market_df.name_new_orth.str.title()
market_df.alt_name_old_orth = market_df.alt_name_old_orth.str.title()
market_df.alt_name_new_orth = market_df.alt_name_new_orth.str.title()

In [84]:
# Store the row label (the entry counter assigned by extract_entries) as an explicit column.
market_df['txt_id'] = market_df.index

In [85]:
# Write the non-geocoded dataset as Excel and CSV.
# Create the target directory first so a fresh checkout does not fail on a missing folder.
Ensure_dir("output/process")
# No encoding argument for to_excel: the keyword is not accepted by current pandas.
market_df.to_excel("output/process/no_geo.xlsx")
market_df.to_csv("output/process/no_geo.csv",encoding="utf-8")