## Read database

In [1]:
import pandas as pd

geonames = pd.read_csv('geonames/geonames_all.csv', '\t',
                       dtype={'altnames': object, 'cc2': object, 'acode1': object, 'acode2': object, 'acode3': object, 'acode4': object})
geonames_ = geonames[['geonameid', 'name', 'altnames', 'fclass', 'fcode', 'ccode', 'acode1', 'pop']]

## All together:

In [None]:
# csv: ppl
print('csv: ppl')
CountriesSupported = ['RU', 'UA', 'BY', 'KZ']
ppls_sng = geonames_.ix[(bool(geonames_.fcode.notnull)) & (geonames_.fcode.str.startswith('PPL')) & (
geonames_['ccode'].isin(CountriesSupported))]

ppls_1000 = pd.read_csv('geonames/cities1000.txt', '\t',
                        dtype={'altnames': object, 'cc2': object, 'acode1': object, 'acode2': object, 'acode3': object, 'acode4': object})
ppls_1000_ = ppls_1000[['geonameid', 'name', 'altnames', 'fclass', 'fcode', 'ccode', 'acode1', 'pop']]

ppls = ppls_1000_
ppls_no_sng = ppls.ix[~(ppls['ccode'].isin(CountriesSupported))]

ppls_all = ppls_sng.append(ppls_no_sng)

print(ppls.shape, ppls_no_sng.shape)
print(ppls_sng.shape, ppls_all.shape)

ppls_all.to_csv('geonames/all_ppls.csv', sep='\t', index=False)

# csv: region
print('csv: region')
geonames_region = geonames_.ix[(bool(geonames_.fcode.notnull)) & (geonames_.fcode.str.startswith('ADM'))]
geonames_region.to_csv('geonames/all_regions.csv', sep='\t', index=False)

# csv: country
print('csv: country')
geonames_country = geonames_.ix[(bool(geonames_.fcode.notnull)) & (geonames_.fcode.str.startswith('PCL'))]
geonames_country.to_csv('geonames/all_countries.csv', sep='\t', index=False)

csv: ppl
(142257, 8) (135646, 8)
(270841, 8) (406487, 8)
csv: region
csv: country


## Script for parsing user locations 

In [21]:
geonames.to_csv('geonames/geonames_all.csv', sep = '\t', index=False)

In [4]:
CountriesSupported = ['RU', 'UA', 'BY', 'KZ']
ppls_sng = geonames_.ix[(bool(geonames_.fcode.notnull)) & (geonames_.fcode.str.startswith('PPL')) & (geonames_['ccode'].isin(CountriesSupported))]

#geonames_ppl.to_csv('geonames/all_ppls_.csv', sep = '\t', index=False)
print(ppls_sng.shape)
#print(geonames_ppl[:10])

(270841, 8)


In [None]:
print(geonames.columns)

Index(['geonameid', 'name', 'asciiname', 'altnames', 'lat', 'long', 'fclass',
       'fcode', 'ccode', 'cc2', 'acode1', 'acode2', 'acode3', 'acode4', 'pop',
       'elev', 'gtopo', 'tz', 'mdate'],
      dtype='object')


In [10]:
ppls_1000 = pd.read_csv('geonames/cities1000.txt', '\t',
                       dtype={'altnames': object, 'cc2': object, 'acode1': object, 'acode2': object, 'acode3': object, 'acode4': object})
ppls_1000_ = ppls_1000[['geonameid', 'name', 'altnames', 'fclass', 'fcode', 'ccode', 'acode1', 'pop']]

In [11]:
ppls = ppls_1000_
ppls_no_sng = ppls.ix[~(ppls['ccode'].isin(CountriesSupported))]

In [12]:
ppls_all = ppls_sng.append(ppls_no_sng)

In [13]:
print(ppls.shape, ppls_no_sng.shape)
print(ppls_sng.shape, ppls_all.shape)

(142257, 8) (135646, 8)
(270841, 8) (406487, 8)


In [14]:
ppls_all.to_csv('geonames/all_ppls.csv', sep = '\t', index=False)

In [33]:
ppls_all.ix[ppls_all['geonameid'] == 550280]

Unnamed: 0,geonameid,name,altnames,fclass,fcode,ccode,acode1
7643635,550280,Khimki,"Chimkai,Chimki,Gorad Khimki,Himki,Jimki,Khimki...",P,PPL,RU,47


In [35]:
df = pd.read_csv('geo/database/geonames_ru.csv', sep='\t')
df.ix[df['geonameid'] == 550280]

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,geonameid,name,asciiname,altnames,lat,long,fclass,fcode,ccode,cc2,acode1,acode2,acode3,acode4,pop,elev,gtopo,tz,mdate
89578,550280,Khimki,Khimki,"Chimki,Himki,Khimki,Moskau-Chimki,himki,Ĥimki,...",55.89704,37.42969,P,PPL,RU,,48,,,,142618,,185,Europe/Moscow,2012-01-17


### PostgreSQL

In [109]:
import psycopg2
conn = psycopg2.connect("dbname=geonames user=deserg")

cursor = conn.cursor()
cursor.execute('CREATE INDEX ppl_idx ON geonames_ppl USING GIN (name gin_trgm_ops);')``
cursor.execute('CREATE INDEX region_idx ON geonames_region USING GIN (name gin_trgm_ops);')
cursor.execute('CREATE INDEX country_idx ON geonames_country USING GIN (name gin_trgm_ops);')
cursor.execute('select set_limit(0.55);')

In [99]:
levels = ['country', 'region', 'ppls']

def get_with_errs(level_ind, name):
    if level_ind < 0 or level_ind >= len(levels):
        return

    query = '''
        SELECT geonameid, name FROM geonames_{0} where '{1}' % name or '{1}' % ANY(string_to_array(altnames, ','));
    '''
    
    cursor.execute(query.format(levels[level_ind], name))
    list_tup = cursor.fetchmany()
    for tup in list_tup:
        print(tup)

def get_country_with_errs(name):
    get_with_errs(0, name)

        
def get_region_with_errs(name):
    get_with_errs(1, name)
    
def get_ppl_with_errs(name):
    get_with_errs(2, name)

In [111]:
import time

t = time.time()
get_region_with_errs('Kiev')
print('Time spent:', time.time() - t)

(6548066, 'Kieve')
Time spent: 8.323333978652954


### Parse geonames.csv

In [1]:
from pandas import read_csv
import pandas as pd
import numpy as np

CountriesSupported = ['RU', 'UA', 'BY', 'KZ']
Levels = ['city', 'region', 'country']
City, Region, Country = Levels[0], Levels[1], Levels[2]


def char_range(c1, c2):
    """Generates the characters from `c1` to `c2`, inclusive."""
    for c in range(ord(c1), ord(c2)+1):
        yield chr(c)

RussianLetters = set(list(char_range('а', 'я')) + list(char_range('А', 'Я')))


dfs = []
for country in [country.lower() for country in CountriesSupported]:
    dfs.append(read_csv('geo/database/geonames_' + country + '.csv', '\t', dtype = {'cc2': object, 'acode1': object, 'acode2': object}))
    
geonames = pd.concat(dfs)
geonames_cut = read_csv('geo/database/geonames_cut.csv', ';', dtype = {'cc2': object, 'acode1': object, 'acode2': object})

geonames['altnames'] = geonames['altnames'].astype('str')
geonames_cut['altnames'] = geonames_cut['altnames'].astype('str')

#print(geonames.ix[(geonames.fcode.str.startswith('PPL')) & (geonames.name.str.startswith('Mos'))])
#print(geonames.ix[geonames.name.str.contains('Oblast’')][['name', 'altnames']])

In [118]:
import pandas as pd

regions = pd.read_csv('geo/database/all_regions.csv', sep='\t')

loc_name = 'Санкт-Петербург'
regions.ix[(regions.name == loc_name) | ((regions.altnames.notnull) & (regions.altnames.str.contains(loc_name)))]

Unnamed: 0,geonameid,name,altnames,fclass,fcode,ccode,acode1
2822,536203,Sankt-Peterburg,"Gorod-Geroy Leningrad,Leningrad,Léningrad,Pete...",A,ADM1,RU,66


In [120]:
ppls = pd.read_csv('geo/database/all_ppls.csv', sep='\t')

loc_name = 'Санкт-Петербург'
ppls.ix[(ppls.name == loc_name) | ((ppls.altnames.notnull) & (ppls.altnames.str.contains(loc_name)))]

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,geonameid,name,altnames,fclass,fcode,ccode,acode1
165425,498817,Saint Petersburg,"Agia Petroupole,Betuyrbukh,LED,Leningrad,Lenin...",P,PPLA,RU,66


### Parse toponims-utf8.txt

In [3]:
toponims = read_csv('geo/toponims-utf8.txt', '\t')

#print(toponims[toponims.])

### Helper functions

In [4]:
import editdistance as ee
MaxEditDistance = 1

def get_records_for_location_strict(geonames, location):
    records = geonames.ix[(geonames.name == location) | (geonames.altnames.str.contains(location))]
    result = pd.DataFrame(columns = list(geonames))
    for record in records.iterrows():
        names = [record[1]['name']]
        names += record[1]['altnames'].split(',')
        if location in names:
            #print(names)
            result = result.append(record[1])
    #print(len(result))
    return result
            

def get_records_for_location_non_strict(geonames, location, max_edit_distance):
    result = pd.DataFrame(columns = list(geonames))
    edit_distances = []
    for record in geonames.iterrows():
        names = [record[1]['name']]
        names += record[1]['altnames'].split(',')
        #print(names)
        
        dists = [ee.eval(name, location) + 1 for name in names]
        dists[0] -= 1
        #print(dists)
        if max_edit_distance < 0:
            max_edit_distance = 0
        min_dist = min(dists)
        min_dist_index = dists.index(min_dist)
        
        if min_dist <= max_edit_distance:
            #print(names[min_dist_index], min_dist)
            result = result.append(record[1])
            edit_distances.append(dists[min_dist_index])
    #print(len(result))
    return (result, edit_distances)

def get_records_for_location(geonames, location, strict, max_edit_distance):
    if strict:
        return get_records_for_location_strict(geonames, location)
    else:
        return get_records_for_location_non_strict(geonames, location, max_edit_distance)

def get_records_for_code(geonames, location, fcode_start, strict = True, max_edit_distance = MaxEditDistance):
    records_for_fcode = geonames.ix[(bool(geonames.fcode.notnull)) & (geonames.fcode.str.startswith(fcode_start))]
    return get_records_for_location(records_for_fcode, location, strict, max_edit_distance)

def get_records_for_class(geonames, location, fclass, strict = True, max_edit_distance = MaxEditDistance):
    records_for_fclass = geonames.ix[geonames.fclass == fclass]
    return get_records_for_location(records_for_fclass, location, strict, max_edit_distance)

def get_records_for_region(region, country = 'RU'):
    return geonames.ix[(geonames.ccode == country) & (geonames.fcode.str.startswith('ADM')) & (geonames.acode1 == region)]

def str_equals_record(string, record):
    names = [record['name']]
    names += record['altnames'].split(',')
    return string in names

def loc_str_equals_str(database, location1, location2):
    records1 = get_records_for_code(database, location1, '')
    records2 = get_records_for_code(database, location2, '')
    for record in records1.iterrows():
        if str_equals_record(location2, record[1]):
            return True
    for record in records2.iterrows():
        if str_equals_record(location1, record[1]):
            return True
    return False
#get_records_for_code(geonames, 'Vologd', 'ADM1', False, 1)

### Parse region codes

In [5]:
def get_region_by_capital(geonames, ppl, region, country):
    regions = get_records_for_code(geonames, ppl, 'ADM')# geonames.ix[(geonames['altnames'].str.contains(ppl)) & (geonames.fcode == 'ADM1')]
    if regions.empty:
        if not get_records_for_region(region, country).empty:
            return 'WRONG! - ' + ppl
        else:
            return ppl
    else:
        return regions.name.iloc[0]

regions_map = {}

def parse_region_map():
    codes = open('geo/admin1Codes.txt')
    
    for line in codes:
        code, name = line.split('\t', 1)
        country, region = code.split('.', 1)

        if country not in CountriesSupported:
            continue

        if (country not in regions_map.keys()) :
            regions_map[country] = {}

        try:
            int_reg = int(region)
        except ValueError:
            int_reg = -1    

        if int_reg > 0:
            region_name = get_region_by_capital(geonames, name.strip(), region, country)
            #print(region_name)
            regions_map[country][int_reg] = region_name
        elif int_reg == 0:
            country_name = name.split(' ')[0]
            regions_map[country][0] = country_name

parse_region_map()
#print(get_records_for_region('2', 'UA'))
#print(get_region_by_capital(geonames, "Luhans'ka", '2', 'UA'))
#get_records_for_region('26')
#rec.empty
#get_region_by_capital(geonames, 'Zabaïkalski', '14')



In [6]:
#print(regions_map['RU'][0])
for country in CountriesSupported:
    for key, val in regions_map[country].items():
        if (val.startswith('WRONG!')):
            print(country, '.', key, ' - ', val, sep = '')

#regions_map['UA']
#get_region_by_capital(geonames, 'Kamchatskaya Oblast')


#[if (x == 'KOKOKOKOKOKO') x  for x in regions_map['RU'].values()]
#get_region_by_capital(geonames, 'Kamchatskaya Oblast')
#get_records_for_code(geonames, 'Jewish Autonomous Oblast', 'ADM1', False)

In [7]:
geonames.ix[(geonames.ccode == 'UA') & (geonames.fcode.str.startswith('ADM')) & (geonames.acode1 == '18')]

Unnamed: 0,geonameid,name,asciiname,altnames,lat,long,fclass,fcode,ccode,cc2,acode1,acode2,acode3,acode4,pop,elev,gtopo,tz,mdate
9978,696634,Poltavs’ka Oblast’,Poltavs'ka Oblast',"Oblast Poltava,Oblast Poltawa,Oblast de Poltav...",49.5,34.0,A,ADM1,UA,,18,,,,1569479,,78,Europe/Zaporozhye,2012-08-19


In [8]:
#print(get_region_by_capital(geonames, 'Buryatiya'))
get_records_for_code(geonames, "Chernihivs'ka", 'ADM')

Unnamed: 0,geonameid,name,asciiname,altnames,lat,long,fclass,fcode,ccode,cc2,acode1,acode2,acode3,acode4,pop,elev,gtopo,tz,mdate


In [9]:
import transliterate as tr

ru_codes = []
for region in regions_map['RU'].values():
    ru_codes.append((region, tr.translit(region, 'ru')))
    
#print(ru_codes)
tr.translit('эе', 'ru', reversed = True)

'ee'

### Read locations

In [10]:
loc_f = open('geo/user_source_locations.txt.sorted.uniq', 'r')

loc_arr = []

for line in loc_f:
    loc = line.strip().split(' ', 1)
    if (len(loc) == 1):
        continue
    loc = loc[1].split(',')
    loc = [s.strip() for s in loc]
    loc_arr.append(loc)


print(loc_arr[:20])

[['Russian Federation'], ['United States'], ['Москва', 'Москва', 'Russian Federation'], ['Москва', 'Russian Federation'], ['Ukraine'], ['Москва', 'Russian Federation'], ['Moscow', 'Москва', 'Russian Federation'], ['United Kingdom'], ['Canada'], ['California', 'United States'], ['Санкт-Петербург', 'Санкт-Петербург', 'Russian Federation'], ['Singapore'], ['Kiev', 'Ukraine'], ['New York', 'United States'], ['Санкт-Петербург', 'Russian Federation'], ['Belarus'], ['Australia'], ['Санкт-Петербург', 'Russian Federation'], ['Seattle', 'Washington', 'United States'], ['London', 'United Kingdom']]


### Transliterate words

In [2]:
import transliterate as tr

print(tr.get_available_language_codes())
text_en = "Lorem ipsum dolor sit amet"
text_ru = "Привет, world!"
tr.translit(text_ru, 'ru', reversed = True)

loc_en = []

#for loc in loc_arr:
#    loc_en.append([translit(val, 'ru', reversed = True) for val in loc])

kyan = tr.translit('абвгдежзиклмнопрстуюфхцчшщ', 'ru', reversed = True)
print(kyan)
print(tr.translit(kyan, 'ru'))

['bg', 'el', 'mk', 'uk', 'ru', 'hy', 'mn', 'l1', 'ka']
abvgdezhziklmnoprstujufhtschshsch
абвгдежзиклмнопрстуюфхцчшсч


### Obtain combinations ["city": string, "region": string, "country": string]

#### get_regions(populated_place, database)

In [12]:
def get_regions(geonames, populated_place):
    places_df = geonames.ix[(geonames.fclass == 'P') &
    ((geonames.altnames.str.contains(populated_place)) |
     (geonames.name == populated_place))][['altnames', 'ccode', 'acode1']]
    regions = []
    for row in places_df.itertuples():
        altnames = row[1].split(',')
        if populated_place in altnames:
            country = row[2]
            try:
                region_num = int(row[3])
            except:
                region_num = 0
            region = regions_map[country][region_num]
            regions.append(region)
    return list(set(regions))

def get_countries(geonames, region):
    regions = get_records_for_code(geonames, region, 'ADM')
    countires = []
    for record in regions.iterrows():
        country_alias = record[1].ccode
        countires.append(regions_map[country_alias][0])
    return list(set(countires))

def get_info_for_ppl(geonames, ppl):
    regions = get_regions(geonames, ppl)
    info = []
    for region in regions:
        countries = get_countries(geonames, region)
        if countries:
            info += [[region, country] for country in countries]
        else:
            info.append(region)
    return info
# def unfiy_loc(location):
#     for word in location

#print(get_countries(geonames, 'Moscow'))
get_info_for_ppl(geonames, 'Нальчик')
#get_regions(geonames, 'Petropavlovsk-Kamchatskiy')
#print(geonames_full.ix[(geonames_full.fclass == 'A') & (geonames_full.altnames.str.contains('Moskva'))])

[['Kabardino-Balkarskaya Respublika', 'Russia']]

In [69]:
#0 - ppl, 1 - region, 2 - country

LocationCodes = ['PPL', 'ADM', 'PCL']
CityCode = LocationCodes[0]
RegionCode = LocationCodes[1]
CountryCode = LocationCodes[2]

def get_location_types(database, location):
    types = {0: False, 1: False, 2: False}
    for i in range(3):
        types[i] = not get_records_for_code(database, location, LocationCodes[i]).empty
    return types

def get_similar_locations(database, location, raw = True, max_edit_distance = MaxEditDistance):
    similar = {}
    for code in LocationCodes:
        records, edit_distances = get_records_for_code(database, location, code, False, max_edit_distance)
        records_list = []
        for record in records.iterrows():
            if (raw):
                records_list.append(record[1])
            else:
                records_list.append(record[1]['name'])
        similar[code] = [(records_list[i], location, edit_distances[i]) for i in range(len(records_list))]
    return similar


def get_full_similar(database, full_location, raw = True, max_edit_distance = MaxEditDistance):
    full_similar = []
    for location in full_location:
        similar = get_similar_locations(database, location, raw, max_edit_distance)
        if similar:
            full_similar.append(similar)
    return full_similar


def prepare_combinations_1(database, full_similar, result):
    if len(full_similar) != 1:
        return
    print('1!!')
    similar = full_similar[0]
    for code in LocationCodes:
        if similar[code]:
            result += [{code: pair} for pair in similar[code]]

def prepare_combinations_2(database, full_similar, result):
    if len(full_similar) != 2:
        return
    print('2!!')
    for code0 in LocationCodes:
        for code1 in LocationCodes:
            if (code0 != code1) and full_similar[0][code0] and full_similar[1][code1]:
                for pair0 in full_similar[0][code0]:
                    for pair1 in full_similar[1][code1]:
                        result.append({code0: pair0, code1: pair1})
    
def prepare_combinations_3_greater(database, full_similar, used, combination, result):
    if len(combination.keys()) == 3:
        result.append(combination.copy())
    else:
        for i in range(len(full_similar)):
            if not used[i]:
                used[i] = True
                code = LocationCodes[len(combination.keys())]
                record_pair = full_similar[i][code]
                if record_pair:
                    combination[code] = record_pair
                    prepare_combinations_3_greater(database, full_similar, used, combination, result)
                    del combination[code]
                used[i] = False


def prepare_level_data_list(multi_combination, level):
    level_pair = multi_combination[LocationCodes[level]]
    level_record_list = level_pair[0]
    level_edit_dist_list = level_pair[1]
    level_data_list = []
    for i in range(len(level_record_list)):
        record = level_record_list[i]
        edit_dist = level_edit_dist_list[i]
        level_data_list.append((record, edit_dist))
    return level_data_list

def get_possible_combinations(database, full_similar):
    combinations = []
    if (not full_similar) or (len(full_similar) > 3):
        return combinations
    
    if len(full_similar) == 1:
        prepare_combinations_1(database, full_similar, combinations)
    elif len(full_similar) == 2:
        prepare_combinations_2(database, full_similar, combinations)
    else:
        combinations_prep = []
        prepare_combinations_3_greater(database, full_similar, [False for i in range(len(full_similar))], {}, combinations_prep)
        for multi_combination in combinations_prep:
            for pair0 in multi_combination[LocationCodes[0]]:
                for pair1 in multi_combination[LocationCodes[1]]:
                    for pair2 in multi_combination[LocationCodes[2]]:
                       combinations.append({LocationCodes[0]: pair0, LocationCodes[1]: pair1, LocationCodes[2]: pair2})
    print(combinations)
    return combinations

In [31]:
full_location = ['Vologda', 'Vologda', 'Россия']
full_similar = get_full_similar(geonames, full_location)

In [34]:
print(full_similar)
#combinations = get_possible_combinations(geonames, full_similar)

[{'PCL': [], 'ADM': [(geonameid                                               472454
name                                      Vologodskaya Oblast’
asciiname                                 Vologodskaya Oblast'
altnames     NULL,Vologda,Vologda Oblast,Vologodskaja oblas...
lat                                                         60
long                                                        42
fclass                                                       A
fcode                                                     ADM1
ccode                                                       RU
cc2                                                        NaN
acode1                                                      85
acode2                                                     NaN
acode3                                                     NaN
acode4                                                     NaN
pop                                                1.24504e+06
elev                             

### Fine evaluating

In [22]:
FineValues = [10, 10, 10, 10]
FineMessages = [
    'Location region not equal to the region',
    'Location country not equal to region country',
    'Region country not equal to country',
    'Location country not equal to country'
]

def check_fine_1(location_region, region, fine, fine_message):
    if not str_equals_record(location_region, region):
        fine += FineValues[0]
        fine_message += FineMessages[0] + ' - ' + str(FineValues[0]) + '\n'
    return (fine, fine_message)

def check_fine_2(database, location_country, region_country, fine, fine_message):
    if not loc_str_equals_str(database, location_country, region_country):
        fine += FineValues[1]
        fine_message += FineMessages[1] + ' - ' + str(FineValues[1]) + '\n'
    return (fine, fine_message)

def check_fine_3(region_country, country, fine, fine_message):
    if not str_equals_record(region_country, country):
        fine += FineValues[2]
        fine_message += FineMessages[2] + str(FineValues[2]) + '\n'
    return (fine, fine_message)

def check_fine_4(location_country, country, fine, fine_message):
    if not str_equals_record(location_country, country):
        fine += FineValues[3]
        fine_message += FineMessages[3] + str(FineValues[3]) + '\n'
    return (fine, fine_message)

def eval_fine(database, combination):
    
    LocationCode = LocationCodes[0]
    RegionCode = LocationCodes[1]
    CountryCode = LocationCodes[2]
    
    location_exists = LocationCode in combination.keys()
    region_exists = RegionCode in combination.keys()
    country_exists = CountryCode in combination.keys()
    
    print(location_exists, region_exists, country_exists)
    
    if location_exists:
        location, location_origin, location_dist = combination[LocationCode]
    if region_exists:
        region, region_origin, region_dist = combination[RegionCode]
    if country_exists:
        country, country_origin, country_dist = combination[CountryCode]
    
    total_fine_val = 0
    total_fine_message = ''
    
    
    edit_distance_fine_val = 0
    edit_distance_fine_message = ''
    if location_exists and location_dist > 0:
        edit_distance_fine_val += location_dist
        edit_distance_fine_message += 'Location: ' + location['name'] + ' - ' + location_origin + ': ' + str(location_dist) + '\n'
    if region_exists and region_dist > 0:
        edit_distance_fine_val += region_dist
        edit_distance_fine_message += 'Region: ' + region['name'] + ' - ' + region_origin + ': ' + str(region_dist) + '\n'
    if country_exists and country_dist > 0:
        edit_distance_fine_val += country_dist
        edit_distance_fine_message += 'Country: ' + country['name'] + ' - ' + country_origin + ': ' + str(country_dist) + '\n'
    if edit_distance_fine_message:
        edit_distance_fine_message = 'Edit distance fine: \n' + edit_distance_fine_message
        total_fine_val += edit_distance_fine_val
        total_fine_message += edit_distance_fine_message
    
    
    if location_exists:
        location_info = get_info_for_ppl(database, location['name'])
    if region_exists:
        region_countries = get_countries(database, region['name'])
    
    discrepancy_fine_val = 1000
    discrepancy_fine_message = ''
    
    if location_exists and region_exists and country_exists:
        for info in location_info:
            for region_country in region_countries:
                fine = 0
                fine_message = ''
                fine, fine_message = check_fine_1(info[0], region, fine, fine_message)
                fine, fine_message = check_fine_2(database, info[1], region_country, fine, fine_message)
                fine, fine_message = check_fine_3(region_country, country, fine, fine_message)
                fine, fine_message = check_fine_4(info[1], country, fine, fine_message)
                if fine < discrepancy_fine_val:
                    discrepancy_fine_val = fine
                    discrepancy_fine_message = fine_message
    elif location_exists and region_exists:
        for info in location_info:
            for region_country in region_countries:
                fine = 0
                fine_message = ''
                fine, fine_message = check_fine_1(info[0], region, fine, fine_message)
                fine, fine_message = check_fine_2(database, info[1], region_country, fine, fine_message)
                if fine < discrepancy_fine_val:
                    discrepancy_fine_val = fine
                    discrepancy_fine_message = fine_message
    elif location_exists and country_exists:
        for info in location_info:
            fine = 0
            fine_message = ''
            fine, fine_message = check_fine_4(info[1], country, fine, fine_message)
            if fine < discrepancy_fine_val:
                discrepancy_fine_val = fine
                discrepancy_fine_message = fine_message
    elif region_exists and country_exists:
        for region_country in region_countries:
            fine = 0
            fine_message = ''
            fine, fine_message = check_fine_3(region_country, country, fine, fine_message)
            if fine < discrepancy_fine_val:
                discrepancy_fine_val = fine
                discrepancy_fine_message = fine_message
                
    if discrepancy_fine_message:
        discrepancy_fine_message = 'Discrepancy fine: \n' + discrepancy_fine_message
        total_fine_val += discrepancy_fine_val
        total_fine_message += discrepancy_fine_message

        
    return (total_fine_val, total_fine_message)

In [40]:
def obtain_location(database, full_location):
    full_similar = get_full_similar(database, full_location)
    print('FULL SIMILAR\n')
    combinations = get_possible_combinations(database, full_similar)
    print('COMBINATIONS\n')
    fined_combinations = []
    for combination in combinations:
        fine, message = eval_fine(database, combination)
        fined_combinations.append((combination, fine, message))
    sorted_combinations = sorted(fined_combinations, key = lambda tup: tup[1])
    
    return sorted_combinations

In [53]:
def ready_combinations(sorted_combinations):
    ready_combinations = []
    commented_combinations = []
    for combination_fine in sorted_combinations:
        combination, fine, message = combination_fine
        
        ready = []
        for i in range(3):
            level = Levels[i]
            code = LocationCodes[i]
            entry = combination[code][0]['name']
            ready.append(level + ': ' + entry)
        
        result = ', '.join(ready)
        ready_combinations.append(result)
        commented_combinations.append(result + ';\n' + str(fine) + ' ' + message)
    return ready_combinations, commented_combinations

In [44]:
loc = obtain_location(geonames, ['Dor', 'Vologda', 'Russia'])

FULL SIMILAR
 [{'PCL': [], 'ADM': [], 'PPL': [(geonameid                              475633
name                                      Bor
asciiname                                 Bor
altnames     Bor,Verkhne-Bor,Verkhniy Bor,Бор
lat                                   58.5842
long                                  55.9138
fclass                                      P
fcode                                     PPL
ccode                                      RU
cc2                                       NaN
acode1                                     90
acode2                                    NaN
acode3                                    NaN
acode4                                    NaN
pop                                         0
elev                                      NaN
gtopo                                     168
tz                         Asia/Yekaterinburg
mdate                              2012-04-05
Name: 15204, dtype: object, 'Dor', 1), (geonameid                          5171



True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True
True True True


TypeError: cannot compare a dtyped [object] array with a scalar of type [bool]

In [56]:
r, c = ready_combinations(loc)
c

['city: Vologda, region: Vologodskaya Oblast’, country: Russian Federation; 2 Edit distance fine: \nRegion: Vologodskaya Oblast’ - Vologda: 1\nCountry: Russian Federation - Russia: 1\n',
 'city: Vologda, region: Vologodskaya Oblast’, country: Russian Federation; 2 Edit distance fine: \nRegion: Vologodskaya Oblast’ - Vologda: 1\nCountry: Russian Federation - Russia: 1\n']

In [75]:
import re

line = 'kokok <<Location><o>a, b</o></Location>dsd<<Location><o>d, a, b</o></Location>'

p = re.compile('<<Location><o>(.*?)</o></Location>')

al = re.findall(p, line)

for s in al:
    old_loc = '<<Location><o>' + s + '</o></Location>'
    new_loc = '<<Location><o>' + 'WOW' + s + 'UU'  + '</o></Location>'
    line = line.replace(old_loc, new_loc)

print(line)
loc = p.search(line).group(1)
loc = loc.split(',')
loc = [s.strip() for s in loc]


print(','.join(['s', 'd']))

kokok <<Location><o>WOWa, bUU</o></Location>dsd<<Location><o>WOWd, a, bUU</o></Location>
s,d


In [86]:
def get_cyrillic_name(record):
    names = [record['name']] + record['altnames'].split(',')
    max_inters = 0
    max_name = names[0]
    for name in names:
        inters = 0
        for char in name:
            if char in RussianLetters:
                inters += 1
        if inters > max_inters:
            print(inters, name)
            max_inters = inters
            max_name = name
    return max_name

#print(combinations[0])
series = combinations[0]['PCL'][0]
get_cyrillic_name(combinations[0]['PCL'][0])

4 Орос
17 Расійская Федэрацыя
19 Российская Федерация


'Российская Федерация'