In [41]:
import pandas as pd
import numpy as np
import pickle
import os.path

## Export to Pandas DataFrame
Чтобы быстрее работать с данными, переводим их в DataFrame, а также сохраняем в бинарном виде с помощью модуля pickle.

In [42]:
def save_pickle(data, file:str):
    """Serialize ``data`` with pickle into the file at path ``file``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(file, 'wb') as sink:
        pickle.Pickler(sink).dump(data)

def load_pickle(file):
    """Read the file at path ``file`` and return the object pickled in it.

    Unpickling can execute arbitrary code, so only pass files this notebook
    (or another trusted source) produced.
    """
    with open(file, 'rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored
    
def save_to_csv(data, file_name):
    """Write the frame ``data`` to ``file_name`` as CSV.

    The file is UTF-16 encoded, uses ``;`` as the field separator and does not
    contain the index column.
    """
    csv_options = {'encoding': 'utf-16', 'sep': ';', 'index': False}
    data.to_csv(file_name, **csv_options)
            
def save_to_xlsx(data, file_name):
    """Write a sequence of dict records to an Excel workbook.

    Args:
        data: iterable of dicts; the keys of the first record define the
            column set and column order.
        file_name: path of the workbook to write.

    An empty ``data`` produces an empty sheet instead of raising IndexError
    on ``data[0]``.
    """
    records = list(data)
    # Without records there is no first row to take the column labels from.
    label = list(records[0].keys()) if records else None
    pd_frame = pd.DataFrame(
        records,
        columns=label
    )
    pd_frame.to_excel(file_name)

In [43]:
# Region to extract: used below as the leading digits of every CODE that is
# kept, and as the prefix of the cache and export file names.
FIND_REGION = '50'

Первоначальная работа с dbf-файлами. В дальнейшем все dbf-файлы преобразуются в бинарный формат (pickle). Для этого шага в директории data обязательно должны лежать файлы «КЛАДР».

In [44]:
from dbfread import DBF

# Open every KLADR table from the data/ directory. The records themselves are
# consumed in the next cell, where each table is iterated into a DataFrame.
ALTNAMES = DBF('data/ALTNAMES.dbf')
DOMA = DBF('data/DOMA.dbf')
KLADR = DBF('data/KLADR.dbf')
SOCRBASE = DBF('data/SOCRBASE.dbf')
STREET = DBF('data/STREET.dbf')
NAMEMAP = DBF('data/NAMEMAP.dbf')

In [45]:
# Materialize each DBF table as a DataFrame (one row per DBF record).
# NOTE: kladr, streets, homes and short_names are rebound further down, when
# the pickled copies are loaded back.
kladr = pd.DataFrame(iter(KLADR))
streets = pd.DataFrame(iter(STREET))
homes = pd.DataFrame(iter(DOMA))
short_names = pd.DataFrame(iter(SOCRBASE))
altnames = pd.DataFrame(iter(ALTNAMES))
namemap = pd.DataFrame(iter(NAMEMAP))

При этом сразу оставляем только записи выбранного региона; для таблиц KLADR и STREET дополнительно берём только актуальные записи, код которых заканчивается на '00'.

In [46]:
# Raw literals: '\d' inside a normal string is an invalid escape sequence and
# triggers a warning on current Python versions. The resulting patterns are
# unchanged.
# regexp_1: code starts with the region digits and ends with '00'.
# regexp_2: code starts with the region digits.
regexp_1 = '^'+FIND_REGION+r'\d*00$'
regexp_2 = '^'+FIND_REGION+r'\d*'

# na=False keeps the masks strictly boolean, so a record with a missing code is
# dropped instead of making the boolean indexing fail.
first_kladr = kladr[kladr['CODE'].str.contains(regexp_1, na=False)]
first_streets = streets[streets['CODE'].str.contains(regexp_1, na=False)]
first_homes = homes[homes['CODE'].str.contains(regexp_2, na=False)]
first_altnames = altnames[altnames['NEWCODE'].str.contains(regexp_2, na=False)]
first_namemap = namemap[namemap['CODE'].str.contains(regexp_2, na=False)]


In [47]:
# Cache the region-filtered tables (and the full abbreviation table) as pickles
# so later runs do not have to parse the DBF files again.
save_pickle(first_kladr, f'data/{FIND_REGION}_region_kladr.pkl')
save_pickle(first_streets, f'data/{FIND_REGION}_region_streets.pkl')
save_pickle(first_homes, f'data/{FIND_REGION}_region_homes.pkl')
save_pickle(first_altnames, f'data/{FIND_REGION}_region_altnames.pkl')
save_pickle(first_namemap, f'data/{FIND_REGION}_region_namemap.pkl')
save_pickle(short_names, 'data/short_names.pkl')


### Загружаем DataFrame
Дальше работаем уже с бинарным форматом (это ускоряет работу при отладке).

In [48]:
# Reload the cached tables; from here on kladr, streets and homes hold only
# the rows of the selected region.
kladr:pd.DataFrame = load_pickle(f'data/{FIND_REGION}_region_kladr.pkl')
streets:pd.DataFrame = load_pickle(f'data/{FIND_REGION}_region_streets.pkl')
homes:pd.DataFrame = load_pickle(f'data/{FIND_REGION}_region_homes.pkl')
short_names:pd.DataFrame = load_pickle('data/short_names.pkl')

## Описание самого класса и всего функционала

In [51]:
from numpy import NaN
from pandas import isnull


class KladrFinder():
    """Resolve each street to its parent levels and houses and render address lists.

    On construction the per-street table ``streets_df`` is either loaded from
    the pickle cache ``data/<region>_region_df.pkl`` or built by
    :meth:`calculate`, which also writes that cache.
    :meth:`create_yandex_data` and :meth:`create_google_data` turn the table
    into one address string per house.

    Attributes:
        streets_df: one row per street with ``level_N_name`` /
            ``level_N_short_name`` / ``level_N_code`` columns for the levels
            that were found, plus ``level_6_name`` (list of house strings).
        anomaly: messages for streets that could not be resolved.
    """

    # Levels rendered as "<name> <type>"; the other parent levels (3 and 4)
    # are rendered as "<type> <name>".
    _NAME_FIRST_LEVELS = (1, 2, 5)

    def __init__(
            self,
            kladr: pd.DataFrame,
            streets: pd.DataFrame,
            homes: pd.DataFrame,
            short_names: pd.DataFrame,
            region=31
        ) -> None:
        """Store the source tables and load or compute ``streets_df``.

        Args:
            kladr: table with ``NAME``, ``SOCR`` and ``CODE`` columns used to
                look up the parent levels 1-4.
            streets: table with ``NAME``, ``SOCR`` and ``CODE`` columns, one
                row per street (level 5).
            homes: table with ``NAME`` (comma-separated houses) and ``CODE``.
            short_names: table mapping ``SCNAME`` to ``SOCRNAME``.
            region: value used only to build the cache file name.
        """
        self.kladr = kladr
        self.streets = streets
        self.homes = homes
        self.short_names = short_names
        self.region = region
        self.filename_save_df = 'data/'+str(self.region)+'_region_df.pkl'

        # Levels 1-5 carry name/short name/code; level 6 (houses) has no code.
        street_df_columns = []
        for level in range(1, 7):
            street_df_columns.append('level_{}_name'.format(level))
            street_df_columns.append('level_{}_short_name'.format(level))
            if level < 6:
                street_df_columns.append('level_{}_code'.format(level))
        self.streets_df = pd.DataFrame(columns=street_df_columns)

        self.streets_list_dict = []

        self.anomaly = []

        if os.path.isfile(self.filename_save_df):
            self.streets_df = self.load_df()
        else:
            self.calculate()

    def add_anomaly(self, message):
        """Remember a message about a street that could not be processed."""
        self.anomaly.append(message)

    def load_df(self):
        """Return the cached ``streets_df`` read from ``filename_save_df``."""
        # pickle.load can execute arbitrary code: the cache file must come
        # from a trusted source (normally a previous run of save_df).
        with open(self.filename_save_df, 'rb') as data_file:
            return pickle.load(data_file)

    def save_df(self):
        """Pickle ``streets_df`` to ``filename_save_df``, creating its directory if needed."""
        directory = os.path.dirname(self.filename_save_df)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(self.filename_save_df, 'wb') as data_file:
            pickle.dump(self.streets_df, data_file)

    def find_level_1(self, code):
        """Look up the level-1 record: first 2 digits of ``code`` padded with 11 zeros."""
        new_code = code[:2]+('0'*11)
        return self.find_kladr_for_code(new_code)

    def find_level_2(self, code):
        """Look up the level-2 record, or return None when digits 2-4 of ``code`` are '000'."""
        if (code[2:5] == '000'):
            return None
        new_code = code[:5]+('0'*8)
        return self.find_kladr_for_code(new_code)

    def find_level_3(self, code):
        """Look up the level-3 record, or return None when digits 5-7 of ``code`` are '000'."""
        if (code[5:8] == '000'):
            return None
        new_code = code[:8]+('0'*5)
        return self.find_kladr_for_code(new_code)

    def find_level_4(self, code:str):
        """Look up the level-4 record, or return None when digits 8-10 of ``code`` are '000'."""
        if (code[8:11] == '000'):
            return None
        new_code = code[:11]+('0'*2)
        return self.find_kladr_for_code(new_code)

    def find_level_6(self, code):
        """Return the house data for a street code (see :meth:`find_home_for_code`)."""
        return self.find_home_for_code(code)

    def find_kladr_for_code(self, code:str):
        """Return the rows of ``kladr`` whose ``CODE`` equals ``code``, re-indexed from 0.

        The callers pass complete, zero-padded 13-character codes, so an exact
        comparison is what is meant; it also avoids treating the code as a
        regular expression and tolerates missing ``CODE`` values.
        """
        matches = self.kladr[self.kladr['CODE'] == code]
        return matches.reset_index(drop=True)

    def find_home_for_code(self, code:str):
        """Collect the houses whose ``CODE`` starts with ``code``.

        Returns:
            dict with ``level_6_name`` (every comma-separated item of the
            matching ``NAME`` values, in table order) and
            ``level_6_short_name`` (always ``'ДОМ'``).
        """
        # NOTE(review): the prefix is the full street code, including its last
        # two digits. If house codes are laid out as <15-digit street prefix>
        # + <4-digit record number>, the prefix should be code[:15] instead,
        # otherwise only part of the houses match - confirm against the KLADR
        # format description.
        # A prefix test is equivalent to the former '^<code>\d*' regex search.
        is_on_street = self.homes['CODE'].str.startswith(code, na=False)
        names = []
        for raw_names in self.homes.loc[is_on_street, 'NAME']:
            names.extend(raw_names.split(','))
        return {
            'level_6_name': names,
            'level_6_short_name': 'ДОМ'
        }

    def to_long_name(self, short_name):
        """Return the ``SOCRNAME`` of the first row whose ``SCNAME`` equals ``short_name``.

        Returns None when the abbreviation is unknown or the lookup columns
        are missing.
        """
        table = self.short_names
        try:
            return table.loc[table['SCNAME'] == short_name, 'SOCRNAME'].iat[0]
        except (IndexError, KeyError):
            return None

    def _long_type_name(self, short_name, cache):
        """Return the expanded type name as text, falling back to the abbreviation itself.

        ``cache`` memoizes results per abbreviation, because the same few
        abbreviations repeat on almost every row.
        """
        if short_name not in cache:
            long_name = self.to_long_name(short_name)
            if long_name is None or pd.isnull(long_name):
                # Unknown abbreviation: keep it rather than emit the text "None".
                cache[short_name] = str(short_name)
            else:
                cache[short_name] = str(long_name)
        return cache[short_name]

    def _level_texts(self, record, level, cache):
        """Return ``(short_text, long_text)`` for one level of a street record.

        Returns None when the record has no name for that level.
        """
        name = record.get('level_{}_name'.format(level))
        if pd.isnull(name):
            return None
        short_name = record.get('level_{}_short_name'.format(level))
        name_text = str(name)
        short_type = str(short_name)
        long_type = self._long_type_name(short_name, cache)
        if level in self._NAME_FIRST_LEVELS:
            return (
                ' '.join([name_text, short_type]),
                ' '.join([name_text, long_type]),
            )
        return (
            ' '.join([short_type, name_text]),
            ' '.join([long_type, name_text]),
        )

    def _iter_street_parts(self, progress_step):
        """Yield ``(parts, homes)`` for every row of ``streets_df`` that has houses.

        ``parts`` maps level 1-5 to the ``(short_text, long_text)`` pair of
        that level or None; ``homes`` is the row's list of house strings.
        Progress is printed every ``progress_step`` rows.
        """
        records = self.streets_df.to_dict('records')
        total = len(records)
        cache = {}
        for cnt, record in enumerate(records, start=1):
            if cnt % progress_step == 0:
                print('calculate', (cnt/total)*100, '%')
            homes = record.get('level_6_name')
            if not isinstance(homes, (list, tuple)):
                # No house list for this street (missing value): nothing to emit.
                continue
            # The level texts do not depend on the house, so build them once per street.
            parts = {
                level: self._level_texts(record, level, cache)
                for level in range(1, 6)
            }
            yield parts, homes

    def create_yandex_data(self):
        """Build ``yandex_addresses_short`` and ``yandex_addresses_long``.

        Each address is the present levels 1-5 joined with ``', '``, followed
        by a space and the house. The short variant uses the type
        abbreviations, the long variant their expanded names. Both results
        are one-column (``name``) DataFrames.
        """
        short_data_list = []
        long_data_list = []
        for parts, homes in self._iter_street_parts(progress_step=1000):
            present = [parts[level] for level in (1, 2, 3, 4, 5) if parts[level] is not None]
            short_prefix = ', '.join(texts[0] for texts in present)
            long_prefix = ', '.join(texts[1] for texts in present)
            for home in homes:
                short_data_list.append(' '.join([short_prefix, str(home)]))
                long_data_list.append(' '.join([long_prefix, str(home)]))

        self.yandex_addresses_short = pd.DataFrame(short_data_list, columns=['name'])
        self.yandex_addresses_long = pd.DataFrame(long_data_list, columns=['name'])

    def create_google_data(self):
        """Build ``google_addresses_short`` and ``google_addresses_long``.

        Each address is street (level 5), house, the present levels 1-4 and
        ``'Россия'``, joined with ``', '``. The short variant uses the type
        abbreviations, the long variant their expanded names. Both results
        are one-column (``name``) DataFrames.
        """
        short_data_list = []
        long_data_list = []
        for parts, homes in self._iter_street_parts(progress_step=100):
            street = parts[5]
            short_head = [street[0]] if street is not None else []
            long_head = [street[1]] if street is not None else []
            parents = [parts[level] for level in (1, 2, 3, 4) if parts[level] is not None]
            short_tail = [texts[0] for texts in parents] + ['Россия']
            long_tail = [texts[1] for texts in parents] + ['Россия']
            for home in homes:
                short_data_list.append(', '.join(short_head + [str(home)] + short_tail))
                long_data_list.append(', '.join(long_head + [str(home)] + long_tail))

        self.google_addresses_short = pd.DataFrame(short_data_list, columns=['name'])
        self.google_addresses_long = pd.DataFrame(long_data_list, columns=['name'])

    @staticmethod
    def _parent_fields(level, found):
        """Turn a parent lookup result into ``level_N_*`` fields.

        ``found`` is None when the level does not apply to the street, which
        gives an empty dict. An empty lookup result raises LookupError so the
        caller can record the street as an anomaly.
        """
        if found is None:
            return {}
        if found.empty:
            raise LookupError('no record found for level {}'.format(level))
        first = found.iloc[0]
        return {
            'level_{}_name'.format(level): first['NAME'],
            'level_{}_short_name'.format(level): first['SOCR'],
            'level_{}_code'.format(level): first['CODE'],
        }

    def calculate(self):
        """Build ``streets_df`` from the source tables and write the cache.

        For every street the parent levels 4, 3, 2, 1 and the house list are
        looked up. A street for which any step fails is skipped; a message is
        appended to ``anomaly`` and printed. Progress is printed every 1000
        streets.
        """
        total = len(self.streets)
        parent_finders = (
            (4, self.find_level_4),
            (3, self.find_level_3),
            (2, self.find_level_2),
            (1, self.find_level_1),
        )
        for cnt, (index, row) in enumerate(self.streets.iterrows(), start=1):
            if cnt % 1000 == 0:
                print('calculate', cnt/total*100, '%')
            # Reset per street so the anomaly message never reports the code
            # of a previous row (or fails on an unbound name).
            code = None
            try:
                row_dict = {
                    'level_5_name': row['NAME'],
                    'level_5_short_name': row['SOCR'],
                    'level_5_code': row['CODE'],
                }
                # Parent code: the street code without the 4 street digits
                # that precede its last 2 digits.
                code = row['CODE'][0:-6]+row['CODE'][-2:]
                for level, finder in parent_finders:
                    row_dict.update(self._parent_fields(level, finder(code)))
                homes = self.find_level_6(row['CODE'])
                if homes is not None:
                    row_dict.update(homes)
                self.streets_list_dict.append(
                    {
                        'index': index,
                        'row': row_dict
                    }
                )
            except Exception:
                # Exception (not a bare except) so Ctrl-C still stops the run.
                message = 'Except. Index:{}, CODE:{}'.format(index, code)
                self.add_anomaly(message)
                print(message)
                continue
        self.streets_df = pd.DataFrame(
            [entry['row'] for entry in self.streets_list_dict],
            [entry['index'] for entry in self.streets_list_dict]
        )
        self.streets_list_dict.clear()
        self.save_df()

# Build the per-street table for the selected region. The constructor loads
# data/<region>_region_df.pkl when that file exists and computes it otherwise.
run = KladrFinder(kladr, streets, homes, short_names, region=FIND_REGION)

## Запуск формирования данных в формате Yandex

In [52]:
# Render the Yandex-style address lists and export both variants to CSV.
run.create_yandex_data()
save_to_csv(run.yandex_addresses_long, f'{FIND_REGION}_region_yandex_long.csv')
save_to_csv(run.yandex_addresses_short, f'{FIND_REGION}_region_yandex_short.csv')

calculate 6.4114893889850615 %
calculate 12.822978777970123 %
calculate 19.234468166955185 %
calculate 25.645957555940246 %
calculate 32.0574469449253 %
calculate 38.46893633391037 %
calculate 44.88042572289543 %
calculate 51.29191511188049 %
calculate 57.70340450086555 %
calculate 64.1148938898506 %
calculate 70.52638327883567 %
calculate 76.93787266782074 %
calculate 83.3493620568058 %
calculate 89.76085144579086 %
calculate 96.17234083477591 %


## Запуск формирования данных в формате Google

In [53]:
# Render the Google-style address lists and export both variants to CSV.
run.create_google_data()
save_to_csv(run.google_addresses_long, f'{FIND_REGION}_region_google_long.csv')
save_to_csv(run.google_addresses_short, f'{FIND_REGION}_region_google_short.csv')


calculate 0.6411489388985061 %
calculate 1.2822978777970122 %
calculate 1.9234468166955183 %
calculate 2.5645957555940244 %
calculate 3.2057446944925307 %
calculate 3.8468936333910366 %
calculate 4.488042572289543 %
calculate 5.129191511188049 %
calculate 5.770340450086555 %
calculate 6.4114893889850615 %
calculate 7.052638327883567 %
calculate 7.693787266782073 %
calculate 8.33493620568058 %
calculate 8.976085144579086 %
calculate 9.617234083477593 %
calculate 10.258383022376098 %
calculate 10.899531961274604 %
calculate 11.54068090017311 %
calculate 12.181829839071616 %
calculate 12.822978777970123 %
calculate 13.46412771686863 %
calculate 14.105276655767135 %
calculate 14.746425594665642 %
calculate 15.387574533564147 %
calculate 16.02872347246265 %
calculate 16.66987241136116 %
calculate 17.311021350259665 %
calculate 17.952170289158172 %
calculate 18.59331922805668 %
calculate 19.234468166955185 %
calculate 19.875617105853692 %
calculate 20.516766044752195 %
calculate 21.157914983