In [75]:
import pandas as pd
import pickle

## Export to Pandas DataFrame
Для того чтобы быстрее взаимодействовать с данными, переводим их в DataFrame, а также сохраняем в бинарном виде с помощью модуля pickle.

In [76]:
def save_pickle(data, file:str):
    """Serialize ``data`` with pickle and write it to the path ``file``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(file, mode='wb') as sink:
        pickle.Pickler(sink).dump(data)

def load_pickle(file):
    """Read the file at path ``file`` and return the unpickled object.

    Caution: unpickling can execute arbitrary code, so only use this on
    files that come from a trusted source.
    """
    with open(file, mode='rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored
    
def save_to_csv(data, file_name):
    """Write the DataFrame ``data`` to ``file_name`` as CSV.

    The file is UTF-16 encoded, uses ``;`` as the field separator and does
    not contain the DataFrame index.
    """
    export_options = {
        'encoding': 'utf-16',
        'sep': ';',
        'index': False,
    }
    data.to_csv(file_name, **export_options)
            
def save_to_xlsx(data, file_name):
    """Write a sequence of record dicts to an Excel file.

    Args:
        data: sequence of dicts, one per row. The keys of the first record
            define the columns (and their order); keys that appear only in
            later records are dropped.
        file_name: destination path passed to ``DataFrame.to_excel``.

    An empty ``data`` produces an empty sheet instead of failing on
    ``data[0]``.
    """
    # Only look at the first record when there is one.
    label = list(data[0].keys()) if len(data) > 0 else None
    pd_frame = pd.DataFrame(
        data,
        columns=label
    )
    pd_frame.to_excel(file_name)

In [77]:
# Region code used as a prefix when building the names of the per-region
# pickle files and of the exported CSV file.
FIND_REGION = '50'

In [78]:
# from dbfread import DBF

# ALTNAMES = DBF('data/ALTNAMES.dbf')
# DOMA = DBF('data/DOMA.dbf')
# KLADR = DBF('data/KLADR.dbf')
# SOCRBASE = DBF('data/SOCRBASE.dbf')
# STREET = DBF('data/STREET.dbf')
# NAMEMAP = DBF('data/NAMEMAP.dbf')

In [79]:
# kladr = pd.DataFrame(iter(KLADR))
# streets = pd.DataFrame(iter(STREET))
# homes = pd.DataFrame(iter(DOMA))
# short_names = pd.DataFrame(iter(SOCRBASE))
# altnames = pd.DataFrame(iter(ALTNAMES))
# namemap = pd.DataFrame(iter(NAMEMAP))

In [80]:

# regexp_1 = '^'+FIND_REGION+'\d*00$'
# regexp_2 = '^'+FIND_REGION+'\d*'

# first_kladr = kladr[kladr['CODE'].str.contains(regexp_1)]
# first_streets = streets[streets['CODE'].str.contains(regexp_1)]
# first_homes = homes[homes['CODE'].str.contains(regexp_2)]
# first_altnames = altnames[altnames['NEWCODE'].str.contains(regexp_2)]
# first_namemap = namemap[namemap['CODE'].str.contains(regexp_2)]


In [81]:
# save_pickle(first_kladr, 'data/'+FIND_REGION+'_region_kladr.pkl')
# save_pickle(first_streets, 'data/'+FIND_REGION+'_region_streets.pkl')
# save_pickle(first_homes, 'data/'+FIND_REGION+'_region_homes.pkl')
# save_pickle(first_altnames, 'data/'+FIND_REGION+'_region_altnames.pkl')
# save_pickle(first_namemap, 'data/'+FIND_REGION+'_region_namemap.pkl')
# save_pickle(short_names, 'data/short_names.pkl')


### Загружаем DataFrame

In [82]:
# Load the previously pickled DataFrames for the selected region.
# The abbreviations table (short_names) is not region-specific.
# NOTE(review): pickle files must come from a trusted source, since
# unpickling can execute arbitrary code.
kladr:pd.DataFrame = load_pickle('data/'+FIND_REGION+'_region_kladr.pkl')
streets:pd.DataFrame = load_pickle('data/'+FIND_REGION+'_region_streets.pkl')
homes:pd.DataFrame = load_pickle('data/'+FIND_REGION+'_region_homes.pkl')
short_names:pd.DataFrame = load_pickle('data/short_names.pkl')

## Выбираем только тестовые улицы

In [83]:
# test_streets = streets[2000:2010]
# test_streets.head()

In [84]:
from numpy import NaN


class KladrFinder():
    """Resolve the parent KLADR levels of every street and build address strings.

    ``calculate`` walks ``streets`` and, for each street code, looks up the
    parent records in ``kladr`` (levels 1-4) and the matching entries of
    ``homes`` (level 6); the result is stored in ``streets_df``.
    ``create_yandex_data`` then flattens ``streets_df`` into one address
    string per level-6 entry and stores it in ``yandex_addresses``.

    Streets that could not be processed are described in ``anomaly``.
    """

    def __init__(
            self,
            kladr: pd.DataFrame,
            streets: pd.DataFrame,
            homes: pd.DataFrame,
            short_names: pd.DataFrame
        ) -> None:
        """Store the source tables and create empty result containers.

        Args:
            kladr: table with ``CODE``, ``NAME`` and ``SOCR`` columns.
            streets: table with ``CODE``, ``NAME`` and ``SOCR`` columns.
            homes: table with ``CODE`` and ``NAME`` columns; ``NAME`` holds
                comma-separated values.
            short_names: stored on the instance; not read by this class.
        """
        self.kladr = kladr
        self.streets = streets
        self.homes = homes
        self.short_names = short_names

        # level_1 .. level_5 each have name / short_name / code;
        # level_6 has only name / short_name.
        street_df_columns = [
            f'level_{level}_{suffix}'
            for level in range(1, 6)
            for suffix in ('name', 'short_name', 'code')
        ] + ['level_6_name', 'level_6_short_name']
        self.streets_df = pd.DataFrame(columns=street_df_columns)

        # Scratch buffer used by calculate(); emptied again when it finishes.
        self.streets_list_dict = []

        # Descriptions of streets that calculate() had to skip.
        self.anomaly = []

        # Filled by create_yandex_data(); defined here so the attribute
        # always exists.
        self.yandex_addresses = pd.DataFrame(columns=['name'])

    def add_anomaly(self, message):
        """Append ``message`` to the ``anomaly`` list."""
        self.anomaly.append(message)

    def find_level_1(self, code):
        """Look up the level-1 record (``region`` in ``calculate``).

        The lookup key is the first 2 characters of ``code`` padded with
        11 zeros. Always returns a DataFrame, possibly empty.
        """
        return self.find_kladr_for_code(code[:2] + '0' * 11)

    def find_level_2(self, code):
        """Look up the level-2 record (``district`` in ``calculate``).

        Returns ``None`` when characters 2-4 of ``code`` are ``'000'``;
        otherwise the key is the first 5 characters padded with 8 zeros.
        """
        if code[2:5] == '000':
            return None
        return self.find_kladr_for_code(code[:5] + '0' * 8)

    def find_level_3(self, code):
        """Look up the level-3 record (``city`` in ``calculate``).

        Returns ``None`` when characters 5-7 of ``code`` are ``'000'``;
        otherwise the key is the first 8 characters padded with 5 zeros.
        """
        if code[5:8] == '000':
            return None
        return self.find_kladr_for_code(code[:8] + '0' * 5)

    def find_level_4(self, code:str):
        """Look up the level-4 record (``locality`` in ``calculate``).

        Returns ``None`` when characters 8-10 of ``code`` are ``'000'``;
        otherwise the key is the first 11 characters padded with 2 zeros.
        """
        if code[8:11] == '000':
            return None
        return self.find_kladr_for_code(code[:11] + '0' * 2)

    def find_level_6(self, code):
        """Return the level-6 dict for ``code`` (see ``find_home_for_code``)."""
        return self.find_home_for_code(code)

    def find_kladr_for_code(self, code:str):
        """Return the ``kladr`` rows whose ``CODE`` contains ``code``.

        ``code`` is interpreted as a regular expression by
        ``Series.str.contains``. Rows with a missing ``CODE`` never match.
        The result has a fresh 0-based index.
        """
        matches = self.kladr['CODE'].str.contains(code, na=False)
        return self.kladr[matches].reset_index(drop=True)

    def find_home_for_code(self, code:str):
        """Collect the ``NAME`` values of ``homes`` whose ``CODE`` starts with ``code``.

        Each ``NAME`` is split on commas and all pieces are gathered into a
        single flat list.

        Returns:
            dict with ``level_6_name`` (list of str) and
            ``level_6_short_name`` (always ``'ДОМ'``).
        """
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        regexp = r'^' + code + r'\d*'
        homes_df = self.homes[self.homes['CODE'].str.contains(regexp, na=False)]
        names = [
            piece
            for packed in homes_df['NAME']
            for piece in packed.split(',')
        ]
        return {
            'level_6_name': names,
            'level_6_short_name': 'ДОМ'
        }

    def yandex_mask(
            self,
            l_1_name, l_1_short,
            l_2_name, l_2_short,
            l_3_name, l_3_short,
            l_4_name, l_4_short,
            l_5_name, l_5_short,
            l_6_name, l_6_short
        ):
        """Placeholder: ignores all arguments and returns an empty string."""
        return ''

    @staticmethod
    def _format_level(street, level, name_first):
        """Return 'name short' or 'short name' for one level, or None if absent."""
        name_key = f'level_{level}_name'
        if name_key not in street.index:
            return None
        name = street[name_key]
        if pd.isna(name):
            return None
        pair = [str(name), str(street[f'level_{level}_short_name'])]
        if not name_first:
            pair.reverse()
        return ' '.join(pair)

    def _address_prefix(self, street, include_level_2):
        """Join the available levels 1-5 of one ``streets_df`` row with ', '."""
        # Level 1 is mandatory and is joined without str() conversion.
        parts = [' '.join([street['level_1_name'], street['level_1_short_name']])]
        # (level, name_first): levels 3 and 4 put the abbreviation first.
        layout = [(3, False), (4, False), (5, True)]
        if include_level_2:
            layout.insert(0, (2, True))
        for level, name_first in layout:
            text = self._format_level(street, level, name_first)
            if text is not None:
                parts.append(text)
        return ', '.join(parts)

    def create_yandex_data(self, include_level_2: bool = False):
        """Build one address string per level-6 entry of ``streets_df``.

        Each string is the ', '-joined levels followed by a space and the
        level-6 value. Levels whose name is missing are left out. The result
        is stored in ``yandex_addresses`` as a DataFrame with a single
        ``name`` column.

        Args:
            include_level_2: level 2 (the district) is omitted by default;
                pass ``True`` to insert it right after level 1.
        """
        addresses = []
        for _, street in self.streets_df.iterrows():
            homes = list(street['level_6_name'])
            if not homes:
                # Nothing to emit for a street without level-6 entries.
                continue
            prefix = self._address_prefix(street, include_level_2)
            addresses.extend(' '.join([prefix, home]) for home in homes)
        self.yandex_addresses = pd.DataFrame(addresses, columns=['name'])

    def _collect_street_levels(self, row):
        """Build the level_1..level_6 dict for one row of ``streets``."""
        street_code = row['CODE']
        row_dict = {
            'level_5_name': row['NAME'],
            'level_5_short_name': row['SOCR'],
            'level_5_code': street_code,
        }
        # Drop the 4 characters that precede the last 2 so that the code has
        # the layout expected by the find_level_N lookups.
        code = street_code[0:-6] + street_code[-2:]
        lookups = (
            (4, self.find_level_4),
            (3, self.find_level_3),
            (2, self.find_level_2),
            (1, self.find_level_1),
        )
        for level, lookup in lookups:
            found = lookup(code)
            if found is None:
                continue
            # Raises IndexError when the lookup returned no rows; the caller
            # records this as an anomaly.
            first = found.iloc[0]
            row_dict.update({
                f'level_{level}_name': first['NAME'],
                f'level_{level}_short_name': first['SOCR'],
                f'level_{level}_code': first['CODE'],
            })
        homes = self.find_level_6(street_code)
        if homes is not None:
            row_dict.update(homes)
        return row_dict

    def calculate(self):
        """Fill ``streets_df`` with the resolved levels of every street.

        Progress is printed every 100 streets. A street whose processing
        raises an exception is skipped; the reason is printed and appended
        to ``anomaly``. The resulting DataFrame keeps the index labels of
        ``streets``.
        """
        total = len(self.streets)
        # Start from a clean buffer so an interrupted earlier run cannot
        # leak rows into this one.
        self.streets_list_dict.clear()
        for cnt, (index, row) in enumerate(self.streets.iterrows(), start=1):
            if cnt % 100 == 0:
                print('calculate', cnt/total*100, '%')
            try:
                row_dict = self._collect_street_levels(row)
            except Exception as exc:
                # Best effort: skip the street but keep a trace of the cause.
                self.add_anomaly(f'street index={index!r}: {exc!r}')
                print('Except', index, repr(exc))
                continue
            self.streets_list_dict.append(
                {
                    'index': index,
                    'row': row_dict
                }
            )
        self.streets_df = pd.DataFrame(
            [item['row'] for item in self.streets_list_dict],
            [item['index'] for item in self.streets_list_dict]
        )
        self.streets_list_dict.clear()
        
        
        

# Build the finder from the DataFrames loaded earlier in the notebook.
run = KladrFinder(kladr, streets, homes, short_names)

In [85]:
# Resolve the levels of every street; prints progress every 100 streets.
run.calculate()

calculate 0.15152892687213987 %
calculate 0.30305785374427974 %


In [None]:
# Flatten the resolved streets into address strings and export them to
# "<FIND_REGION>_region.csv" in the current working directory.
run.create_yandex_data()
save_to_csv(run.yandex_addresses, FIND_REGION+'_region.csv')