# Импорты

In [1]:
import geojson
import numpy as np
import pandas as pd
import geopandas as gpd

# Данные по городам из OpenStreetMap

In [2]:
# Load the raw OpenStreetMap export (a GeoJSON file read as UTF-8) into memory.
with open('data/osm_data_raw.geojson', mode='r', encoding='utf-8') as file:
    osm_data = geojson.load(file)

In [3]:
# Inspect the top-level keys of the loaded GeoJSON object.
osm_data.keys()

dict_keys(['type', 'generator', 'copyright', 'timestamp', 'features'])

In [4]:
# Number of features in the export.
len(osm_data['features'])

78100

In [5]:
# Build a GeoDataFrame from the feature list and preview the first rows.
osm_data_df = gpd.GeoDataFrame.from_features(osm_data['features'])
osm_data_df.head()

Unnamed: 0,geometry,@id,admin_level,capital,name,name:ar,name:be,name:ca,name:cs,name:da,...,old_name:mo,old_name:sg,abandoned:village,addr:reg,old_name:inh,alt_official_name,KN,alt_name:lbe,address,tourism
0,POINT (82.92345 55.02822),node/54,3.0,3,Новосибирск,نوفوسيبيرسك,Новасібірск,Novossibirsk,Novosibirsk,Novosibirsk,...,,,,,,,,,,
1,POINT (30.52414 50.45003),node/26150422,,yes,Київ,كييف,Кіеў,Kíiv,Kyjev,Kyiv,...,,,,,,,,,,
2,POINT (31.29433 51.49410),node/26150436,,4,Чернігів,,Чарнігаў,Txerníhiv,,,...,,,,,,,,,,
3,POINT (30.73928 46.48732),node/26150437,,4,Одеса,أوديسا,Адэса,Odessa,Oděsa,,...,,,,,,,,,,
4,POINT (24.03159 49.84195),node/26150791,4.0,4,Львів,لفيف,Львоў,Lviv,Lvov,,...,,,,,,,,,,


In [6]:
# Names of the OSM columns of interest.
# NOTE(review): this list is not referenced by any of the cells visible below;
# confirm whether it is still needed.
properties = ['addr:country', 'name', 'name:ru', 'place', 'population', 'geometry', ]

In [7]:
# Start an empty frame that shares the row index of osm_data_df, so that the
# columns assigned later align row-by-row with the source data.
osm_cities_df = pd.DataFrame(index=osm_data_df.index)
osm_cities_df.head()

0
1
2
3
4


In [8]:
# Add the text columns in one step; every value is stripped of surrounding
# whitespace. For the name, the 'name:ru' value is used when present and the
# generic 'name' tag fills the gaps.
osm_cities_df = osm_cities_df.assign(
    country=osm_data_df['addr:country'].str.strip(),
    type=osm_data_df['place'].str.strip(),
    name=osm_data_df['name:ru'].combine_first(osm_data_df['name']).str.strip(),
)
osm_cities_df.describe()

Unnamed: 0,country,type,name
count,31467,78100,78034
unique,4,5,49646
top,RU,village,Александровка
freq,25229,68243,199


In [9]:
def parse_population(input: str):
    """Extract an integer population from a raw ``population`` value.

    Everything from the first ``(`` onward is discarded (e.g. a trailing
    ``(2010)`` annotation), then every character that is not a decimal digit
    is dropped, so ``'12 345'`` parses as ``12345``.

    NOTE(review): decimal separators are dropped as well, so the string
    ``'1234.0'`` parses as ``12340`` -- confirm such values do not occur.

    Args:
        input: Raw value, normally a string. Non-string values are tolerated:
            finite numbers are truncated to ``int``; ``None``, NaN and
            anything else that cannot be read as a number give NaN.
            (The parameter name shadows the ``input`` builtin; it is kept so
            existing callers are unaffected.)

    Returns:
        The population as ``int``, or ``np.nan`` when no digits are found.
    """
    if not isinstance(input, str):
        # Missing values or already-numeric cells: do not fail on .split().
        try:
            numeric = float(input)
        except (TypeError, ValueError):
            return np.nan
        return int(numeric) if np.isfinite(numeric) else np.nan

    head = input.split('(')[0]
    # str.isdecimal rather than str.isdigit: isdigit also accepts characters
    # such as '²', which int() rejects with ValueError.
    digits = ''.join(ch for ch in head if ch.isdecimal())
    return int(digits) if digits else np.nan

# Parse the raw population values into numbers, one step per line.
osm_cities_df['population'] = (
    osm_data_df['population']
    .astype(str)             # cast to text before parsing
    .map(parse_population)
    .replace(0, np.nan)      # a parsed value of 0 is stored as missing
)
osm_cities_df['population'].describe()

count    5.183900e+04
mean     3.935823e+03
std      6.994596e+04
min      1.000000e+00
25%      2.780000e+02
50%      5.260000e+02
75%      1.127000e+03
max      1.265505e+07
Name: population, dtype: float64

In [10]:
# Point geometries store coordinates as (x, y) = (longitude, latitude): the
# first row of the source data is POINT (82.92345 55.02822) for Novosibirsk,
# i.e. x is the longitude and y is the latitude.
# The 'latitude' column is still created first so the column order is unchanged.
osm_cities_df['latitude'] = osm_data_df['geometry'].y
osm_cities_df['longitude'] = osm_data_df['geometry'].x

In [11]:
# Keep only the rows whose name is present and non-empty.
osm_cities_df = osm_cities_df.loc[
    osm_cities_df['name'].notna() & osm_cities_df['name'].str.len().ne(0)
]

In [12]:
# One row per name: within each name the rows are ordered by population,
# largest first, and only the first row of each name is kept.
# NOTE(review): duplicates are detected on 'name' alone, so different places
# that share a name collapse into a single row -- confirm this is intended.
osm_cities_df = osm_cities_df.sort_values(
    by=['name', 'population'],
    ascending=[True, False],
).drop_duplicates(subset='name', keep='first')

In [13]:
# Summarise the cleaned frame: row count, non-null counts and dtypes per column.
osm_cities_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49645 entries, 77859 to 60681
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     21493 non-null  object 
 1   type        49645 non-null  object 
 2   name        49645 non-null  object 
 3   population  33998 non-null  float64
 4   latitude    49645 non-null  float64
 5   longitude   49645 non-null  float64
dtypes: float64(3), object(3)
memory usage: 2.7+ MB


In [14]:
# Write the result to CSV; index=False leaves the row index out of the file.
osm_cities_df.to_csv('data/cities_info.csv', index=False)