# Получение данных об инфраструктуре поблизости от квартиры

## Импорт библиотек

In [1]:
import requests
import json
import time
import pandas as pd
import numpy as np

## Открытие данных

In [2]:
# Load the listings table (';'-separated CSV); it carries the lat/lon columns used below.
listings_path = '../clean_categorizated_data_without_outliers_with_coordinates.csv'
df = pd.read_csv(listings_path, sep=';')

In [3]:
# Display three randomly chosen rows as a quick look at the loaded table.
df.sample(3)

Unnamed: 0,number of rooms,area of apartment,number of floors,apartment floor,price,year of construction,elevator,full address,concierge,garbage chute,...,district_Ленинский,district_Мотовилихинский,district_Орджоникидзевский,district_Свердловский,parking_за шлагбаумом во дворе,parking_открытая во дворе,parking_наземная многоуровневая,parking_подземная,lat,lon
506,2,77.0,10,6,4300000.0,2006,1.0,"Пермский край, Пермь, Кировоградская ул., 6",0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.993407,55.944284
610,1,36.4,5,4,2250000.0,1964,0.0,"Пермский край, Пермь, шоссе Космонавтов, 203А",0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.971764,56.158114
189,3,72.0,11,5,5200000.0,2003,1.0,"Пермский край, Пермь, ул. КИМ, 49",0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,58.024764,56.30116


In [4]:
# Print the column list with non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1767 entries, 0 to 1766
Data columns (total 38 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   number of rooms                  1767 non-null   int64  
 1   area of apartment                1767 non-null   float64
 2   number of floors                 1767 non-null   int64  
 3   apartment floor                  1767 non-null   int64  
 4   price                            1767 non-null   float64
 5   year of construction             1767 non-null   int64  
 6   elevator                         1767 non-null   float64
 7   full address                     1767 non-null   object 
 8   concierge                        1767 non-null   int64  
 9   garbage chute                    1767 non-null   int64  
 10  repair_дизайнерский              1767 non-null   float64
 11  repair_евро                      1767 non-null   float64
 12  repair_косметический

## Парсинг данных координат для определенных категорий

Запускаем алгоритм парсинга несколько раз, чтобы спарсить все подкатегории

In [77]:
# Feature category -> OSM `amenity` tag values that belong to it.
_amenity_tags_by_category = (
    ("edu", ("college", "kindergarten", "library", "school", "university")),
    ("health", ("clinic", "hospital")),
    ("culture", ("cinema", "fountain", "theatre")),
    ("eat", ("cafe", "fast_food", "pub", "restaurant")),
)

# Per category: the tags still to be fetched ("amenities") and the
# (lat, lon) pairs collected so far ("data"). Each category gets its own
# fresh list and set.
all_categories = {
    category_name: {"amenities": list(tags), "data": set()}
    for category_name, tags in _amenity_tags_by_category
}

In [90]:
overpass_url = "http://overpass-api.de/api/interpreter"
# Seconds to wait for one Overpass response before giving up on that amenity.
overpass_timeout_s = 180

# Download the coordinates of every amenity tag listed in `all_categories`.
#
# The cell is re-runnable: a tag is removed from its category's "amenities"
# list only after it was fetched successfully, so a re-run retries just the
# tags that failed. Once a category has no tags left, its entry is replaced
# by the plain set of collected (lat, lon) pairs.
#
# NOTE(review): only `node` elements are requested, so amenities mapped in
# OSM as ways/relations (e.g. building outlines) are not included.
for category in all_categories:
    entry = all_categories[category]
    # Already finalised on a previous run (entry is the coordinate set itself).
    if isinstance(entry, set):
        continue

    # Iterate over a snapshot: the live list is shrunk below, and removing
    # items from a list while iterating over it silently skips elements.
    for amenity in list(entry['amenities']):
        # Pause between requests to the shared public endpoint.
        time.sleep(3)
        # The four numbers are the Overpass bounding box: south, west, north, east.
        overpass_query = f"""
        [out:json];
        node(57.5,55,58.3,56.6)[amenity={amenity}];out;
        """
        try:
            response = requests.get(overpass_url,
                                    params={'data': overpass_query},
                                    timeout=overpass_timeout_s)
            # Turn HTTP errors (e.g. rate limiting) into an exception instead
            # of trying to parse an error page as JSON.
            response.raise_for_status()
            points = [
                (element['lat'], element['lon'])
                for element in response.json()['elements']
                if element['type'] == 'node'
            ]
        except (requests.RequestException, ValueError, KeyError) as ex:
            # Best effort: report the failure and keep the tag in the list so
            # that re-running the cell retries it.
            print(amenity)
            print(ex)
            continue

        entry['data'].update(points)
        entry['amenities'].remove(amenity)

    # Every tag of this category has been fetched: keep only the coordinates.
    if not entry['amenities']:
        all_categories[category] = entry['data']

In [97]:
# Print each category name together with the number of items stored under it.
for name, collected in all_categories.items():
    print(f"{name}: {len(collected)}")

edu: 137
health: 86
culture: 20
eat: 552


In [98]:
# JSON has no set type, so replace every category's collection with a list
# (in place, keeping the same dict object).
all_categories.update(
    {name: list(collected) for name, collected in all_categories.items()}
)

In [99]:
# Store the collected coordinates on disk as JSON.
with open("../amenities.json", 'w') as out_file:
    out_file.write(json.dumps(all_categories))

In [5]:
# Read the stored amenity coordinates back:
# a dict mapping category name -> list of [lat, lon] pairs.
with open("../amenities.json", 'r') as in_file:
    data = json.loads(in_file.read())

In [6]:
# Print how many coordinate pairs were loaded for each category.
for name, points in data.items():
    print(f"{name}: {len(points)}")

edu: 137
health: 86
culture: 20
eat: 552


## Расстояние между объектами по координатам

In [7]:
from math import sin, cos, asin, sqrt, pi

def get_dist(llong1, llat1, llong2, llat2, rad=6372795):
    """Return the great-circle distance between two points (haversine formula).

    Note the argument order: for each point the longitude comes BEFORE the
    latitude.

    Args:
        llong1: Longitude of the first point, in degrees.
        llat1: Latitude of the first point, in degrees.
        llong2: Longitude of the second point, in degrees.
        llat2: Latitude of the second point, in degrees.
        rad: Radius of the sphere; the result is expressed in the same unit.
            The default, 6372795, is the Earth radius in metres.

    Returns:
        The distance along the surface of the sphere, in the unit of ``rad``
        (metres by default). NaN inputs produce NaN.
    """
    # Degrees -> radians.
    lat1 = llat1*pi/180.
    lat2 = llat2*pi/180.
    long1 = llong1*pi/180.
    long2 = llong2*pi/180.

    # Differences of the longitudes and of the latitudes.
    delta_long = long2 - long1
    delta_lat = lat2 - lat1

    # Haversine term; mathematically it lies within [0, 1].
    hav = sin(delta_lat/2)**2 + cos(lat1)*cos(lat2)*sin(delta_long/2)**2
    # Floating-point rounding can push it marginally outside that range for
    # (near-)antipodal points, which would make sqrt/asin raise ValueError.
    # The argument order keeps NaN as NaN instead of clamping it to a number.
    hav = min(max(hav, 0.0), 1.0)

    # Central angle times the radius gives the arc length.
    ad = 2 * asin(sqrt(hav))
    dist = ad*rad

    return dist

## Генерация новых признаков - услуг вблизи квартиры

Для каждой категории вычисляем расстояние от квартиры до каждого объекта, где предоставляют услугу данной категории, и считаем количество таких объектов в радиусе 5 км

In [24]:
# Work on a copy so the frame that was loaded from disk stays untouched.
new_df = df.copy()
# Counter matrix: one row per apartment, one column per amenity category.
# The width follows the categories actually present in `data` instead of a
# hard-coded 4, so it stays correct if categories are added or removed.
categories_array = np.zeros(shape=(len(new_df), len(data)))

In [27]:
# An amenity counts as "nearby" when get_dist() puts it closer than this
# (metres with get_dist's default radius).
nearby_radius_m = 5000

# For every apartment and every category, count the amenities that lie
# within the radius.
# enumerate() yields positional row numbers, which match the rows of
# categories_array regardless of how new_df happens to be indexed.
for row_pos, (appartment_lat, appartment_lon) in enumerate(
        zip(new_df['lat'], new_df['lon'])):
    for cat_id, category in enumerate(data):
        # Assign the count (rather than += 1 per hit) so that re-running this
        # cell cannot double-count on top of a previous run.
        categories_array[row_pos, cat_id] = sum(
            get_dist(
                llong1=appartment_lon,
                llat1=appartment_lat,
                llong2=object_lon,
                llat2=object_lat
            ) < nearby_radius_m
            # Each stored item is a (lat, lon) pair.
            for object_lat, object_lon in data[category]
        )

In [29]:
# Add one column per category, named after the category and filled with the
# matching column of the counter matrix.
for column_idx, category_name in enumerate(data):
    per_apartment_counts = categories_array[:, column_idx]
    new_df[category_name] = per_apartment_counts

In [33]:
# Display five randomly chosen rows to eyeball the new per-category count columns.
new_df.sample(5)

Unnamed: 0,number of rooms,area of apartment,number of floors,apartment floor,price,year of construction,elevator,full address,concierge,garbage chute,...,parking_за шлагбаумом во дворе,parking_открытая во дворе,parking_наземная многоуровневая,parking_подземная,lat,lon,edu,health,culture,eat
1737,1,38.5,19,4,3500000.0,2016,1.0,"Пермский край, Пермь, Уфимская ул., 10А",1,1,...,0.0,0.0,0.0,0.0,57.974974,56.23153,49.0,42.0,11.0,364.0
1697,2,37.0,9,6,1600000.0,1986,1.0,"Пермский край, Пермь, Хабаровская ул., 173",0,1,...,0.0,0.0,0.0,0.0,58.043259,56.088069,7.0,0.0,0.0,15.0
424,1,37.3,6,5,2800000.0,2017,0.0,"Пермский край, Пермь, Лядовская ул., 127Б",0,0,...,0.0,0.0,0.0,0.0,58.035921,56.359267,18.0,3.0,0.0,25.0
666,1,23.7,5,2,1820000.0,1969,0.0,"Пермский край, Пермь, ул. Докучаева, 22",0,0,...,0.0,0.0,0.0,0.0,58.038414,56.133325,8.0,2.0,1.0,38.0
564,1,41.6,23,8,3800000.0,2021,2.0,"Пермский край, Пермь, ул. Комбайнёров, 43",1,0,...,0.0,0.0,0.0,0.0,57.974728,56.18516,18.0,27.0,9.0,206.0


## Генерация новых признаков - расстояние до центра

In [34]:
# Latitude / longitude (degrees) of the Perm esplanade, the reference point
# for the "distance to the centre" feature.
perm_esplanade_lat, perm_esplanade_lon = 58.010455, 56.229443

In [35]:
# Distance from every apartment to the city-centre reference point, as
# returned by get_dist() and rounded to 2 decimals.
#
# The column is created in a single assignment so it holds floats from the
# start. Creating it as the integer 0 and then writing floats cell by cell
# through .loc relies on silent int -> float upcasting, which recent pandas
# versions deprecate, and is also much slower.
new_df['distance'] = [
    round(
        get_dist(
            llong1=appartment_lon,
            llat1=appartment_lat,
            llong2=perm_esplanade_lon,
            llat2=perm_esplanade_lat
        ),
        2
    )
    for appartment_lat, appartment_lon in zip(new_df['lat'], new_df['lon'])
]

## Итоги генерации признаков

In [37]:
# Print the column list with non-null counts and dtypes, now including the generated features.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1767 entries, 0 to 1766
Data columns (total 43 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   number of rooms                  1767 non-null   int64  
 1   area of apartment                1767 non-null   float64
 2   number of floors                 1767 non-null   int64  
 3   apartment floor                  1767 non-null   int64  
 4   price                            1767 non-null   float64
 5   year of construction             1767 non-null   int64  
 6   elevator                         1767 non-null   float64
 7   full address                     1767 non-null   object 
 8   concierge                        1767 non-null   int64  
 9   garbage chute                    1767 non-null   int64  
 10  repair_дизайнерский              1767 non-null   float64
 11  repair_евро                      1767 non-null   float64
 12  repair_косметический

In [39]:
# Write the enriched table as a ';'-separated CSV, without the index column.
enriched_path = '../clean_categorizated_data_without_outliers_with_coordinates_and_amenities.csv'
new_df.to_csv(enriched_path, sep=';', index=False)

__В ходе пятой обработки были:__
1. добавлены расстояния до центра
2. добавлены признаки количества объектов инфраструктуры (образование, здравоохранение, культура, питание) в радиусе 5 км