In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import geopandas as gpd
import tqdm
import dill
import overpass


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from geopy.geocoders import Nominatim

from functools import partial
import pyproj
from shapely.geometry  import Point
from shapely.ops import transform
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
proj_wgs84 = pyproj.Proj('+proj=longlat +datum=WGS84')
import copy

In [2]:
from sklearn.svm import SVR
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from geopy import distance

from sklearn.ensemble import RandomForestRegressor
import time

In [3]:
import networkx as nx
import osmnx as ox
ox.config(log_console=True, use_cache=True)
ox.__version__

'0.16.2'

In [4]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [5]:
name_to_epsg = {}
name_to_epsg['Балашиха'] = 32637
name_to_epsg['Екатеринбург'] = 32641
name_to_epsg['Казань'] = 32639
name_to_epsg['Красноярск'] = 32646
name_to_epsg['Москва'] = 32637
name_to_epsg['Нижний Новгород'] = 32638
name_to_epsg['Новосибирск'] = 32644
name_to_epsg['Ростов-на-Дону'] = 32637
name_to_epsg['Самара'] = 32639
name_to_epsg['Санкт-Петербург'] = 32636
name_to_epsg['Уфа'] = 32640

### Задача:

Прогноз продаж одной из популярных моделей [фичерфонов](https://ru.wikipedia.org/wiki/%D0%A4%D0%B8%D1%87%D0%B5%D1%80%D1%84%D0%BE%D0%BD) (на картинке ниже пример похожего устройства) в салонах МегаФона
![](https://39.img.avito.st/640x480/8468720439.jpg)

### Исходные данные:

Датасет содержит следующие поля:

1. `point_id` - Индентификатор салона
2. `lon` - Долгота точки
3. `lat` - Широта точки
4. `target` - Значение таргета, усредненное за несколько месяцев и отнормированное

### Требования к решению и советы:

Ниже приведен список из нескольких важных пунктов, необходимых для решения задания. Выполнение каждого из пунктов влияет на итоговую оценку. Вы можете выполнить каждый из пунктов разными способами, самым лучшим будет считаться вариант, когда всё получение и обработка данных будут реализованы на Питоне (пример: вы можете скачать данные из OSM через интерфейс на сайте overpass-turbo или с помощью библиотек `overpass`/`requests`. Оба варианта будут зачтены, но больше баллов можно заработать во втором случае)



1. Салоны расположены в нескольких разных городах, вам необходимо **определить город для каждого салона** (это понадобится во многих частях задания). К этому есть разные подходы. Вы можете провести [обратное геокодирование](https://en.wikipedia.org/wiki/Reverse_geocoding) с помощью геокодера [Nominatim](https://nominatim.org/), доступного через библиотеку `geopy` примерно вот так:
```python
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="specify_your_app_name_here")
location = geolocator.reverse("52.509669, 13.376294")
print(location.address)
```
В таком случае, вам придется обрабатывать полученную строку адреса, чтобы извлечь название города. Также вы можете скачать из OSM или найти в любом другом источнике границы административно территориальных границ России и пересечь с ними датасет с помощью `geopandas.sjoin` (этот вариант более надежный, но нужно будет разобраться с тем, как устроены границы АТД в OSM, обратите внимание на [этот тег](https://wiki.openstreetmap.org/wiki/Key:admin_level))


2. **Используйте данные OSM**: подумайте, какие объекты могут влиять на продажи фичерфонов. Гипотеза: такие телефоны покупают люди, приезжающие в город или страну ненадолго, чтобы вставить туда отдельную симкарту для роуминга. Можно попробовать использовать местоположения железнодорожных вокзалов (изучите [этот тег](https://wiki.openstreetmap.org/wiki/Tag:railway%3Dstation)). Необходимо использовать хотя бы 5 разных типов объектов из OSM. Скорее всего, вам придется качать данные OSM отдельно для разных городов (см. пример для Нью-Йорка из лекции)


3. **Используйте разные способы генерации признаков**: описать положение салона МегаФона относительно станций метро можно разными способами - найти ***расстояние до ближайшей станции***, или же посчитать, сколько станций попадает в ***500 метровую буферную зону*** вокруг салона. Такие признаки будут нести разную информацию. Так же попробуйте поэкспериментировать с размерами буферных зон (представьте, что значат в реальности радиусы 100, 500, 1000 метров). Попробуйте посчитать расстояние до центра города, до других объектов.

4. **Сделайте визуализации**: постройте 2-3 карты для какого нибудь из городов - как распределен в пространстве таргет, где находятся объекты, полученные вами из OSM. Можете использовать любой инструмент - обычный `plot()`, `folium`, `keplergl`. Если выберете Кеплер, обязательно сохраните в файл конфиг карты, чтобы ее можно было воспроизвести. Сделать это можно вот так:

```python
import json
json_data = kepler_map.config
with open('kepler_config.json', 'w') as outfile:
    json.dump(json_data, outfile)
```
5. Задание не ограничено приведенными выше пунктами, попробуйте нагенерировать интересных признаков, найти в интернете дополнительные данные (в таком случае в комментарии к коду укажите ссылку на ресурс, откуда взяли данные)



6. Это довольно сложная задача - датасет очень маленький, данные по своей природе довольно случайны. Поэтому место и скор на Kaggle не будут играть решающую роль в оценке, но позволят заработать дополнительные баллы

### Read data

In [3]:
train = pd.read_csv('data/mf_geo_train.csv')
test = pd.read_csv('data/mf_geo_test.csv')

In [10]:
train.head(2)

Unnamed: 0,point_id,lon,lat,target
0,ommNZCUV,37.590776,55.84863,-0.348157
1,nMe2LHPb,37.78421,55.750271,1.294206


In [11]:
test.head(2)

Unnamed: 0,point_id,lon,lat,target
0,F4lXR1cG,37.681242,55.74804,0.0091
1,4LJu4GTf,60.58091,56.79586,0.0091


### Define a city for each store

In [85]:
addresses_train = []
addresses_test = []
geolocator = Nominatim(user_agent="dmitry_geohw")

In [36]:
for i in tqdm.tqdm(range(len(train))):
    vals = train.iloc[i][['lon','lat']].values[::-1]
    coord = str(vals[0]) + ', ' + str(vals[1])
    addresses_train.append(geolocator.reverse(coord))

for i in tqdm.tqdm(range(len(test))):
    vals = test.iloc[i][['lon','lat']].values[::-1]
    coord = str(vals[0]) + ', ' + str(vals[1])
    addresses_test.append(geolocator.reverse(coord))

100%|██████████| 425/425 [03:32<00:00,  2.00it/s]
100%|██████████| 107/107 [00:53<00:00,  2.00it/s]


In [40]:
addresses_train = [str(i) for i in addresses_train]

In [41]:
addresses_test = [str(i) for i in addresses_test]

In [42]:
train['full_address'] = addresses_train
test['full_address'] = addresses_test

In [47]:
train.to_csv('data/train_1.csv')
test.to_csv('data/test_1.csv')

In [48]:
train = pd.read_csv('data/train_1.csv', index_col=0)
test = pd.read_csv('data/test_1.csv', index_col=0)

In [108]:
!git clone https://github.com/hflabs/city.git

fatal: destination path 'city' already exists and is not an empty directory.


In [69]:
cities = pd.read_csv('city/city.csv')

In [73]:
all_cities = []
for i in range(len(cities)):
    all_cities.append(cities['address'][i].split('г ')[-1])

In [76]:
all_cities = set(all_cities)

In [95]:
cities_train = []
for i, address, in enumerate(train['full_address']):
    counter = True
    for tmp in address.split(', '):
        if tmp in all_cities:
            if counter:
                cities_train.append(tmp)
                counter = False
cities_test = []
for i, address, in enumerate(test['full_address']):
    counter = True
    for tmp in address.split(', '):
        if tmp in all_cities:
            if counter:
                cities_test.append(tmp)
                counter = False

In [99]:
train['city'] = cities_train
test['city'] = cities_test

In [107]:
train.to_csv('data/train_2.csv')
test.to_csv('data/test_2.csv')

### Convert data to geopandas table

In [10]:
train = pd.read_csv('data/train_2.csv', index_col=0)
test = pd.read_csv('data/test_2.csv', index_col=0)

In [11]:
train.index = ['train_' + str(i) for i in range(len(train.index))]
test.index = ['test_' + str(i) for i in range(len(test.index))]

In [12]:
data = pd.concat((train, test), axis=0)

In [13]:
data['city'] = data['city'].replace(['Заречный'], 'Екатеринбург')
data['city'] = data['city'].replace(['Пионерский'], 'Екатеринбург')
data['city'] = data['city'].replace(['Сосновка'], 'Санкт-Петербург')
data['city'] = data['city'].replace(['Троицк'], 'Москва')
data['city'] = data['city'].replace(['Донской'], 'Москва')
data['city'] = data['city'].replace(['Калуга'], 'Казань')

In [14]:
geometry = [Point(lon, lat) for lon, lat in zip(data['lon'], data['lat'])]
geom_data = gpd.GeoDataFrame({'geometry':geometry}, index=data.index, crs='EPSG:4326') #WGS 84

In [15]:
geolocator = Nominatim(user_agent="dmitry_geohw")

In [16]:
city_to_location = {}

In [17]:
for city in set(data['city']):
    location = geolocator.geocode(city + ", Россия") 
    city_to_location[city] = location
    time.sleep(1)

### Download data for  cities from OSM and extract features for each city

In [18]:
def geodesic_point_buffer(lon, lat, km):
    # Azimuthal equidistant projection
    aeqd_proj = '+proj=aeqd +lat_0={lat} +lon_0={lon} +x_0=0 +y_0=0'
    project = partial(
        pyproj.transform,
        pyproj.Proj(aeqd_proj.format(lon=lon, lat=lat)),
        proj_wgs84)
    buf = Point(0, 0).buffer(km * 1000)  # distance in metres
    return transform(project, buf)

In [19]:
def extract_features(city, data, geometry, response):
    mask = data['city'] == city
    cur_df = data.loc[mask]
    cur_geometry = geometry.loc[mask] 
    default_dist = 3.5
    
    buffers = [100, 300, 500, 700, 1000, 'dist']
    features = []
    columns = []
    columns.extend(['bus_stop_' + str(i) for i in buffers])
    columns.extend(['subway_entrance_' + str(i) for i in buffers])
    columns.extend(['platform_' + str(i) for i in buffers])
    columns.extend(['sub_platform_' + str(i) for i in buffers])
    columns.extend(['halt_' + str(i) for i in buffers])
    columns.extend(['sub_halt_' + str(i) for i in buffers])
    columns.extend(['station_' + str(i) for i in buffers])
    columns.extend(['sub_station_' + str(i) for i in buffers])
    columns.extend(['tram_stop_' + str(i) for i in buffers])
    columns.extend(['hotel_' + str(i) for i in buffers])
    columns.extend(['apartment_' + str(i) for i in buffers])
    columns.extend(['hostel_' + str(i) for i in buffers])
    columns.extend(['guest_house_' + str(i) for i in buffers])
    columns.extend(['t_hotel_' + str(i) for i in buffers])
    columns.extend(['motel_' + str(i) for i in buffers])
    columns.extend(['electronics_' + str(i) for i in buffers])
    columns.extend(['mobile_phone_' + str(i) for i in buffers])
    columns.extend(['hospital_' + str(i) for i in buffers])
    columns.extend(['nursing_home_' + str(i) for i in buffers])
    columns.extend(['prison_' + str(i) for i in buffers])
    #distances
    columns.append('distance_to_center')
    
    for i in tqdm.tqdm(range(len(cur_df))):
        cur_features = []
        cur_point = cur_geometry.iloc[i]
        #100m buffer
        buf_100 = geodesic_point_buffer(cur_point['geometry'].x, cur_point['geometry'].y, 0.1)
        #200m buffer
        buf_300 = geodesic_point_buffer(cur_point['geometry'].x, cur_point['geometry'].y, 0.3)
        #500m buffer
        buf_500 = geodesic_point_buffer(cur_point['geometry'].x, cur_point['geometry'].y, 0.5)
        #700m buffer
        buf_700 = geodesic_point_buffer(cur_point['geometry'].x, cur_point['geometry'].y, 0.7)
        #1km buffer
        buf_1000 = geodesic_point_buffer(cur_point['geometry'].x, cur_point['geometry'].y, 1.0)
        
        #bus_stop
        #columns.extend(['bus_stop_' + str(i) for i in buffers])
        mask_ = response['highway'] == 'bus_stop'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #subway_entrance
        #columns.extend(['subway_entrance_' + str(i) for i in buffers])
        mask_ = response['railway'] == 'subway_entrance'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #platform
        #columns.extend(['platform_' + str(i) for i in buffers])
        mask_ = response['railway'] == 'platform'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #platform without subway
        mask1 = response['railway'] == 'platform'
        mask2 = response['station'] != 'subway'
        mask_ = mask1&mask2
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #halt 
        #columns.extend(['halt_' + str(i) for i in buffers])
        mask_ = response['railway'] == 'halt'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #halt without subways
        
        mask1 = response['railway'] == 'halt'
        mask2 = response['station'] != 'subway'
        mask_ = mask1&mask2
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #station
        
        #columns.extend(['station_' + str(i) for i in buffers])
        mask_ = response['railway'] == 'station'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #station without subways
        mask1 = response['railway'] == 'station'
        mask2 = response['station'] != 'subway'
        
        mask_ = mask1&mask2
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #tram_stop
        #columns.extend(['tram_stop_' + str(i) for i in buffers])
        mask_ = response['railway'] == 'tram_stop'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #hotel
        #columns.extend(['hotel_' + str(i) for i in buffers])
        mask_ = response['building'] == 'hotel'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #apartment
        #columns.extend(['apartment_' + str(i) for i in buffers])
        mask_ = response['tourism'] == 'apartment'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #hostel
        #columns.extend(['hostel_' + str(i) for i in buffers])
        mask_ = response['tourism'] == 'hostel'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #guest_house
        #columns.extend(['guest_house_' + str(i) for i in buffers])
        mask_ = response['tourism'] == 'guest_house'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #hotel
        #columns.extend(['t_hotel_' + str(i) for i in buffers])
        mask_ = response['tourism'] == 'hotel'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #motel
        #columns.extend(['motel_' + str(i) for i in buffers])
        mask_ = response['tourism'] == 'motel'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #electronic shop's
        #columns.extend(['electronics_' + str(i) for i in buffers])
        mask_ = response['shop'] == 'electronics'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #mobile phone's
        #columns.extend(['mobile_phone_' + str(i) for i in buffers])
        mask_ = response['shop'] == 'mobile_phone'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #hospital
        mask_ = response['amenity'] == 'hospital'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        #nursing_home
        mask_ = response['amenity'] == 'nursing_home'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        
        #prison
        mask_ = response['amenity'] == 'prison'
        within_100m = response[mask_].loc[response.loc[mask_].geometry.within(buf_100)]
        within_300m = response[mask_].loc[response.loc[mask_].geometry.within(buf_300)]
        within_500m = response[mask_].loc[response.loc[mask_].geometry.within(buf_500)]
        within_700m = response[mask_].loc[response.loc[mask_].geometry.within(buf_700)]
        within_1000m = response[mask_].loc[response.loc[mask_].geometry.within(buf_1000)]
        
        cur_features.append(len(within_100m))
        cur_features.append(len(within_300m))
        cur_features.append(len(within_500m))
        cur_features.append(len(within_700m))
        cur_features.append(len(within_1000m))
        
        cur_distances = []
        if len(within_1000m) != 0:
            for j in range(len(within_1000m)):
                loc = within_1000m.iloc[j]
                dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (loc['geometry'].y, loc['geometry'].x)).km
                cur_distances.append(dist)
        else:
            cur_distances.append(default_dist)
        cur_features.append(np.min(cur_distances))
        
        
        #distance_to_center 
        dist = distance.geodesic((cur_point['geometry'].y, cur_point['geometry'].x), (city_to_location[city].latitude, city_to_location[city].longitude)).km
        cur_features.append(dist)
        features.append(cur_features)
    
    return pd.DataFrame(features, index=cur_df.index, columns=columns)
        

In [20]:
def get_repr(response):
    new_geom = []
    for geom in response.geometry:
        if str(type(geom)) == '<class \'shapely.geometry.point.Point\'>':
            new_geom.append(geom)
        else:
            new_geom.append(geom.centroid)
    return new_geom

In [21]:
features_df = {}
for city in set(data['city']):
    print(city)
    if city != 'Нижний Новгород':
        name = city + ', Российская Федерация'
    else:
        name = 'Nizhny Novgorod, Russia'
    response = ox.geometries.geometries_from_place(name,  tags = {'highway': 'bus_stop', 'railway' : ['subway_entrance', 'tram_stop', 'halt', 'platform', 'station'], 
                                                                                             'building':['hotel', 'train_station'], 'tourism':True, 'amenity':True, 'shop':True, 'public_transport':True, 'transport':True})
    if 'station' not in response:
        response['station'] = [None for i in range(response.shape[0])]
    response['geometry'] = get_repr(response)
    output_df = extract_features(city, data, geom_data, response)
    features_df[city] = output_df

Казань


100%|██████████| 32/32 [00:14<00:00,  2.27it/s]


Санкт-Петербург


100%|██████████| 104/104 [04:08<00:00,  2.39s/it]


Москва


100%|██████████| 200/200 [16:32<00:00,  4.96s/it]


Самара


100%|██████████| 34/34 [00:13<00:00,  2.55it/s]


Ростов-на-Дону


100%|██████████| 26/26 [00:11<00:00,  2.36it/s]


Красноярск


100%|██████████| 25/25 [00:08<00:00,  2.78it/s]


Нижний Новгород


100%|██████████| 26/26 [00:12<00:00,  2.08it/s]


Уфа


100%|██████████| 24/24 [00:08<00:00,  2.77it/s]


Новосибирск


100%|██████████| 33/33 [00:16<00:00,  2.04it/s]


Балашиха


100%|██████████| 1/1 [00:00<00:00,  3.17it/s]


Екатеринбург


100%|██████████| 27/27 [00:16<00:00,  1.67it/s]


In [22]:
dfs = list(features_df.values())

In [23]:
tmp = pd.concat(dfs, axis=0)

In [24]:
X = tmp.reindex(data.index)

In [25]:
X.columns

Index(['bus_stop_100', 'bus_stop_300', 'bus_stop_500', 'bus_stop_700',
       'bus_stop_1000', 'bus_stop_dist', 'subway_entrance_100',
       'subway_entrance_300', 'subway_entrance_500', 'subway_entrance_700',
       ...
       'nursing_home_700', 'nursing_home_1000', 'nursing_home_dist',
       'prison_100', 'prison_300', 'prison_500', 'prison_700', 'prison_1000',
       'prison_dist', 'distance_to_center'],
      dtype='object', length=121)

In [26]:
X_new = pd.concat(( data[['lon', 'lat']], X), axis=1)

In [27]:
X_new.to_csv('data/X10.csv')

In [28]:
train_index = [i.startswith('train_') for i in X_new.index]
test_index = [i.startswith('test_') for i in X_new.index]

### Fit model

In [30]:
folds = KFold(4, shuffle=True, random_state=42)
reg = GridSearchCV(estimator=SVR(), scoring=make_scorer(mean_absolute_error, greater_is_better=False),
                   param_grid= {'kernel': ['poly', 'rbf'], 'degree':[2, 3, 4, 5, 6, 7, 8], 'C': [0.05, 0.01, 0.1, 0.2, 0.4, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 2.0, 3.0, 5.0, 6.0], 'epsilon':[0.1, 0.05, 0.15, 0.2, 0.03], 'gamma':['scale', 'auto'], 'coef0':[0.0, 0.1, 0.2, 0.3]}, 
                   cv=folds,
                   verbose=True,
                  n_jobs=4)

In [41]:
folds = KFold(4, shuffle=True, random_state=42)
random_forest = GridSearchCV(estimator=RandomForestRegressor(), scoring=make_scorer(mean_absolute_error, greater_is_better=False),
                   param_grid= {'n_estimators':[100, 200, 300], 'max_depth':[None, 14, 20], 'max_features':['auto', 'sqrt', 'log2'], 'criterion':['mse', 'mae']}, 
                   cv=folds,
                   verbose=True,
                  n_jobs=4)

In [32]:
folds = KFold(4, shuffle=True, random_state=42)
lgbm = GridSearchCV(estimator=LGBMRegressor(), scoring=make_scorer(mean_absolute_error, greater_is_better=False),
                   param_grid= {'n_estimators':[10, 20, 30, 50, 100, 200, 500], 'max_depth':[1, 2, 3, 4, 5], 'learning_rate':[0.04, 0.1, 0.15], 'objective':['regression','regression_l1', 'huber']}, 
                   cv=folds,
                   verbose=True,
                  n_jobs=1)

In [34]:
to_drop =[False for i in X_new.columns]
print(np.sum(to_drop))

0


In [35]:
X_dropped = X_new.drop(X_new.columns[to_drop], axis=1)

In [36]:
X_norm = pd.DataFrame(Normalizer().fit_transform(X_dropped), index=X_dropped.index, columns=X_dropped.columns)

In [37]:
X_train_norm = X_norm.loc[train_index]
X_test_norm  = X_norm.loc[test_index]
X_train = X_dropped.loc[train_index]
X_test = X_dropped.loc[test_index]
y = train['target']

SVM Regressor

In [39]:
reg.fit(X_train_norm, y)

Fitting 4 folds for each of 8400 candidates, totalling 33600 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  68 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 1260 tasks      | elapsed:   12.5s
[Parallel(n_jobs=4)]: Done 3260 tasks      | elapsed:   30.7s
[Parallel(n_jobs=4)]: Done 6060 tasks      | elapsed:   55.1s
[Parallel(n_jobs=4)]: Done 9660 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 14060 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done 19260 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done 25260 tasks      | elapsed:  4.7min
[Parallel(n_jobs=4)]: Done 32060 tasks      | elapsed:  6.1min
[Parallel(n_jobs=4)]: Done 33600 out of 33600 | elapsed:  6.5min finished


GridSearchCV(cv=KFold(n_splits=4, random_state=42, shuffle=True),
             estimator=SVR(), n_jobs=4,
             param_grid={'C': [0.05, 0.01, 0.1, 0.2, 0.4, 0.5, 0.7, 0.9, 1.0,
                               1.3, 1.5, 2.0, 3.0, 5.0, 6.0],
                         'coef0': [0.0, 0.1, 0.2, 0.3],
                         'degree': [2, 3, 4, 5, 6, 7, 8],
                         'epsilon': [0.1, 0.05, 0.15, 0.2, 0.03],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['poly', 'rbf']},
             scoring=make_scorer(mean_absolute_error, greater_is_better=False),
             verbose=True)

In [48]:
reg.best_score_

-0.6564071748664376

In [49]:
reg.best_estimator_

SVR(C=0.1, degree=8, epsilon=0.2, kernel='poly')

Random Forest Regressor

In [42]:
random_forest.fit(X_train, y)

Fitting 4 folds for each of 54 candidates, totalling 216 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   14.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  3.7min
[Parallel(n_jobs=4)]: Done 216 out of 216 | elapsed:  3.9min finished


GridSearchCV(cv=KFold(n_splits=4, random_state=42, shuffle=True),
             estimator=RandomForestRegressor(), n_jobs=4,
             param_grid={'criterion': ['mse', 'mae'],
                         'max_depth': [None, 14, 20],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 200, 300]},
             scoring=make_scorer(mean_absolute_error, greater_is_better=False),
             verbose=True)

In [43]:
random_forest.best_score_

-0.679056633228241

In [44]:
random_forest.best_estimator_

RandomForestRegressor(criterion='mae', max_depth=14, max_features='log2',
                      n_estimators=200)

LGBM Regressor

In [45]:
lgbm.fit(X_train, y)

Fitting 4 folds for each of 315 candidates, totalling 1260 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.










[Parallel(n_jobs=1)]: Done 1260 out of 1260 | elapsed:   57.0s finished


GridSearchCV(cv=KFold(n_splits=4, random_state=42, shuffle=True),
             estimator=LGBMRegressor(), n_jobs=1,
             param_grid={'learning_rate': [0.04, 0.1, 0.15],
                         'max_depth': [1, 2, 3, 4, 5],
                         'n_estimators': [10, 20, 30, 50, 100, 200, 500],
                         'objective': ['regression', 'regression_l1', 'huber']},
             scoring=make_scorer(mean_absolute_error, greater_is_better=False),
             verbose=True)

In [46]:
lgbm.best_score_

-0.645906629362195

In [47]:
lgbm.best_estimator_

LGBMRegressor(learning_rate=0.15, max_depth=5, n_estimators=10,
              objective='regression_l1')

Bagging + Boosting LGBM Regressor

In [50]:
all_set = set(np.arange(0, X_train.shape[0]))
predictions = np.zeros(X_train.shape[0])
norm = 1e-100 + np.zeros(X_train.shape[0])
models = []
maes = []
for i in tqdm.tqdm(range(1200)):
    lgbm = LGBMRegressor(max_depth=3, n_estimators=20, objective='huber')
    new_index = np.random.randint(low=0, high=X_train.shape[0], size=X_train.shape[0])
    other_index = list(all_set - set(new_index))
    lgbm.fit(X_train.iloc[new_index].values, y.iloc[new_index].values)
    predictions[other_index] += lgbm.predict(X_train.iloc[other_index].values)
    norm[other_index] += 1
    models.append(lgbm)
    maes.append(mean_absolute_error(predictions/norm, y))

100%|██████████| 1200/1200 [00:20<00:00, 57.82it/s]


In [51]:
maes[-1]

0.6511192213994846

In [67]:
all_set = set(np.arange(0, X_train.shape[0]))
predictions = np.zeros(X_train.shape[0])
norm = 1e-100 + np.zeros(X_train.shape[0])
models = []
maes = []
for i in tqdm.tqdm(range(1200)):
    lgbm = LGBMRegressor(learning_rate=0.15, max_depth=5, n_estimators=10,
              objective='regression_l1', verbose_eval=False)
    new_index = np.random.randint(low=0, high=X_train.shape[0], size=X_train.shape[0])
    other_index = list(all_set - set(new_index))
    lgbm.fit(X_train.iloc[new_index].values, y.iloc[new_index].values)
    predictions[other_index] += lgbm.predict(X_train.iloc[other_index].values)
    norm[other_index] += 1
    models.append(lgbm)
    maes.append(mean_absolute_error(predictions/norm, y))

  0%|          | 2/1200 [00:00<01:08, 17.57it/s]



  2%|▏         | 23/1200 [00:00<00:34, 34.14it/s]



  3%|▎         | 36/1200 [00:00<00:26, 44.46it/s]



  4%|▍         | 48/1200 [00:00<00:23, 48.22it/s]



  4%|▍         | 54/1200 [00:01<00:24, 47.53it/s]

Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).


  6%|▌         | 68/1200 [00:01<00:21, 53.66it/s]



  7%|▋         | 83/1200 [00:01<00:18, 60.29it/s]

Unknown parameter: verbose_eval


  8%|▊         | 99/1200 [00:01<00:16, 66.03it/s]



  9%|▉         | 106/1200 [00:01<00:17, 63.02it/s]



 10%|▉         | 119/1200 [00:02<00:21, 49.62it/s]



 11%|█         | 133/1200 [00:02<00:18, 56.31it/s]



 12%|█▏        | 148/1200 [00:02<00:16, 62.79it/s]



 14%|█▎        | 163/1200 [00:02<00:15, 66.41it/s]



 14%|█▍        | 170/1200 [00:02<00:16, 61.01it/s]



 15%|█▌        | 183/1200 [00:03<00:21, 46.45it/s]



 16%|█▋        | 196/1200 [00:03<00:18, 53.34it/s]



 18%|█▊        | 211/1200 [00:03<00:16, 60.37it/s]



 19%|█▉        | 225/1200 [00:03<00:16, 60.60it/s]



 20%|██        | 240/1200 [00:04<00:14, 64.32it/s]



 21%|██▏       | 255/1200 [00:04<00:13, 67.63it/s]



 22%|██▏       | 269/1200 [00:04<00:14, 65.49it/s]



 24%|██▍       | 285/1200 [00:04<00:13, 68.56it/s]



 24%|██▍       | 293/1200 [00:04<00:14, 64.22it/s]



 26%|██▌       | 308/1200 [00:05<00:13, 66.07it/s]



 27%|██▋       | 323/1200 [00:05<00:13, 65.17it/s]



 28%|██▊       | 339/1200 [00:05<00:13, 61.80it/s]



 29%|██▉       | 353/1200 [00:05<00:14, 58.60it/s]



 30%|███       | 360/1200 [00:06<00:15, 54.34it/s]



 31%|███       | 374/1200 [00:06<00:14, 58.79it/s]



 32%|███▏      | 389/1200 [00:06<00:13, 60.79it/s]



 34%|███▍      | 405/1200 [00:06<00:11, 67.61it/s]



 35%|███▍      | 419/1200 [00:06<00:11, 65.91it/s]



 36%|███▌      | 433/1200 [00:07<00:11, 64.38it/s]



 37%|███▋      | 449/1200 [00:07<00:10, 69.13it/s]




 39%|███▊      | 464/1200 [00:07<00:10, 67.99it/s]



 40%|████      | 480/1200 [00:07<00:10, 70.01it/s]



 41%|████▏     | 496/1200 [00:08<00:09, 71.15it/s]

Unknown parameter: verbose_eval


 43%|████▎     | 512/1200 [00:08<00:09, 70.09it/s]



 44%|████▍     | 528/1200 [00:08<00:09, 70.86it/s]



 45%|████▌     | 544/1200 [00:08<00:09, 72.65it/s]



 46%|████▌     | 552/1200 [00:08<00:09, 70.16it/s]



 47%|████▋     | 567/1200 [00:09<00:09, 65.87it/s]



 48%|████▊     | 582/1200 [00:09<00:09, 66.59it/s]



 50%|████▉     | 597/1200 [00:09<00:08, 69.42it/s]



 51%|█████     | 613/1200 [00:09<00:08, 70.68it/s]



 52%|█████▏    | 629/1200 [00:10<00:08, 69.73it/s]



 54%|█████▍    | 645/1200 [00:10<00:07, 71.07it/s]



 55%|█████▌    | 661/1200 [00:10<00:07, 69.59it/s]



 56%|█████▋    | 677/1200 [00:10<00:07, 71.46it/s]



 57%|█████▋    | 685/1200 [00:10<00:07, 69.18it/s]



 58%|█████▊    | 700/1200 [00:11<00:07, 66.46it/s]



 60%|█████▉    | 716/1200 [00:11<00:07, 68.73it/s]



 61%|██████    | 732/1200 [00:11<00:06, 69.85it/s]



 62%|██████▏   | 748/1200 [00:11<00:06, 71.24it/s]



 64%|██████▎   | 763/1200 [00:11<00:06, 66.61it/s]



 65%|██████▍   | 777/1200 [00:12<00:06, 62.48it/s]



 66%|██████▌   | 792/1200 [00:12<00:06, 65.57it/s]



 67%|██████▋   | 807/1200 [00:12<00:05, 67.14it/s]



 69%|██████▊   | 823/1200 [00:12<00:05, 71.47it/s]



 69%|██████▉   | 831/1200 [00:12<00:05, 69.03it/s]



 71%|███████   | 847/1200 [00:13<00:05, 70.56it/s]



 72%|███████▏  | 864/1200 [00:13<00:04, 74.08it/s]



 73%|███████▎  | 881/1200 [00:13<00:04, 72.35it/s]



 75%|███████▍  | 897/1200 [00:13<00:04, 70.08it/s]



 76%|███████▌  | 912/1200 [00:14<00:04, 65.85it/s]



 77%|███████▋  | 927/1200 [00:14<00:04, 64.94it/s]



 78%|███████▊  | 934/1200 [00:14<00:04, 61.54it/s]



 79%|███████▉  | 950/1200 [00:14<00:03, 65.88it/s]



 81%|████████  | 967/1200 [00:14<00:03, 69.01it/s]



 82%|████████▏ | 981/1200 [00:15<00:03, 66.14it/s]



 83%|████████▎ | 996/1200 [00:15<00:03, 67.60it/s]



 84%|████████▍ | 1012/1200 [00:15<00:02, 71.47it/s]



 86%|████████▌ | 1029/1200 [00:15<00:02, 73.60it/s]



 87%|████████▋ | 1044/1200 [00:16<00:02, 66.79it/s]



 88%|████████▊ | 1059/1200 [00:16<00:02, 66.81it/s]



 90%|████████▉ | 1074/1200 [00:16<00:01, 70.74it/s]



 91%|█████████ | 1090/1200 [00:16<00:01, 71.42it/s]



 92%|█████████▏| 1098/1200 [00:16<00:01, 70.38it/s]



 93%|█████████▎| 1113/1200 [00:17<00:01, 65.81it/s]



 94%|█████████▍| 1127/1200 [00:17<00:01, 65.70it/s]



 95%|█████████▌| 1142/1200 [00:17<00:00, 63.18it/s]



 96%|█████████▋| 1157/1200 [00:17<00:00, 62.37it/s]



 97%|█████████▋| 1164/1200 [00:17<00:00, 64.38it/s]



 98%|█████████▊| 1180/1200 [00:18<00:00, 66.02it/s]



100%|█████████▉| 1195/1200 [00:18<00:00, 65.85it/s]



100%|██████████| 1200/1200 [00:18<00:00, 64.85it/s]


In [68]:
maes[-1]

0.6484971261753096

### Make submission

In [69]:
model = LGBMRegressor(learning_rate=0.15, max_depth=5, n_estimators=10,
              objective='regression_l1')

In [55]:
model.fit(X_train, y)



LGBMRegressor(learning_rate=0.15, max_depth=5, n_estimators=10,
              objective='regression_l1')

In [56]:
predictions = model.predict(X_test)

In [57]:
submission = pd.read_csv('data/sample_submission.csv')

In [58]:
submission['target'] = predictions

In [60]:
submission.to_csv('lgbm_regressor.csv', index=False)