In [43]:
import os
import pandas as pd
import numpy as np
import pandas_profiling
import seaborn as sns
from sklearn.decomposition import PCA
from baseline.raif_hack.metrics import deviation_metric
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
import joblib

In [44]:
# Directory with the input data and the name of the CSV file inside it.
DATA_DIR, FILENAME = 'data', 'test.csv'

In [45]:
# Read the input CSV located at DATA_DIR/FILENAME into a DataFrame.
input_path = os.path.join(DATA_DIR, FILENAME)
df = pd.read_csv(input_path)

In [46]:
# Load the previously saved objects (encoder, model, PCA) from OBJECTS_FOLDER.
# NOTE(review): joblib.load unpickles its input, so only load files from a trusted source.
OBJECTS_FOLDER = 'object'
encoder, gb, pca = (
    joblib.load(os.path.join(OBJECTS_FOLDER, artifact))
    for artifact in ('encoder.gzip', 'gb.gzip', 'pca.gzip')
)

In [47]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,city,floor,id,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,...,reform_mean_year_building_1000,reform_mean_year_building_500,region,lat,lng,total_square,street,date,realty_type,price_type
0,Курск,1.0,COL_289284,7,55,85,117,0,0,0,...,1966.471591,1966.74026,Курская область,51.709255,36.147908,156.148996,S6983,2020-09-06,100,1
1,Сургут,1.0,COL_289305,8,70,112,140,0,0,0,...,1988.259259,1989.068182,Ханты-Мансийский АО,61.23324,73.462509,190.737943,S29120,2020-09-06,110,1
2,Тюмень,-1.0,COL_289318,3,28,67,122,0,0,0,...,1985.880282,1991.458333,Тюменская область,57.14311,65.554573,457.118051,S23731,2020-09-06,10,1
3,Иркутск,1.0,COL_289354,5,76,139,231,0,0,0,...,1947.073276,1941.657895,Иркутская область,52.28138,104.282975,66.503622,S14207,2020-09-06,100,1
4,Курск,,COL_289399,8,105,189,279,0,0,2,...,1948.764151,1946.689655,Курская область,51.729706,36.194019,23.864915,S20658,2020-09-06,10,1


In [48]:
# Keep the `id` column aside: it is removed from the feature frame later
# and is needed again to label the predictions.
ids = df.loc[:, 'id']

In [49]:
# Drop features with a large share of missing values.
# First print the ten columns with the highest share of NaNs
# (number of NaNs in the column divided by the number of rows).
missing_share = df.isna().sum(axis=0) / len(df)
print(missing_share.sort_values().iloc[-10:])

# `floor` is missing in roughly 44% of the rows (see the printout); `id` and `date`
# are removed from the feature frame together with it.
features_to_drop = ['floor', 'id', 'date']
df = df.drop(columns=features_to_drop, errors='ignore')

osm_crossing_points_in_0.001      0.000000
osm_culture_points_in_0.0075      0.000000
street                            0.002017
reform_house_population_1000      0.011769
reform_mean_floor_count_1000      0.011769
reform_mean_year_building_1000    0.011769
reform_mean_floor_count_500       0.016140
reform_mean_year_building_500     0.016140
reform_house_population_500       0.016140
floor                             0.437458
dtype: float64


In [50]:
# Remove the `street` column as well (nothing happens if it is already absent).
df = df.drop(columns='street', errors='ignore')

In [51]:
# Split the columns into categorical and numerical groups
# ('street' is commented out of the categorical list).
categorical_features = ['price_type', 'realty_type', 'region', 'city', 'osm_city_nearest_name']  # 'street',
# Every other column of df counts as numerical; df's column order is preserved.
is_numerical = ~df.columns.isin(categorical_features)
numerical_features = df.columns[is_numerical].tolist()

In [52]:
# Add 1 to every categorical column that is not stored as strings (non-object dtype).
# NOTE(review): presumably this frees the value 0 to act as the missing-value marker
# used below — confirm against the training notebook.
# Re-running this cell shifts the columns again, so run it exactly once per load.
shifted_columns = [c for c in categorical_features if df[c].dtype != 'O']
df[shifted_columns] = df[shifted_columns] + 1
# Fill every NaN in the whole frame (numerical and categorical columns alike) with 0.
df = df.replace(to_replace=np.nan, value=0)

In [53]:
# Project the numerical columns with the `pca` object loaded earlier.
a = pca.transform(df[numerical_features].to_numpy())
# Reuse df's index for the projection: pd.concat(axis=1) aligns rows by index label,
# so a fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever df's index
# is not the default one. Columns are named 0..k-1 by component position.
pca_res = pd.DataFrame(data=a, index=df.index, columns=list(range(a.shape[1])))

In [54]:
# Swap the raw numerical columns for their PCA projection: keep the remaining
# (categorical) columns in their current order and append the component columns on the right.
df = pd.concat(
    [df.drop(columns=numerical_features, errors='ignore'), pca_res],
    axis=1,
)

In [55]:
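# Disabled experiment, kept for reference: it looks like an attempt to relabel the 0 placeholders
# in the string-valued (object dtype) categorical columns as 'Missing'.
# idx = df[categorical_features].select_dtypes('O') == 0
# df[idx].replace(0, 'Missing', inplace=True)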

In [56]:
# Inspect the categorical columns before they are encoded.
df.loc[:, categorical_features]

Unnamed: 0,price_type,realty_type,region,city,osm_city_nearest_name
0,2,101,Курская область,Курск,Курск
1,2,111,Ханты-Мансийский АО,Сургут,Сургут
2,2,11,Тюменская область,Тюмень,Тюмень
3,2,101,Иркутская область,Иркутск,Иркутск
4,2,11,Курская область,Курск,Курск
...,...,...,...,...,...
2969,2,101,Красноярский край,Красноярск,Красноярск
2970,2,11,Томская область,Томск,Томск
2971,2,101,Калужская область,Калуга,Калуга
2972,2,11,Нижегородская область,Нижний Новгород,Нижний Новгород


In [57]:
# Encode the categorical columns with the loaded `encoder` and write the result
# back in place of the raw values.
encoded_categories = encoder.transform(df[categorical_features])
df[categorical_features] = encoded_categories

In [58]:
# Predict with the loaded model `gb` on the final feature matrix
# (encoded categorical columns followed by the PCA component columns).
feature_matrix = df.to_numpy()
y_hat = gb.predict(feature_matrix)

In [67]:
# Build the result table: one row per object id with its predicted value.
# The predictions are positional (row i of y_hat belongs to row i of ids), so give the
# Series the same index as `ids`; pd.concat(axis=1) aligns by index label and would
# otherwise pair ids with the wrong predictions if `ids` did not carry a default 0..n-1 index.
predictions = pd.Series(data=y_hat, index=ids.index, name='per_square_meter_price')
# Use the id column as the index so it becomes the first column of the saved CSV.
res = pd.concat([ids, predictions], axis=1).set_index(ids.name)

In [68]:
# Preview the first five predictions.
res.head(n=5)

Unnamed: 0_level_0,per_square_meter_price
id,Unnamed: 1_level_1
COL_289284,139431.409324
COL_289305,131732.384607
COL_289318,157935.965046
COL_289354,149382.896388
COL_289399,142533.66079


In [69]:
# Save the predictions as CSV; the `id` index is written as the first column.
RES_FILENAME = 'prediction.csv'
res.to_csv(path_or_buf=RES_FILENAME)