Используя исходные или очищенные данные, сформируйте предсказание класса объявления из множества exposition_test.tsv.gz

Обязательно нужно использовать одну или несколько моделей кластеризации. Дополнительно можно использовать решающие деревья, CatBoost, LightGBM и XGBoost.

Подсказка: для использования day_mean в классификации/кластеризации потребуется его сформировать для тестовых данных. Это можно сделать либо при помощи других моделей (два этапа классификации), либо построив линейную модель прогноза day_mean от count_day.

Данные:

https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_train.tsv.gz
https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_test.tsv.gz
https://video.ittensive.com/machine-learning/hacktherealty/data/metro.utf8.json
https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_sample_submisson.tsv
Итоговый файл с кодом (.py или .ipynb) выложите в github с портфолио.

### Подключение библиотек

In [2]:
import pandas as pd
import numpy as np
from sklearn_som.som import SOM
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from transliterate import translit
from tqdm import notebook
from haversine import haversine

In [3]:
def data_preproccesing (data):
# add total items per day
    #data_day_count = data.groupby("day").count()["build_year"]
    #data["day_count"] = data["day"].apply(lambda x:data_day_count.loc[x])
# approximate values (clean-up)
    data.loc[data.build_year == 0, 'build_year'] = np.NaN
    data['build_year'] = data['build_year'].fillna((data.groupby(['building_series_id'])['build_year'].transform('median')))
    data.loc[data['build_year'].isna(), 'build_year'] = data['build_year'].mean()
    data['build_year'] = data['build_year'].astype(np.uint16)
    if 'has_elevator' in data.columns:
# elevator for 6+ floors
        data.loc[(data.has_elevator==0) & (data.floor>5), 'has_elevator'] = 1
# fix living area
    data.loc[data.living_area == 0, 'living_area'] = np.NaN
    data['living_area'] = data['living_area'].fillna((data.groupby(['rooms'])['living_area'].transform('median')))
# fix price
    data.loc[data.price<100, 'price'] *= 1000
    data.loc[data.price<1000, 'price'] *= 60
    if 'floors_total' in data.columns:
# fix celing height
        data.loc[(data.ceiling_height<2) | (data.ceiling_height>5), 'ceiling_height'] = np.NaN
        data['ceiling_height'] = data['ceiling_height'].fillna(data.groupby(['building_series_id'])['ceiling_height'].transform('median'))
        data.loc[data['ceiling_height'].isna(), 'ceiling_height'] = data['ceiling_height'].mean()
# enrich data, % floor
        data['floor'] = data['floor'] / data["floors_total"]
# locality, village/region/moscow/metro
    if 'locality_name' in data.columns:
        data['loctype_village'] = (data['locality_name'].str.match(pat = 'городок|деревня|ДНП|поселок|посёлок|село|СНТ|товарищество|хутор')).astype(np.uint8)
        data['loctype_moscow'] = (data.locality_name == 'Москва').astype(np.uint8)
        data['loctype_region'] = ((data.loctype_village == 0) & (data.loctype_moscow == 0)).astype(np.uint8)
    if "site_id" in data.columns:
        data = data.drop(['site_id', 'main_image', 'area', 'building_id', 'unified_address'], axis=1)
    if 'target_string' in data.columns:
        data = data.drop(['target_string'], axis=1)
# processing date
    if 'day' in data.columns:
        data['day'] = pd.to_datetime(data['day'])
        data['year'] = data['day'].dt.year
        data['month'] = data['day'].dt.month
        data['week'] = data['day'].dt.week
        data['dow'] = data['day'].dt.dayofweek
        data['dom'] = data['day'].dt.day
        data['doy'] = data['day'].dt.dayofyear
        data = data.drop(["day"], axis=1)
# adding holydays, 1-7 Jan, 8 Mar, 1 May, 9 May, 12 Jun, 4 Nov
# http://www.consultant.ru/law/ref/calendar/proizvodstvennye/2017/
# http://www.consultant.ru/law/ref/calendar/proizvodstvennye/2018/
# http://www.consultant.ru/law/ref/calendar/proizvodstvennye/2019/
# http://www.consultant.ru/law/ref/calendar/proizvodstvennye/2020/
        data['is_holyday'] = ((data['year'] == 2017 &
                                (((data['dom'] > 0) & (data['dom'] < 8) & data['month'] == 1) | 
                                (((data['dom'] == 23) | data['dom'] == 24)) & (data['month'] == 2)) |
                                ((data['dom'] == 8) & (data['month'] == 3)) |
                                (((data['dom'] == 1) | (data['dom'] == 8) | (data['dom'] == 9)) & data['month'] == 5) |
                                ((data['dom'] == 12) & (data['month'] == 6)) |
                                ((data['dom'] == 6) & (data['month'] == 11))) |
                              ((data['year'] == 2018) &
                                (((data['dom'] > 0) & (data['dom'] < 9) & data['month'] == 1) | 
                                ((data['dom'] == 23) & (data['month'] == 2)) |
                                (((data['dom'] == 8) | (data['dom'] == 9)) & (data['month'] == 3)) |
                                ((data['dom'] == 30) & (data['month'] == 4)) |
                                (((data['dom'] == 1) | (data['dom'] == 2) | (data['dom'] == 9)) & data['month'] == 5) |
                                (((data['dom'] == 11) | (data['dom'] == 12)) & (data['month'] == 6)) |
                                ((data['dom'] == 5) & (data['month'] == 11)) |
                                ((data['dom'] == 31) & (data['month'] == 12)))) |
                              ((data['year'] == 2019) &
                                (((data['dom'] > 0) & (data['dom'] < 9) & data['month'] == 1) | 
                                ((data['dom'] == 8) & (data['month'] == 3)) |
                                (((data['dom'] == 1) | (data['dom'] == 2) | (data['dom'] == 3) | (data['dom'] == 9) | (data['dom'] == 10)) & data['month'] == 5) |
                                ((data['dom'] == 12) & (data['month'] == 6)) |
                                ((data['dom'] == 4) & (data['month'] == 11)))) |
                              ((data['year'] == 2020) &
                                (((data['dom'] > 0) & (data['dom'] < 9) & data['month'] == 1) | 
                                ((data['dom'] == 24) & (data['month'] == 2)) |
                                ((data['dom'] == 9) & (data['month'] == 3)) |
                                (((data['dom'] == 1) | (data['dom'] == 4) | (data['dom'] == 5) | (data['dom'] == 11)) & data['month'] == 5) |
                                ((data['dom'] == 12) & (data['month'] == 6)) |
                                ((data['dom'] == 4) & (data['month'] == 11))))).astype(np.uint8)
# one-hot vectors
    if 'year' in data.columns:
        for label in ['year', 'month', 'week', 'dow', 'doy', 'dom', 'renovation',
                      'balcony', 'building_type', 'parking', 'floors_total', 'locality_name']:
            for l in data[label].unique():
                data[label + "_" + translit(str(l), "ru", reversed=True)] = (data[label] == l).astype(np.uint8)
# boolean -> int
    if 'studio' in data.columns:
        for label in ['studio', 'has_elevator', 'expect_demolition', 'is_apartment']:
            data[label] = data[label].astype(np.uint8)
# index (remove id from columns)
    if 'id' in data.columns:
        data = data.set_index(['id'])
    return data

In [36]:
test_data = pd.read_csv('https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_test.tsv.gz', sep='\t')
#test_data = test_data.drop(labels=["building_series_id", "site_id", "parking", "build_year","expect_demolition","main_image","latitude"], axis=1)
#test_data = test_data.drop(labels=["floors_total","id","floor", "is_apartment","building_id","has_elevator","kitchen_area","public","longitude"], axis=1)
#test_data = test_data.drop(labels=["flats_count", "flats_count","building_type","balcony"], axis=1)
#test_data = test_data.drop(labels=["studio","unified_address","area", "renovation"], axis=1)
test_data.head()

Unnamed: 0,building_series_id,site_id,parking,build_year,expect_demolition,main_image,latitude,total_area,ceiling_height,rooms,...,kitchen_area,day,public,longitude,price,flats_count,building_type,balcony,locality_name,renovation
0,663294,0,UNKNOWN,1971,False,//avatars.mds.yandex.net/get-realty/1900763/ad...,55.795704,36.0,2.64,1,...,0.0,2020-01-25,True,37.602478,40000,80,PANEL,UNKNOWN,Москва,UNKNOWN
1,712125,0,UNKNOWN,1986,False,//avatars.mds.yandex.net/get-realty/1583116/ad...,55.605583,40.0,2.48,1,...,10.0,2019-11-19,True,37.743679,25000,222,PANEL,LOGGIA,Москва,COSMETIC_DONE
2,0,0,UNKNOWN,2014,False,//avatars.mds.yandex.net/get-realty/2124710/ad...,55.92556,25.0,0.0,0,...,0.0,2020-01-11,True,37.862965,19000,179,MONOLIT,LOGGIA,Королёв,COSMETIC_DONE
3,0,0,UNKNOWN,2001,False,//avatars.mds.yandex.net/get-realty/2958378/ad...,55.432522,42.0,0.0,1,...,10.0,2020-01-27,True,37.544224,20000,0,PANEL,LOGGIA,Подольск,COSMETIC_DONE
4,1564812,0,UNKNOWN,2019,False,//avatars.mds.yandex.net/get-realty/2732616/ad...,55.91753,73.300003,2.8,3,...,10.2,2020-03-04,False,37.411098,68000,0,MONOLIT,TWO_LOGGIA,Химки,EURO


In [38]:
test_data = data_preproccesing(test_data)
#test_data=test_data.reset_index()
test_data.reset_index(inplace=True)
test_data=test_data.drop(["id"], axis=1)
test_data.head()



Unnamed: 0,building_series_id,parking,build_year,expect_demolition,latitude,total_area,ceiling_height,rooms,floors_total,living_area,...,locality_name_derevnja Kabanovo,locality_name_derevnja Ivojlovo,locality_name_selo Vozdvizhenskoe,locality_name_derevnja Dolgoe Ledovo,locality_name_derevnja Martem'janovo,locality_name_poselok Veshki,locality_name_poselok Radiotsentr,locality_name_derevnja Zhilino,locality_name_poselok Shuvoe,locality_name_derevnja Vorschikovo
0,663294,UNKNOWN,1971,0,55.795704,36.0,2.64,1,12,19.0,...,0,0,0,0,0,0,0,0,0,0
1,712125,UNKNOWN,1986,0,55.605583,40.0,2.48,1,16,20.0,...,0,0,0,0,0,0,0,0,0,0
2,0,UNKNOWN,2014,0,55.92556,25.0,2.7,0,16,12.0,...,0,0,0,0,0,0,0,0,0,0
3,0,UNKNOWN,2001,0,55.432522,42.0,2.7,1,10,20.0,...,0,0,0,0,0,0,0,0,0,0
4,1564812,UNKNOWN,2019,0,55.91753,73.300003,2.8,3,16,45.799999,...,0,0,0,0,0,0,0,0,0,0


Добавление открытых данных о метро
Близость к метро оказывает сильное воздействие на недвижимость. Источник данных: https://data.mos.ru/opendata/624

Сокращаем входы в метро до уникальных, немного теряем точность, повышаем скорость расчета в 6 раз

In [7]:
def nearest_metro(house):
    min_dist = 100
    near_metro = ''
    for i in range(len(metro['metro_station'].values)):
        station = (metro['metro_latitude'][i], metro['metro_longitude'][i])
        dist = haversine(house, station)
        if dist < min_dist:
            min_dist = dist
            near_metro = metro['metro_station'][i]
    return [min_dist, near_metro]

def calculate_nearest_metro(data):
    metro_distances = []
    lat = data['latitude'].values
    lon = data['longitude'].values
    msk = data['loctype_moscow'].values
    for i in notebook.tqdm(range(len(lat))):
        if msk[i] == 1:
            house = (lat[i], lon[i])
            metro_distances.append(nearest_metro(house))
        else:
            metro_distances.append([0, ""])
    return np.stack(metro_distances, axis=1)

def enrich_metro (data, metro_data):
    data['metro_distance'] = (metro_data[0]).astype(np.float64)
    data['metro_station'] = metro_data[1]
# fill mean values for non-Moscow localities (metro distance is ~ incorrect)
    m = data[data["loctype_moscow"] == 1]["metro_distance"].mean()
    data.loc[data["loctype_moscow"] == 0, "metro_distance"] = m
# one-hot vector for metro station
    for l in data['metro_station'].unique():
        data['metro_station_' + translit(str(l), "ru", reversed=True)] = (data['metro_station'] == l).astype(np.uint8)
    return data

In [39]:
metro = pd.read_json("https://video.ittensive.com/machine-learning/hacktherealty/data/metro.utf8.json")
metro = metro[['NameOfStation', 'Longitude_WGS84', 'Latitude_WGS84']]
metro = metro.reset_index().drop('index', axis=1)
metro = metro.rename({'NameOfStation': 'metro_station',
                      'Longitude_WGS84': 'metro_longitude',
                      'Latitude_WGS84': 'metro_latitude'}, axis=1)
metro = metro.drop_duplicates(subset=["metro_station"], keep="first")
metro = metro.set_index("metro_station").reset_index()
print (metro.head())

              metro_station  metro_longitude  metro_latitude
0               Китай-город        37.631677       55.757315
1                 Калужская        37.539238       55.655386
2             Братиславская        37.752643       55.660114
3  Бульвар адмирала Ушакова        37.541645       55.545011
4  Бульвар Дмитрия Донского        37.576311       55.570289


In [40]:
metro_train = calculate_nearest_metro(test_data)

  0%|          | 0/71666 [00:00<?, ?it/s]

In [41]:
test_data = enrich_metro(test_data, metro_train)



In [42]:
price_data = pd.DataFrame(test_data[["locality_name", "price", "metro_station"]])
price_groups = {"locality_name": {
    "median": price_data.groupby(["locality_name"]).median()["price"]
}, "metro_station": {
    "median": price_data.groupby(["metro_station"]).median()["price"]
}}

def calc_price (data, group="", label=""):
    if data[group] in price_groups[group][label]:
        return data["price"] / price_groups[group][label][data[group]]
    else:
        return 1
    
for group in price_groups:
    print ("Processing:", group, end=" ")
    for label in price_groups[group]:
        print (label, end=" ")
        test_data["price_" + group + "_" + label] = test_data.apply(calc_price, axis=1,
                                                                      group=group, label=label)
    print ("")


Processing: locality_name median 
Processing: metro_station median 


In [47]:
#test_data=test_data.drop(["building_series_id","parking","build_year","expect_demolition","latitude", "floors_total"], axis=1)
#test_data=test_data.drop(["floors_total"], axis=1)
test_data=test_data[['total_area','ceiling_height','rooms','living_area','price','price_locality_name_median']]
test_data.head()

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,price_locality_name_median
0,36.0,2.64,1,19.0,40000,0.888889
1,40.0,2.48,1,20.0,25000,0.555556
2,25.0,2.7,0,12.0,19000,0.791667
3,42.0,2.7,1,20.0,20000,0.869565
4,73.300003,2.8,3,45.799999,68000,2.259136


### Загрузка предварительно очищенных данных
Удаляем doy_108

In [13]:
train_data = pd.read_csv('https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz')
train_data = train_data.drop(labels=["doy_108"], axis=1)
train_data.head()

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,day_mean,price_locality_name_median,target
0,105.0,3.0,3,50.0,95000,2.456912,2.261905,1
1,40.0,3.0,1,19.200001,25000,3.028689,1.0,2
2,37.599998,2.64,0,19.0,26000,3.091993,0.619048,2
3,80.0,3.0,3,49.0,35000,3.10101,1.25,2
4,100.0,3.0,3,49.0,80000,2.495468,1.904762,3


Удаляем day_mean. Позже планируется вернуться и доработать этот вопрос, сгенерировав day_mean для тестовых данных

In [14]:
train_data = train_data.drop(labels=["day_mean"], axis=1)
train_data.head()

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,price_locality_name_median,target
0,105.0,3.0,3,50.0,95000,2.261905,1
1,40.0,3.0,1,19.200001,25000,1.0,2
2,37.599998,2.64,0,19.0,26000,0.619048,2
3,80.0,3.0,3,49.0,35000,1.25,2
4,100.0,3.0,3,49.0,80000,1.904762,3


### Нормализация данных
Приведение всех значений к отрезку [0;1], удалим из данных target

In [66]:
train_data_mm = pd.DataFrame(MinMaxScaler().fit_transform(train_data[train_data.columns[:-1]]))

In [49]:
test_data_mm=pd.DataFrame(MinMaxScaler().fit_transform(test_data))

In [18]:
train_data_mm.head()

Unnamed: 0,0,1,2,3,4,5
0,0.097782,0.333333,0.6,0.108352,0.002078,0.002071
1,0.032258,0.333333,0.2,0.038826,0.000522,0.000893
2,0.029839,0.213333,0.0,0.038375,0.000544,0.000538
3,0.072581,0.333333,0.6,0.106095,0.000744,0.001127
4,0.092742,0.333333,0.6,0.106095,0.001745,0.001738


In [19]:
test_data_mm.head()

Unnamed: 0,0,1,2,3,4,5
0,0.052632,0.213333,0.2,0.040984,0.003901,0.003851
1,0.060429,0.16,0.2,0.043394,0.0024,0.00235
2,0.031189,0.233333,0.0,0.024108,0.0018,0.003413
3,0.064327,0.233333,0.2,0.043394,0.0019,0.003764
4,0.125341,0.266667,0.6,0.105593,0.006701,0.010018


### Self Organizing Maps
Получим кластеры по всем данным, используем 50x50=2500 начальных центров кластеров. Используется только часть набора данных, ввиду ограниченных возможностей пк

In [67]:
train_data_mm=train_data_mm[0:100000]
#test_data_mm=test_data_mm[0:5000]


In [69]:
model_labels = []
np.random.seed(42)
som = SOM(m=50, n=50, dim=len(train_data_mm.columns), max_iter=1000)
model_labels.append(som.fit_predict(np.array(train_data_mm), epochs=50, shuffle=False))

In [70]:
np.random.seed(15)
som = SOM(m=50, n=50, dim=len(train_data_mm.columns), max_iter=1000)
model_labels.append(som.fit_predict(np.array(train_data_mm), epochs=50, shuffle=False))

In [71]:
np.random.seed(1)
som = SOM(m=50, n=50, dim=len(train_data_mm.columns), max_iter=1000)
model_labels.append(som.fit_predict(np.array(train_data_mm), epochs=50, shuffle=False))

In [72]:
np.random.seed(88)
som = SOM(m=50, n=50, dim=len(train_data_mm.columns), max_iter=1000)
model_labels.append(som.fit_predict(np.array(train_data_mm), epochs=50, shuffle=False))

### DBSCAN

In [73]:
dbscan = DBSCAN(n_jobs=-1, eps=0.02)
model_labels.append(dbscan.fit_predict(train_data_mm))

## Присвоение меток кластеров тестовым данным

In [74]:
test_model_labels = []
np.random.seed(42)
som = SOM(m=50, n=50, dim=len(test_data_mm.columns), max_iter=1000)
test_model_labels.append(som.fit_predict(np.array(test_data_mm), epochs=50, shuffle=False))

In [75]:
np.random.seed(15)
som = SOM(m=50, n=50, dim=len(test_data_mm.columns), max_iter=1000)
test_model_labels.append(som.fit_predict(np.array(test_data_mm), epochs=50, shuffle=False))

In [76]:
np.random.seed(1)
som = SOM(m=50, n=50, dim=len(test_data_mm.columns), max_iter=1000)
test_model_labels.append(som.fit_predict(np.array(test_data_mm), epochs=50, shuffle=False))

In [77]:
np.random.seed(88)
som = SOM(m=50, n=50, dim=len(test_data_mm.columns), max_iter=1000)
test_model_labels.append(som.fit_predict(np.array(test_data_mm), epochs=50, shuffle=False))

In [85]:
dbscan = DBSCAN(n_jobs=-1, eps=0.02)
test_model_labels.append(dbscan.fit_predict(test_data_mm))



Далее рассматривается предсказание класса на основе DBScan, предсказание на основе ансамбля моделей планируется доработать позже

In [81]:
for i in range(5):
    train_data_mm["label" + str(i)] = model_labels[i]
train_data_mm["target"] = train_data["target"]

Метки для других моделей верменно удалены, target4 - dbscan

In [116]:
train_data_mm.head()

Unnamed: 0,0,1,2,3,4,5,label4,target,target4
0,0.097782,0.333333,0.6,0.108352,0.002078,0.002071,0,1,3.595899
1,0.032258,0.333333,0.2,0.038826,0.000522,0.000893,1,2,2.626943
2,0.029839,0.213333,0.0,0.038375,0.000544,0.000538,2,2,2.81557
3,0.072581,0.333333,0.6,0.106095,0.000744,0.001127,0,2,3.595899
4,0.092742,0.333333,0.6,0.106095,0.001745,0.001738,0,3,3.595899


Присвоим тестовым данным кластеры DBSCAN. Метки для дргих моделей верменно удалены, target4 - dbscan

In [87]:
for i in range(5):
    test_data_mm["label" + str(i)] = test_model_labels[i]

### Предсказание класса
Выбор наиболее популярногоr класса в кластере

In [126]:
som_labels=test_data_mm["label4"]

groups = train_data_mm.groupby(["label4","target"]).count()[0]
clusters_popular = [0]*len(som_labels)
clusters_class = [0]*len(som_labels)
for group in groups.iteritems():
    items = group[1]
    cluster = group[0][0]
    if items > clusters_popular[cluster]:
        clusters_popular[cluster] = items
        clusters_class[cluster] = group[0][1]
test_data_mm["target4"] = test_data_mm["label4"].apply(lambda x:clusters_class[x])



In [127]:
test_data_mm.head()

Unnamed: 0,0,1,2,3,4,5,label4,target4
0,0.052632,0.213333,0.2,0.040984,0.003901,0.003851,0,5
1,0.060429,0.16,0.2,0.043394,0.0024,0.00235,1,1
2,0.031189,0.233333,0.0,0.024108,0.0018,0.003413,2,3
3,0.064327,0.233333,0.2,0.043394,0.0019,0.003764,3,3
4,0.125341,0.266667,0.6,0.105593,0.006701,0.010018,4,3
