In [1]:
import warnings
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

In [2]:
%config InlineBackend.figure_format='svg'
%matplotlib inline
warnings.filterwarnings('ignore')
large = 16; med = 10; small = 6
params = {'axes.titlesize': small,
          'legend.fontsize': small,
          'figure.figsize': (12,6),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': small,
          'ytick.labelsize': small,
          'figure.titlesize': small,
          'font.size': small}
plt.rcParams.update(params)
plt.style.use('seaborn-whitegrid')
sns.set_style("whitegrid")

In [3]:
def train_preprocessing(input_db):
    """Clean the training frame and add engineered features.

    The cleaning steps that run before ``Social_2`` is dropped assign columns
    on ``input_db`` itself, so the caller's frame is modified in place (the
    notebook later passes that same frame to ``test_preprocessing`` as the
    source of training statistics). Everything from the drop onwards happens
    on a new frame, which is the one returned.

    Parameters
    ----------
    input_db : pandas.DataFrame
        Training data. Columns read: ``Rooms``, ``Square``, ``LifeSquare``,
        ``KitchenSquare``, ``Floor``, ``HouseFloor``, ``HouseYear``,
        ``Ecology_2``, ``Ecology_3``, ``Shops_2``, ``Social_2``,
        ``Healthcare_1``, ``Helthcare_2``, ``DistrictId`` and ``Price``.

    Returns
    -------
    pandas.DataFrame
        Frame with the original index, without ``Social_2``, and with the
        extra columns ``DistrictSize``, ``PriceOneRoom``,
        ``PriceOneRoomByDistrict``, ``PriceOneMeter``,
        ``PriceOneMeterByDistrict`` and ``RoomSquare``.

    Notes
    -----
    Replacement floors are drawn with ``random.randint``; seed the ``random``
    module beforehand for reproducible output.
    """
    # --- Rooms: cast to integer and remap implausible room counts ---------
    input_db['Rooms'] = (input_db['Rooms'].astype('int64')
                         .replace({0: 2, 6: 2, 10: 3, 19: 2}))

    # --- LifeSquare: fill gaps with |Square - KitchenSquare|, then replace
    # values above 200 with the column mean (taken before the replacement).
    life = input_db['LifeSquare']
    life = life.mask(life.isna(),
                     (input_db['Square'] - input_db['KitchenSquare']).abs())
    life = life.mask(life > 200, life.mean())

    # --- Square: must be at least LifeSquare; values above 200 -> mean -----
    kitchen = input_db['KitchenSquare']
    square = input_db['Square']
    square = square.mask(life > square, life + kitchen)
    square = square.mask(square > 200, square.mean())

    # --- KitchenSquare: values above 250 -> column mean --------------------
    kitchen = kitchen.mask(kitchen > 250, kitchen.mean())

    # Re-check Square after the area columns were adjusted.
    square = square.mask((kitchen > square) | (life > square), life + kitchen)

    input_db['LifeSquare'] = life
    input_db['Square'] = square
    input_db['KitchenSquare'] = kitchen

    # --- HouseFloor / Floor -------------------------------------------------
    house_floor = input_db['HouseFloor'].astype('int64')
    no_house_floor = house_floor == 0
    if no_house_floor.any():
        # Zero storeys -> most frequent HouseFloor value.
        house_floor = house_floor.mask(no_house_floor, house_floor.mode()[0])
    input_db['HouseFloor'] = house_floor

    floor = input_db['Floor'].copy()
    zero_floor = floor == 0
    if zero_floor.any():
        floor = floor.mask(zero_floor, floor.mode()[0])
    high_floor = floor > 30
    if high_floor.any():
        floor = floor.mask(high_floor, floor.mode()[0])
    above_roof = floor > house_floor
    if above_roof.any():
        # A flat cannot be above the top storey: draw a floor in
        # [1, HouseFloor]; the upper bound is clamped to 1 so randint
        # never receives an empty range.
        floor.loc[above_roof] = [random.randint(1, max(int(top), 1))
                                 for top in house_floor[above_roof]]
    input_db['Floor'] = floor

    # --- HouseYear: cap at 2020 ----------------------------------------------
    input_db['HouseYear'] = input_db['HouseYear'].clip(upper=2020)

    # --- Letter-coded flags -> integers --------------------------------------
    # Values other than 'A'/'B' pass through unchanged, so running this on
    # an already-encoded frame is harmless.
    binary_codes = {'A': 0, 'B': 1}
    for name in ('Ecology_2', 'Ecology_3', 'Shops_2'):
        input_db[name] = input_db[name].map(
            lambda value: binary_codes.get(value, value))

    # --- Drop Social_2; from here on `input_db` is a new frame --------------
    input_db = input_db.drop(columns='Social_2')

    # --- Healthcare_1: fill gaps with the column mean ------------------------
    input_db['Healthcare_1'] = input_db['Healthcare_1'].fillna(
        input_db['Healthcare_1'].mean())

    # --- Helthcare_2: group raw values into three codes ----------------------
    # 0 -> 0, {1, 2, 3, 6} -> 1, {4, 5} -> 2; anything else is left as is.
    # Both membership tests use the raw column, so the groups cannot chain.
    care = input_db['Helthcare_2']
    input_db['Helthcare_2'] = (care.mask(care.isin([1, 2, 3, 6]), 1)
                               .mask(care.isin([4, 5]), 2))

    # --- DistrictSize ---------------------------------------------------------
    # Districts with at most 100 rows are pooled under the id 999; the pooled
    # bucket's size is the median size of its member districts.
    district_size = (input_db['DistrictId'].value_counts()
                     .rename_axis('DistrictId')
                     .reset_index(name='DistrictSize'))
    districts_popular = district_size.loc[
        district_size['DistrictSize'] > 100, 'DistrictId'].tolist()
    district_size.loc[
        ~district_size['DistrictId'].isin(districts_popular), 'DistrictId'] = 999
    district_size = district_size.groupby(
        'DistrictId', as_index=False)['DistrictSize'].median()
    input_db.loc[
        ~input_db['DistrictId'].isin(districts_popular), 'DistrictId'] = 999
    input_db = (input_db.merge(district_size, on='DistrictId', how='left')
                .set_index(input_db.index))

    # --- PriceOneRoomByDistrict ----------------------------------------------
    input_db['PriceOneRoom'] = input_db['Price'] / input_db['Rooms']
    price_room_district = (
        input_db.groupby('DistrictId', as_index=False)['PriceOneRoom'].median()
        .rename(columns={'PriceOneRoom': 'PriceOneRoomByDistrict'}))
    input_db = (input_db.merge(price_room_district, on='DistrictId', how='left')
                .set_index(input_db.index))

    # --- PriceOneMeterByDistrict ---------------------------------------------
    input_db['PriceOneMeter'] = input_db['Price'] / input_db['Square']
    price_meter_district = (
        input_db.groupby('DistrictId', as_index=False)['PriceOneMeter'].median()
        .rename(columns={'PriceOneMeter': 'PriceOneMeterByDistrict'}))
    input_db = (input_db.merge(price_meter_district, on='DistrictId', how='left')
                .set_index(input_db.index))

    # --- RoomSquare -----------------------------------------------------------
    input_db['RoomSquare'] = input_db['Square'] / input_db['Rooms']
    return input_db

In [4]:
def test_preprocessing(input_db, train_db):
#     Rooms - Квартиры
#     Преобразовываем типы
    input_db['Rooms'] = input_db['Rooms'].astype('int64')
    
#     Замена значений
    input_db['Rooms'].replace(0, 2, inplace=True)
    input_db['Rooms'].replace(6, 2, inplace=True)
    input_db['Rooms'].replace(10, 3, inplace=True)
    input_db['Rooms'].replace(19, 2, inplace=True)
    
#     Замена пропущенных значений LifeSquare
    count = 0
    for _, row in input_db.iterrows():
        if math.isnan(row['LifeSquare']):
            input_db['LifeSquare'].iloc[count] = abs(input_db['Square'].iloc[count] - input_db['KitchenSquare'].iloc[count])
        count += 1
        
    count = 0
    for _, row in input_db.iterrows():
        if row['LifeSquare'] > 200:
            input_db['LifeSquare'].iloc[count] = train_db['LifeSquare'].mean()
    count += 1
        
#     Обрабатываем данные Square
    count = 0
    for _, row in input_db.iterrows():
        if row['LifeSquare'] > row['Square']:
            input_db['Square'].iloc[count] = input_db['LifeSquare'].iloc[count] + input_db['KitchenSquare'].iloc[count]
        count += 1
        
    count = 0
    for _, row in input_db.iterrows():
        if row['LifeSquare'] > row['Square']:
            input_db['Square'].iloc[count] = input_db['LifeSquare'].iloc[count] + input_db['KitchenSquare'].iloc[count]
        count += 1
    
    count = 0
    for _, row in input_db.iterrows():
        if row['Square'] > 200:
            input_db['Square'].iloc[count] = train_db['Square'].mean()
    count += 1
        
#     Обрабатываем данные KitchenSquare
    count = 0
    for _, row in input_db.iterrows():
        if row['KitchenSquare'] > 250:
            input_db['KitchenSquare'].iloc[count] = train_db['KitchenSquare'].mean()
    count += 1
    
#     Проверка диапазона Square после работы с площадьми
    count = 0
    for _, row in input_db.iterrows():
        if (row['KitchenSquare'] > row['Square']) | (row['LifeSquare'] > row['Square']):
            input_db['Square'].iloc[count] = input_db['LifeSquare'].iloc[count] + input_db['KitchenSquare'].iloc[count]
    count += 1
    
#     Floor, HouseFloor - Этажи
#     Преобразовываем типы
    input_db['HouseFloor'] = input_db['HouseFloor'].astype('int64')

#     Заменяем нули в данных по этажам
    count = 0
    for _, row in input_db.iterrows():
        if row['HouseFloor'] == 0:
            input_db['HouseFloor'].iloc[count] = train_db['HouseFloor'].mode()[0]
    count += 1
    
    count = 0
    for _, row in input_db.iterrows():
        if row['Floor'] == 0:
            input_db['Floor'].iloc[count] = train_db['Floor'].mode()[0]
    count += 1
    
#     Обрабатываем данные Floor
    count = 0
    for _, row in input_db.iterrows():
        if row['Floor'] > 30:
            input_db['Floor'].iloc[count] = train_db['Floor'].mode()[0]
    count += 1
    
    count = 0
    for _, row in input_db.iterrows():
        if row['Floor'] > row['HouseFloor']:
            max_floor = input_db['HouseFloor'].iloc[count]
            input_db['Floor'].iloc[count] = random.randint(1, max_floor)
    count += 1
    
#     Года постройки - HouseYear
#     Обрабатываем данные HouseYear
    count = 0
    for _, row in input_db.iterrows():
        if row['HouseYear'] > 2020:
            input_db['HouseYear'].iloc[count] = 2020
    count += 1
    
#     Странные признаки - Ecology, Shops, HealthCare
#     Замена буквенных параметров на цифровые
    feature_bin_names = ['Ecology_2', 'Ecology_3', 'Shops_2']
    input_db[feature_bin_names] = input_db[feature_bin_names].replace({'A':0, 'B':1})

#     Удаляем Social_2
    input_db = input_db.drop('Social_2', 1)
    
#     Замена пустых значений HealthCare_1
    input_db['Healthcare_1'].fillna(train_db['Healthcare_1'].mean(), inplace=True)
    
#     Группируем значения HealthCare_2
    count = 0
    for _, row in input_db.iterrows():
        if row['Helthcare_2'] == 0:
            input_db['Helthcare_2'].iloc[count] = 'A'
        elif row['Helthcare_2'] == 1 or row['Helthcare_2'] == 2 or row['Helthcare_2'] == 3 or row['Helthcare_2'] == 6:
            input_db['Helthcare_2'].iloc[count] = 'B'
        elif row['Helthcare_2'] == 4 or row['Helthcare_2'] == 5:
            input_db['Helthcare_2'].iloc[count] = 'C'
    count += 1
    
    input_db['Helthcare_2'] = input_db['Helthcare_2'].replace({'A':0, 'B':1, 'C':2})
    
#     Создаем новые значения
#     DistrictSize
    district_size = train_db['DistrictId'].value_counts().reset_index()\
                .rename(columns={'index':'DistrictId', 'DistrictId':'DistrictSize'})
    districts_popular = district_size.loc[district_size['DistrictSize'] > 100, 'DistrictId'].tolist()
    district_size.loc[~district_size['DistrictId'].isin(districts_popular), 'DistrictId'] = 999
    district_size = district_size.groupby('DistrictId')['DistrictSize'].agg(DistrictSize='median')
    district_size.reset_index(level='DistrictId', inplace=True)
    input_db.loc[~input_db['DistrictId'].isin(districts_popular), 'DistrictId'] = 999
    input_db = input_db.merge(district_size, on='DistrictId', how='left').set_index(input_db.index)

#     PriceOneRoomByDistrict
    train_db['PriceOneRoom'] = train_db['Price'] / train_db['Rooms']
    price_room_district = train_db.groupby(['DistrictId'], as_index=False)\
                        .agg({'PriceOneRoom':'median'})\
                        .rename(columns={'PriceOneRoom':'PriceOneRoomByDistrict'})
    input_db = input_db.merge(price_room_district, on=['DistrictId'], how='left').set_index(input_db.index)
    
#     PriceOneMeterByDistrict
    train_db['PriceOneMeter'] = train_db['Price'] / train_db['Square']
    price_meter_district = train_db.groupby(['DistrictId'], as_index=False)\
                        .agg({'PriceOneMeter':'median'})\
                        .rename(columns={'PriceOneMeter':'PriceOneMeterByDistrict'})
    input_db = input_db.merge(price_meter_district, on=['DistrictId'], how='left').set_index(input_db.index)
    
#     RoomSquare
    input_db['RoomSquare'] = input_db['Square'] / input_db['Rooms']
    return input_db

In [5]:
def target_preprocessing(input_col):
    """Return ``log(1 + x)`` of the target values (element-wise for arrays/Series)."""
    return np.log1p(input_col)

In [6]:
def recovery_target(input_col):
    """Return ``exp(x) - 1`` of the given values, mapping log1p-scaled targets back."""
    return np.expm1(input_col)

In [7]:
# Load the training data; the `Id` column becomes the frame's index.
db_review = pd.read_csv('train.csv', index_col='Id')

In [8]:
# Clean the training frame and add engineered features. Besides returning a
# new frame, the call assigns cleaned columns on `db_review` itself.
db_review_prep = train_preprocessing(db_review)

In [9]:
# Move the target to log1p scale for training.
# NOTE: not idempotent - re-running this cell applies log1p to values that
# were already transformed.
db_review_prep['Price'] = target_preprocessing(db_review_prep['Price'])

In [10]:
target = 'Price'
# Per-row price ratios are derived directly from the target and are not
# produced for the test frame, so they cannot be model inputs.
target_derived = ['PriceOneMeter', 'PriceOneRoom']

# Rank the remaining columns by absolute correlation with the target. The
# target and the target-derived columns are removed by label instead of
# relying on the target happening to sort first.
candidates = db_review_prep.loc[:, db_review_prep.columns != 'Id']
features = (
    candidates
    .drop(columns=[target] + target_derived)
    .corrwith(db_review_prep[target])
    .abs()
    .sort_values(ascending=False)
    .index
    .tolist()
)
len(features)

21

In [11]:
# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
# X_test / y_test are not referenced by any later cell visible here.
X_train, X_test, y_train, y_test = train_test_split(
    db_review_prep[features], db_review_prep[target], test_size=0.3, random_state=42)

In [12]:
# Hyper-parameters of the gradient-boosting regressor, kept in one place.
lgbm_params = dict(
    learning_rate=0.01,
    n_estimators=4000,
    num_leaves=4,
    max_bin=90,
)
lgbm_regressor_model = LGBMRegressor(**lgbm_params)
# Fit on the training split; the fitted estimator is the cell's output.
lgbm_regressor_model.fit(X_train, y_train)

LGBMRegressor(learning_rate=0.01, max_bin=90, n_estimators=4000, num_leaves=4)

In [13]:
# Registry of fitted models, keyed by a display name.
models_dict = {}

In [14]:
# Register the fitted LightGBM model.
models_dict['LGBMRegressor'] = lgbm_regressor_model

In [15]:
# Model used for the final predictions (the only one registered in the cells shown).
best_model = models_dict['LGBMRegressor']

In [16]:
# Load the test data; the `Id` column becomes the frame's index.
db_test = pd.read_csv('test.csv', index_col='Id')

In [17]:
# Preprocess the test frame. `db_review` (the training frame that the
# train_preprocessing call above modified in place) supplies the training statistics.
db_test_prep = test_preprocessing(db_test, db_review)

In [18]:
# Predict with the same feature list used for fitting; the model was fitted
# on the log1p-scaled target, so these values are on that scale.
db_test_prep['Price'] = best_model.predict(db_test_prep[features])

In [19]:
# Map the predictions back from log1p scale.
# NOTE: not idempotent - re-running this cell applies expm1 a second time.
db_test_prep['Price'] = recovery_target(db_test_prep['Price'])

In [20]:
# Expose the index as a regular `Id` column for the output file.
db_test_prep['Id'] = db_test_prep.index

In [21]:
# Write the two-column prediction file. `index` is a boolean flag; False
# omits the frame's index, since `Id` is already written as a regular column.
db_test_prep[['Id', 'Price']].to_csv('IGridchin_predictions.csv', index=False)