## File descriptions

    train.csv - the training set
    test.csv - the test set
    sampleSubmission.csv - a sample submission file in the correct format

## Data fields

    Id - идентификационный номер квартиры
    DistrictId - идентификационный номер района
    Rooms - количество комнат
    Square - площадь
    LifeSquare - жилая площадь
    KitchenSquare - площадь кухни
    Floor - этаж
    HouseFloor - количество этажей в доме
    HouseYear - год постройки дома
    Ecology_1, Ecology_2, Ecology_3 - экологические показатели местности
    Social_1, Social_2, Social_3 - социальные показатели местности
    Healthcare_1, Helthcare_2 - показатели местности, связанные с охраной здоровья
    Shops_1, Shops_2 - показатели, связанные с наличием магазинов, торговых центров
    Price - цена квартиры 

In [1]:
import pandas as pd
import numpy as np
import sklearn.linear_model as lr
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
lr = LinearRegression()

In [4]:
ORIGINAL_DATASET_PATH = './data/train.csv' # Оригинальный датасет для обучения
PREPARED_DATASET_PATH = './data/prep_train.csv' # Очищенный датасет
TEST_DATASET_PATH = './data/test.csv' # Данные для проверки модели

In [5]:
orig_df = pd.read_csv(ORIGINAL_DATASET_PATH)
orig_test_df = pd.read_csv(TEST_DATASET_PATH)
X = orig_df.drop('Price', axis=1)
y = orig_df['Price']
X_final = pd.read_csv(TEST_DATASET_PATH)
preds_final = pd.DataFrame()
preds_final['Id'] = X_final['Id'].copy()
X.set_index('Id', inplace=True)
X_final.set_index('Id', inplace=True)

In [6]:
X['DistrictId'] = X['DistrictId']
X_final['DistrictId'] = X_final['DistrictId']

In [7]:
# orig_df.head(10)

In [8]:
# orig_test_df.describe()

In [9]:
# X[X['KitchenSquare']>100]

In [10]:
# X[X['Rooms']==3.0].mean()

In [11]:
# orig_test_df.head(20)

X['Old_Square'] = X['Square']
X['Old_LifeSquare'] = X['LifeSquare']
X['Old_Square_summ'] = X['Old_Square']-X['Old_LifeSquare']-X['KitchenSquare']

X.loc[X['Old_Square_summ']<0,('LifeSquare','Square')]

X.loc[((X['Square']-X['LifeSquare'])<0),'Square'] = X['Old_LifeSquare']
X.loc[((X['Square']-X['LifeSquare'])<0),'LifeSquare'] = X['Old_Square']


X[X['Old_Square_summ']<0]

X['Square_summ'] = X['Square']-X['LifeSquare']

X[X['Square_summ']<1]

orig_df['Square'].value_counts()

In [12]:
class FeatureClear:
    
    def __init__(self):
        self.medians=None
        self.modas=None
        
    def fit(self, X):
        self.medians = X.median()
        self.modas = X.mode().head(1).iloc[0]
    
    def transform(self, X):
        
        X['Correction_count'] = 0
        # Rooms
        X['Rooms_outlier'] = 0
        X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 6), 'Rooms_outlier'] = 1
        X.loc[X['Rooms_outlier']==1, 'Correction_count'] = X['Correction_count'] + 1
        
        X.loc[X['Rooms'] == 0, 'Rooms'] = 1
        X.loc[X['Rooms'] >= 6, 'Rooms'] = self.modas['Rooms']
        
        # KitchenSquare
        X.loc[X['KitchenSquare'] < 3, 'KitchenSquare'] = 3
        X.loc[X['KitchenSquare'] > 50, 'KitchenSquare'] = self.medians['KitchenSquare']
        
        # HouseFloor, Floor
        X['HouseFloor_outlier'] = 0
        
        X.loc[X['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1
        X.loc[X['Floor'] > X['HouseFloor'], 'HouseFloor_outlier'] = 1
        X.loc[(X['Floor'] > X['HouseFloor'])|(X['HouseFloor'] == 0), 'Correction_count'] = X['Correction_count'] + 1
        
        X.loc[X['HouseFloor'] == 0, 'HouseFloor'] = self.medians['HouseFloor']
        X.loc[X['Floor'] > X['HouseFloor'], 'Floor'] = self.medians['HouseFloor']
        
        # HouseYear
        current_year = now = datetime.datetime.now().year
        
        X['HouseYear_outlier'] = 0
        X.loc[X['HouseYear'] > current_year, 'HouseYear_outlier'] = 1       
        X.loc[X['HouseYear'] > current_year, 'HouseYear'] = self.modas['HouseYear']
        
        # Healthcare_1
        if 'Healthcare_1' in X.columns:
            X.drop('Healthcare_1', axis=1, inplace=True)
            
        # LifeSquare
        X['Old_Square'] = X['Square']
        X['LifeSquare_nan'] = X['LifeSquare'].isna() * 1
        X['Old_LifeSquare'] = X['LifeSquare']
        X['Old_Square_summ'] = X['Old_Square']-X['Old_LifeSquare']
        X.loc[((X['Square']-X['LifeSquare'])<0),'Square'] = X['Old_LifeSquare']
        X.loc[((X['Square']-X['LifeSquare'])<0),'LifeSquare'] = X['Old_Square']
        X['LifeSquare_outlier'] = 0
        X.loc[(X['LifeSquare'].isna())|(X['Old_Square_summ']<0), 'LifeSquare_outlier'] = 1
        
        condition = (X['LifeSquare'].isna()) &\
                      (~X['Square'].isna()) & \
                      (~X['KitchenSquare'].isna())
        
        X.loc[condition, 'LifeSquare'] = X.loc[condition, 'Square'] - X.loc[condition, 'KitchenSquare'] - 3
        
        
        return X

In [13]:
class FeatureGenetator():
    """Генерация новых фич"""
    
    def __init__(self):
        self.DistrictId_counts = None
        self.binary_to_numbers = None
        self.med_price_by_district = None
        self.med_price_by_floor_year = None
        
    def fit(self, X, y=None):
        
        X = X.copy()
        
        # DistrictID
        district = X['DistrictId'].value_counts()
        district = district[district > 20]  ## Если вы этого не сделаете, то на Leaderboard-е улетите в самый низ (>100 места, R2 ~ 0.65),
                                            ## Хотя на локальной валидации скор вырастет
        
        self.DistrictId_counts = dict(district)
        
        # Binary features
        self.binary_to_numbers = {'A': 0, 'B': 1}
        
        # Target encoding
        ## District
        df = X.copy()
        
        if y is not None:
            df['Price'] = y.values
            
            df['DistrictId_popular'] = df['DistrictId'].copy()
            df.loc[~df['DistrictId_popular'].isin(district.keys().tolist())] = np.nan
            
            self.med_price_by_district = df.groupby(['DistrictId_popular', 'Rooms'], as_index=False).agg({'Price':'median'}).\
                                            rename(columns={'Price':'MedPriceByDistrict',
                                                           'DistrictId_popular': 'DistrictId'})
            
        ## floor, year
        if y is not None:
            df['Price'] = y.values
            df = self.floor_to_cat(df)
            df = self.year_to_cat(df)
            self.med_price_by_floor_year = df.groupby(['year_cat', 'floor_cat'], as_index=False).agg({'Price':'median'}).\
                                            rename(columns={'Price':'MedPriceByFloorYear'})
        if y is not None:
            df['Price'] = y.values
            df['Meter_price']=df['Price']/df['Square']
            df = self.floor_to_cat(df)
            df = self.year_to_cat(df)
            self.med_meter_price_by_floor_year = df.groupby(['year_cat', 'floor_cat'], as_index=False).agg({'Meter_price':'median'}).\
                                            rename(columns={'Meter_price':'MedMeterPriceByFloorYear'})

        
    def transform(self, X):
        
        # DistrictId
        X['DistrictId_count'] = X['DistrictId'].map(self.DistrictId_counts)  # self.DistrictId_counts = {'id': value}
        
        X['new_district'] = 0
        X.loc[X['DistrictId_count'].isna(), 'new_district'] = 1
        
        X['DistrictId_count'].fillna(5, inplace=True)
        
        # Binary features
        X['Ecology_2'] = X['Ecology_2'].map(self.binary_to_numbers)  # self.binary_to_numbers = {'A': 0, 'B': 1}
        X['Ecology_3'] = X['Ecology_3'].map(self.binary_to_numbers)
        X['Shops_2'] = X['Shops_2'].map(self.binary_to_numbers)
        
        # More categorical features
        X = self.floor_to_cat(X)  # + столбец flooar_cat
        X = self.year_to_cat(X)   # + столбец year_cat
        
        # Target encoding
        if self.med_price_by_district is not None:
            X = X.merge(self.med_price_by_district, on=['DistrictId', 'Rooms'], how='left')
        if self.med_price_by_floor_year is not None:
            X = X.merge(self.med_price_by_floor_year, on=['year_cat', 'floor_cat'], how='left')
        
        return X
    
    @staticmethod
    def floor_to_cat(X):
        
        X['floor_cat'] = np.nan
        
        X.loc[X['Floor'] < 3, 'floor_cat'] = 1  
        X.loc[(X['Floor'] >= 3) & (X['Floor'] <= 5), 'floor_cat'] = 2
        X.loc[(X['Floor'] > 5) & (X['Floor'] <= 9), 'floor_cat'] = 3
        X.loc[(X['Floor'] > 9) & (X['Floor'] <= 15), 'floor_cat'] = 4
        X.loc[X['Floor'] > 15, 'floor_cat'] = 5
            
        return X
     
    @staticmethod
    def year_to_cat(X):
        
        X['year_cat'] = np.nan
        
        X.loc[X['HouseYear'] < 1941, 'year_cat'] = 1
        X.loc[(X['HouseYear'] >= 1941) & (X['HouseYear'] <= 1945), 'year_cat'] = 2
        X.loc[(X['HouseYear'] > 1945) & (X['HouseYear'] <= 1980), 'year_cat'] = 3
        X.loc[(X['HouseYear'] > 1980) & (X['HouseYear'] <= 2000), 'year_cat'] = 4
        X.loc[(X['HouseYear'] > 2000) & (X['HouseYear'] <= 2010), 'year_cat'] = 5
        X.loc[(X['HouseYear'] > 2010), 'year_cat'] = 6
            
        return X

In [14]:
clearer = FeatureClear()

In [15]:
clearer.fit(X)

In [16]:
X = clearer.transform(X)
X_final = clearer.transform(X_final)

In [17]:
X.dtypes

DistrictId              int64
Rooms                 float64
Square                float64
LifeSquare            float64
KitchenSquare         float64
Floor                 float64
HouseFloor            float64
HouseYear             float64
Ecology_1             float64
Ecology_2              object
Ecology_3              object
Social_1                int64
Social_2                int64
Social_3                int64
Helthcare_2             int64
Shops_1                 int64
Shops_2                object
Correction_count        int64
Rooms_outlier           int64
HouseFloor_outlier      int64
HouseYear_outlier       int64
Old_Square            float64
LifeSquare_nan          int64
Old_LifeSquare        float64
Old_Square_summ       float64
LifeSquare_outlier      int64
dtype: object

In [18]:
X_final.describe()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,...,Shops_1,Correction_count,Rooms_outlier,HouseFloor_outlier,HouseYear_outlier,Old_Square,LifeSquare_nan,Old_LifeSquare,Old_Square_summ,LifeSquare_outlier
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,3959.0,3959.0,5000.0
mean,51.2792,1.9058,56.591223,39.861525,6.4378,8.8514,12.9416,1984.3926,0.119874,24.9338,...,4.2428,0.1778,0.001,0.1768,0.0,56.4495,0.2082,36.15881,19.23917,0.2564
std,44.179466,0.806137,19.407324,19.453834,3.291489,4.99014,6.466881,18.573149,0.12007,17.532202,...,4.777365,0.382905,0.03161,0.381538,0.0,19.092787,0.406061,17.825287,14.033705,0.436689
min,0.0,1.0,1.378543,0.33349,3.0,1.0,1.0,1908.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.378543,0.0,0.33349,-254.357651,0.0
25%,21.0,1.0,41.98902,25.483923,3.0,5.0,9.0,1973.0,0.019509,6.0,...,1.0,0.0,0.0,0.0,0.0,41.906231,0.0,23.092026,13.717367,0.0
50%,37.0,2.0,52.933734,35.570732,6.0,8.0,13.0,1977.0,0.072158,25.0,...,3.0,0.0,0.0,0.0,0.0,52.92134,0.0,32.925087,18.339225,0.0
75%,77.0,2.0,66.416189,48.875389,9.0,13.0,17.0,2000.0,0.195781,36.0,...,6.0,0.0,0.0,0.0,0.0,66.285129,0.0,45.174091,24.277162,1.0
max,212.0,5.0,303.071094,303.071094,44.0,46.0,99.0,2020.0,0.521867,74.0,...,23.0,2.0,1.0,1.0,0.0,223.453689,1.0,303.071094,135.89928,1.0


In [19]:
X.describe()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,...,Shops_1,Correction_count,Rooms_outlier,HouseFloor_outlier,HouseYear_outlier,Old_Square,LifeSquare_nan,Old_LifeSquare,Old_Square_summ,LifeSquare_outlier
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,7887.0,7887.0,10000.0
mean,50.4008,1.8876,57.223484,40.727374,6.4489,8.8122,12.9591,1984.8638,0.118858,24.687,...,4.2313,0.1837,0.0012,0.1825,0.0002,56.315775,0.2113,37.199645,18.034615,0.2595
std,43.587592,0.811438,77.30942,77.609523,3.340854,4.897278,6.44346,18.409979,0.119025,17.532614,...,4.806341,0.388805,0.034622,0.386275,0.014141,21.058732,0.408251,86.241209,84.638864,0.438382
min,0.0,1.0,2.377248,0.370619,3.0,1.0,1.0,1910.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.136859,0.0,0.370619,-7398.897712,0.0
25%,20.0,1.0,41.866858,25.279879,3.0,5.0,9.0,1974.0,0.017647,6.0,...,1.0,0.0,0.0,0.0,0.0,41.774881,0.0,22.769832,13.680048,0.0
50%,36.0,2.0,52.633651,35.518682,6.0,8.0,13.0,1977.0,0.075424,25.0,...,3.0,0.0,0.0,0.0,0.0,52.51331,0.0,32.78126,18.4641,0.0
75%,75.0,2.0,66.004634,48.690336,9.0,13.0,17.0,2001.0,0.195781,36.0,...,6.0,0.0,0.0,0.0,0.0,65.900625,0.0,45.128803,24.532159,1.0
max,209.0,5.0,7480.592129,7480.592129,48.0,42.0,117.0,2020.0,0.521867,74.0,...,23.0,2.0,1.0,1.0,1.0,641.065193,1.0,7480.592129,125.041526,1.0


In [20]:
features = FeatureGenetator()

features.fit(X, y)

X = features.transform(X)
X_final = features.transform(X_final)

In [21]:
X.dtypes

DistrictId               int64
Rooms                  float64
Square                 float64
LifeSquare             float64
KitchenSquare          float64
Floor                  float64
HouseFloor             float64
HouseYear              float64
Ecology_1              float64
Ecology_2                int64
Ecology_3                int64
Social_1                 int64
Social_2                 int64
Social_3                 int64
Helthcare_2              int64
Shops_1                  int64
Shops_2                  int64
Correction_count         int64
Rooms_outlier            int64
HouseFloor_outlier       int64
HouseYear_outlier        int64
Old_Square             float64
LifeSquare_nan           int64
Old_LifeSquare         float64
Old_Square_summ        float64
LifeSquare_outlier       int64
DistrictId_count       float64
new_district             int64
floor_cat              float64
year_cat               float64
MedPriceByDistrict     float64
MedPriceByFloorYear    float64
dtype: o



# Удаляю, просто чтобы было быстрее строить модель. Возможно, вам стоит что-то сделать с не количественными признаками
remain_features = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'Ecology_1', 'Social_1', 'Shops_1',
                  'Rooms_outlier', 'HouseFloor_outlier', 'HouseYear_outlier', 'LifeSquare_nan',
                   'HouseFloor', 'HouseYear',
                  
                  'DistrictId_count', 'new_district', 'Ecology_2', 'Ecology_3', 'Shops_2',
                  'Correction_count']

X = X[remain_features]
X_final  = X_final[remain_features]

In [22]:
to_del_list = ["MedPriceByDistrict","MedPriceByFloorYear","LifeSquare","Old_Square","Old_LifeSquare","Old_Square_summ"]
X.drop(to_del_list, axis=1, inplace=True)
X_final.drop(to_del_list, axis=1, inplace=True)

In [23]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)

In [24]:
model_rfr = RandomForestRegressor(max_depth=15, max_features = 10, n_estimators=600,random_state=42)

In [25]:
model_lgbm = LGBMRegressor(max_depth=5,
                             min_samples_leaf=2,
                             n_estimators=350,
                             random_state=42)

In [26]:
model3 = GradientBoostingRegressor(n_estimators=350, max_depth=5, min_samples_split=4,
                                           learning_rate=0.1, loss='ls', random_state=42)

In [27]:
er = VotingRegressor([('model1', model_lgbm), ('model2', model_rfr),('model3', model3)])

In [28]:
er.fit(X, y)

VotingRegressor(estimators=[('model1',
                             LGBMRegressor(boosting_type='gbdt',
                                           class_weight=None,
                                           colsample_bytree=1.0,
                                           importance_type='split',
                                           learning_rate=0.1, max_depth=5,
                                           min_child_samples=20,
                                           min_child_weight=0.001,
                                           min_samples_leaf=2,
                                           min_split_gain=0.0, n_estimators=350,
                                           n_jobs=-1, num_leaves=31,
                                           objective=None, random_state=42,
                                           reg_alpha=0.0, reg_lambda=0.0,
                                           s...
                                                       loss='ls', max_depth=5,
    

In [29]:
cv_score = cross_val_score(er, X, y, 
                           scoring='r2', 
                           n_jobs=-1,
                           cv=KFold(n_splits=5, shuffle=True, random_state=42))
# cv_score
mean = cv_score.mean()
std = cv_score.std()

print('R2: {:.6f} +- {:.6f}'.format(mean, std))

R2: 0.760445 +- 0.017242


In [None]:
y_pred_final = er.predict(X_final)
preds_final['Price'] = y_pred_final
preds_final.to_csv('predictions_er_c2_v5.csv', index=False)
!kaggle competitions submit -c realestatepriceprediction -m "VotingRegressor 3 models v5 add meter future" -f predictions_er_c2_v5.csv