In [2]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV

# Locations of the input CSV files, relative to the notebook's working directory.
# TRAIN_DATASET_PATH is read into `train_df`, TEST_DATASET_PATH into `test_df`.
TRAIN_DATASET_PATH = './Kurs_project_task/train.csv'
TEST_DATASET_PATH = './Kurs_project_task/test.csv'

  from numpy.core.umath_tests import inner1d


In [3]:
# Read both raw tables once; later cells re-read them before feature building,
# so these frames mainly feed the per-district lookup tables.
train_df = pd.read_csv(filepath_or_buffer=TRAIN_DATASET_PATH)
test_df = pd.read_csv(filepath_or_buffer=TEST_DATASET_PATH)

In [4]:
# Number of training rows per district, as a two-column frame
# ['DistrictId', 'DistrictSize'].
# rename_axis + reset_index(name=...) is used instead of renaming the columns
# produced by a bare reset_index(): those column names differ between pandas
# versions ('index'/'DistrictId' before 2.0, 'DistrictId'/'count' from 2.0 on),
# whereas this form yields the same columns on both.
district_size = (
    train_df['DistrictId']
    .value_counts()
    .rename_axis('DistrictId')
    .reset_index(name='DistrictSize')
)

# Median price for every (district, room count) pair seen in the training data.
# NOTE(review): this is computed from the full training frame, including rows
# that are later held out for validation - the validation score may therefore
# be optimistic.
med_price_by_district = (
    train_df
    .groupby(['DistrictId', 'Rooms'], as_index=False)
    .agg({'Price': 'median'})
    .rename(columns={'Price': 'MedPriceByDistrict'})
)

In [5]:
def my_feature(train_df = train_df, train_test = 0,
               split_par = 0.2, kitch_size = 3,room_median = 2,
               HouseFloor = 1, sq_diff = 10,
               district_size = district_size,med_price_by_district = med_price_by_district,
               random_state = None):
    """Clean a raw apartments frame and build the model's feature matrix.

    The input frame and both lookup tables are left untouched: all cleaning is
    done on copies, so the function can be called repeatedly on the same data.

    Parameters
    ----------
    train_df : DataFrame
        Raw rows (training or test). Must contain the columns referenced below;
        'Price' is only required when ``train_test == 1``.
    train_test : int
        1 -> split the result into train/validation parts with
        ``train_test_split``; any other value -> return the whole feature
        matrix as ``X_test`` and empty lists for the other three outputs.
    split_par : float
        ``test_size`` passed to ``train_test_split``.
    kitch_size : number
        Lower bound for 'KitchenSquare'; smaller values are raised to it.
    room_median : number
        Replacement for 'Rooms' values of 0, 10 or 19.
    HouseFloor : number
        Replacement for 'HouseFloor' values equal to 0.
    sq_diff : number
        Margin added when 'Square' is rebuilt from 'LifeSquare' +
        'KitchenSquare' for rows where it was smaller than their sum.
    district_size : DataFrame
        Columns ['DistrictId', 'DistrictSize'].
    med_price_by_district : DataFrame
        Columns ['DistrictId', 'Rooms', 'MedPriceByDistrict'].
    random_state : int or None
        Seed for the random floor re-assignment. ``None`` (default) draws from
        the global ``random`` module, as before.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    # Copy everything we are about to modify so the caller's objects stay intact.
    df = train_df.copy()
    df['Id'] = df['Id'].astype(str)
    df['DistrictId'] = df['DistrictId'].astype(str)
    district_size = district_size.assign(
        DistrictId=district_size['DistrictId'].astype(str))
    med_price_by_district = med_price_by_district.assign(
        DistrictId=med_price_by_district['DistrictId'].astype(str))

    # Implausible room counts are replaced by the supplied typical value.
    df.loc[df['Rooms'].isin([0, 10, 19]), 'Rooms'] = room_median

    # Honour the kitch_size argument (previously a hard-coded 3 was used).
    df.loc[df['KitchenSquare'] < kitch_size, 'KitchenSquare'] = kitch_size
    df.loc[df['HouseFloor'] == 0, 'HouseFloor'] = HouseFloor

    # Apartments located above the top floor get a random floor within the house.
    # int() is required because 'HouseFloor' may be a float column and
    # random.randint rejects float arguments on recent Python versions.
    rng = random if random_state is None else random.Random(random_state)
    floor_outliers = df.index[df['Floor'] > df['HouseFloor']]
    df.loc[floor_outliers, 'Floor'] = (
        df.loc[floor_outliers, 'HouseFloor']
        .apply(lambda top_floor: rng.randint(1, int(top_floor)))
    )

    df.loc[df['HouseYear'] > 2020, 'HouseYear'] = 2019

    # 'Healthcare_1' is not used as a feature; tolerate frames that lack it.
    df = df.drop(columns=['Healthcare_1'], errors='ignore')

    # Total area must cover living + kitchen area; rebuild it where it does not.
    area_too_small = df['Square'] < (df['LifeSquare'] + df['KitchenSquare'])
    df.loc[area_too_small, 'Square'] = (
        df.loc[area_too_small, 'LifeSquare']
        + df.loc[area_too_small, 'KitchenSquare']
        + sq_diff
    )

    # Encode the two-level A/B categoricals as 0/1.
    ab_codes = {'A': 0, 'B': 1}
    df['Ecology_2_bin'] = df['Ecology_2'].replace(ab_codes)
    df['Ecology_3_bin'] = df['Ecology_3'].replace(ab_codes)
    df['Shops_2_bin'] = df['Shops_2'].replace(ab_codes)

    # Left merges keep every input row (in order) and give the result a fresh
    # positional index.
    df = df.merge(district_size, on='DistrictId', how='left')
    df['IsDistrictLarge'] = (df['DistrictSize'] > 100).astype(int)
    df = df.merge(med_price_by_district, on=['DistrictId', 'Rooms'], how='left')

    # NOTE(review): 'Helthcare_2' is spelled exactly as in the original code -
    # presumably the dataset's own column name; confirm before "fixing" it.
    feature_names = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear',
                     'Ecology_1', 'Ecology_2_bin', 'Ecology_3_bin', 'Social_1', 'Social_2', 'Social_3',
                     'Helthcare_2', 'Shops_1', 'Shops_2_bin']
    new_feature_names = ['IsDistrictLarge', 'MedPriceByDistrict']
    target_name = 'Price'

    # .copy() gives the caller an independent frame, so assigning into it later
    # does not raise SettingWithCopyWarning.
    X = df[feature_names + new_feature_names].copy()

    X_train, y_train = [], []
    X_test, y_test = X, []
    if train_test == 1:
        y = df[target_name]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=split_par, shuffle=True, random_state=21)

    return X_train, y_train, X_test, y_test

# Re-read the test set and build its feature matrix with the same settings
# that are used for the training data.
test_df = pd.read_csv(TEST_DATASET_PATH)
test_df['LifeSquare'] = test_df['LifeSquare'].fillna(
    test_df['LifeSquare'].median()
)

# train_test=0 -> only the third returned item (the full feature matrix) is used.
_, _, Test_x, _ = my_feature(
    test_df,
    train_test=0,
    split_par=0.2,
    kitch_size=3,
    room_median=2,
    HouseFloor=1,
    sq_diff=10,
    district_size=district_size,
    med_price_by_district=med_price_by_district,
)

# (district, rooms) pairs without a match in the lookup table come back as NaN
# from the left merge; fill them with the median of the matched values.
Test_x['MedPriceByDistrict'] = Test_x['MedPriceByDistrict'].fillna(
    Test_x['MedPriceByDistrict'].median()
)

In [6]:
# Re-read the training set, fill missing living area with the column median,
# then build features and split them into train / validation parts.
train_df = pd.read_csv(TRAIN_DATASET_PATH)
train_df['LifeSquare'] = train_df['LifeSquare'].fillna(
    train_df['LifeSquare'].median()
)

# train_test=1 -> my_feature returns a split with test_size=split_par.
X_train, y_train, X_test, y_test = my_feature(
    train_df,
    train_test=1,
    split_par=0.2,
    kitch_size=3,
    room_median=2,
    HouseFloor=1,
    sq_diff=10,
    district_size=district_size,
    med_price_by_district=med_price_by_district,
)

In [34]:
# Hyper-parameters of the random forest, kept together for easy tuning.
rf_params = {
    'random_state': 21,
    'max_depth': 9,
    'max_features': 8,
    'n_estimators': 150,
}
rf_model = RandomForestRegressor(**rf_params)
# Last expression of the cell: fit returns the estimator, whose repr is shown.
rf_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=9,
           max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=150, n_jobs=1, oob_score=False, random_state=21,
           verbose=0, warm_start=False)

In [35]:
# Predict prices for the validation part of the split (X_test from my_feature).
y_pred_test = rf_model.predict(X_test)

In [36]:
# R^2 (sklearn.metrics.r2_score, imported as r2) of the predictions against
# the held-out validation targets.
r2(y_test, y_pred_test)

0.7630618801337414

In [39]:
# Predict prices for the competition test-set feature matrix.
final_pred = rf_model.predict(Test_x)

In [68]:
# Label each prediction with the apartment's real Id.
# Test_x.index is only the positional 0..n-1 index left by the merges inside
# my_feature, so using it would write row numbers instead of Ids.
# The left merges keep row order, so predictions line up with test_df row by row.
assert len(final_pred) == len(test_df), "predictions and test rows are misaligned"
final_df = pd.DataFrame(final_pred, columns = ['Price'],
                        index = pd.Index(test_df['Id'].values, name = 'Id'))

In [71]:
# Name the index 'Id' so it becomes the header of the first CSV column.
final_df = final_df.rename_axis('Id')

In [73]:
# Write the submission file; to_csv includes the index as the first column
# by default (pandas default index=True).
final_df.to_csv('Varlamov_predictions.csv')