In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from copy import deepcopy
import numpy as np
import statsmodels.api as sm
import statistics as sts

In [2]:
df = pd.read_csv('train.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


## NaN preprocessing

In [3]:
df.isnull().sum().iloc[np.where(df.isnull().sum() != 0)]

floor                             176237
osm_city_nearest_population           55
reform_house_population_1000       14596
reform_house_population_500        27234
reform_mean_floor_count_1000       16708
reform_mean_floor_count_500        30168
reform_mean_year_building_1000     16239
reform_mean_year_building_500      29637
street                              1606
dtype: int64

#### Floor

In [4]:
def determine_hight_category(x):
    '''
    1 - floors from 1 to 5
    2 - floors from 6 to 10
    3 - floors from 10 to 30
    4 - floors from 30 to 90
    '''
    try:
        x = float(x)
    except:
        return 1
    else:
        if 1 <= x < 5:
            return 1
        elif 5 <= x < 10:
            return 2
        elif 10 <= x < 30:
            return 3
        else:
            return 4
        
def is_wholesale(x):
    x = str(x)
    if len(x.split(',')) >= 2:
        return 1
    return 0

In [5]:
df['wholesale'] = df['floor'].apply(is_wholesale)
df['floor_hight_category'] = df['floor'].apply(determine_hight_category)
df.drop(['floor'], inplace=True, axis=1)

#### osm_city_nearest_population

In [6]:
def determine_osm_city_nearest_population(observation_string):
    if pd.isnull(observation_string.osm_city_nearest_population):
        observation_region = observation_string.region
        value = aggregation_osm_city_nearest_population[aggregation_osm_city_nearest_population.region == observation_region]. \
                values[0]
        observation_string.osm_city_nearest_population = value
    return observation_string

In [7]:
aggregation_osm_city_nearest_population = df.groupby(by=['region']). \
                                          aggregate({'osm_city_nearest_population': np.mean}). \
                                          osm_city_nearest_population. \
                                          reset_index()

In [8]:
df = df.apply(determine_osm_city_nearest_population, axis=1)

#### reform_house_population_1000 && reform_house_population_500

In [9]:
aggregation_reform_house_population = df.groupby(by=['city']). \
                                      aggregate({'reform_house_population_1000': np.mean,
                                                 'reform_house_population_500': np.mean})

In [10]:
data_for_reform_house_population_500 = aggregation_reform_house_population.reform_house_population_500.reset_index()
data_for_reform_house_population_1000 = aggregation_reform_house_population.reform_house_population_1000.reset_index()

In [11]:
def determine_reform_house_population(x, r):
    if r == 500:
        data = data_for_reform_house_population_500
    else:
        data = data_for_reform_house_population_1000
    value = data[data.city == x]['reform_house_population_{}'.format(r)].values[0]
    if pd.isnull(value):
        value = sts.mode(data['reform_house_population_{}'.format(r)])
    return value

In [None]:
df['reform_house_population_500'] = df['city'].apply(lambda x: determine_reform_house_population(x, 500))
df['reform_house_population_1000'] = df['city'].apply(lambda x: determine_reform_house_population(x, 1000))

#### reform_mean_floor_count_1000 && reform_mean_floor_count_500

In [None]:
aggregation_reform_mean_floor_count = df.groupby(by=['city']). \
                                      aggregate({'reform_mean_floor_count_1000': np.mean,
                                                 'reform_mean_floor_count_500': np.mean})

In [None]:
data_for_reform_mean_floor_count_500 = aggregation_reform_mean_floor_count.reform_mean_floor_count_500.reset_index()
data_for_reform_mean_floor_count_1000 = aggregation_reform_mean_floor_count.reform_mean_floor_count_1000.reset_index()

In [None]:
def determine_mean_floor_count(x, r):
    if r == 500:
        data = data_for_reform_mean_floor_count_500
    else:
        data = data_for_reform_mean_floor_count_1000
    value = data[data.city == x]['reform_mean_floor_count_{}'.format(r)].values[0]
    if pd.isnull(value):
        value = sts.mode(data['reform_mean_floor_count_{}'.format(r)])
    return value

In [None]:
df['reform_mean_floor_count_500'] = df['city'].apply(lambda x: determine_mean_floor_count(x, 500))
df['reform_mean_floor_count_1000'] = df['city'].apply(lambda x: determine_mean_floor_count(x, 1000))

#### reform_mean_year_building_1000 && reform_mean_year_building_500

In [9]:
aggregation = df.groupby(by=['region']). \
              aggregate({'reform_mean_year_building_1000': np.mean, 'reform_mean_year_building_500': np.mean})

#### street

In [None]:
def fill_street(x):
    if x.count()<=0:
        return np.nan
    return x.value_counts().index[0]

df['street'] = df.groupby('city')['street'].transform(fill_street)
df['street'] = df['street'].fillna(df['street'].value_counts().idxmax())

In [None]:
# check for result
df.isnull().sum().iloc[np.where(df.isnull().sum() != 0)]

## Feature Selection

### Stepwize selection

In [80]:
target = 'per_square_meter_price'

categorial_features = [
    'city',
    'floor',
    'id',
    'osm_city_nearest_name',
    'region',
    'street',
    'realty_type',
    'price_type'
]

drop_columns = categorial_features + ['date', 'per_square_meter_price']

x_columns = list(set(df.columns) - set(drop_columns))

In [84]:
def stepwize_regression(df):
    
    df_temp = deepcopy(df)
    df_temp.dropna(inplace=True)  # should be preprocessed before this function
    p_value_threshold = 0.05  # if greater, than should be dropped
    
    current_columns = []
    for column in x_columns:
        current_columns.append(column)
        X = df_temp[current_columns].astype(float)
        y = list(df_temp[target])
        results = sm.OLS(y, X).fit()
        appropriate_features = results.pvalues.iloc[np.where(results.pvalues <= p_value_threshold)]
        
    return appropriate_features

In [86]:
appropriate_features = stepwize_regression(df)

In [87]:
len(appropriate_features)

53