In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from feature_selector import FeatureSelector
import matplotlib.pyplot as plt

In [2]:
# Macro-economic indicator columns that are read from macro.csv
# (together with the 'timestamp' key).
macro_cols = [
    "micex",
    "oil_urals",
    "income_per_cap",
    "net_capital_export",
]

# Alternative, wider indicator set (currently disabled):
# macro_cols = [
#     "balance_trade", "balance_trade_growth", "eurrub",
#     "average_provision_of_build_contract", "micex_rgbi_tr", "micex_cbi_tr",
#     "deposits_rate", "mortgage_value", "mortgage_rate", "income_per_cap",
#     "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build",
# ]

In [3]:
# Load the raw train/test tables. From macro.csv only the 'timestamp' key
# and the indicators listed in macro_cols are read.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_macro = pd.read_csv('macro.csv', usecols=['timestamp', *macro_cols])

In [4]:
# Frame holding the test-set ids (indexed like df_test).
answer = pd.DataFrame({'id': df_test['id']})
# Target column of the training data.
target = df_train['price_doc']

In [5]:
# Remove the identifier (and, for train, the target) from the feature frames.
df_train = df_train.drop(columns=['id', 'price_doc'])
df_test = df_test.drop(columns=['id'])

In [6]:
# Display the feature-matrix shape next to the target shape.
(df_train.shape, target.shape)

((30471, 290), (30471,))

### Обработка

In [7]:
######################
## Erroneous values ##
######################

def _replace_with_median(df, column, is_invalid):
    """Overwrite flagged entries of ``df[column]`` with the column median.

    Args:
        df: DataFrame to fix; it is modified in place.
        column: Name of the column to clean.
        is_invalid: Callable that receives the column Series and returns a
            boolean mask marking the rows to overwrite.

    The median is computed over the non-missing values of the column as they
    are at call time, so consecutive rules on the same column see the result
    of the earlier ones.
    """
    df.loc[is_invalid(df[column]), column] = df[column].median()


# (column, predicate) pairs; applied in this order to train and test alike.
_OUTLIER_RULES = [
    ('life_sq',    lambda s: s > 1000),
    ('kitch_sq',   lambda s: s > 250),
    ('num_room',   lambda s: s > 6),
    ('build_year', lambda s: s > 2017),
    ('build_year', lambda s: s < 1800),
    ('floor',      lambda s: s > 50),
    ('max_floor',  lambda s: s > 60),
]

# state == 33 is rewritten to 3 (train only) -- presumably a typo; TODO confirm.
df_train.loc[df_train['state'] == 33, 'state'] = 3

for _frame in (df_train, df_test):
    # Each frame uses its own medians.
    for _column, _is_invalid in _OUTLIER_RULES:
        _replace_with_median(_frame, _column, _is_invalid)
    # A total area of exactly 0 is replaced by the fixed value 50.
    _frame.loc[_frame['full_sq'] == 0, 'full_sq'] = 50

In [8]:
# Combine train and test, then attach the macro indicators by timestamp.
# A plain left merge keeps the rows in their concatenated order (train rows
# first, then test rows). pd.merge_ordered would sort the result by
# 'timestamp', which is unsafe because later cells split df_full back into
# train/test by row position.
df_full = pd.concat([df_train, df_test], sort=False)
# validate='many_to_one' raises if macro.csv has duplicate timestamps, which
# would otherwise silently duplicate rows and shift the positional split.
df_full = df_full.merge(df_macro, on='timestamp', how='left', validate='many_to_one')

In [9]:
# Split the 'YYYY-MM-DD'-style timestamp string on '-' into three new columns.
# The parts are kept as strings, exactly as str.split returns them.
for _position, _part_name in enumerate(('year', 'month', 'day')):
    df_full[_part_name] = df_full['timestamp'].map(
        lambda stamp, i=_position: stamp.split('-')[i]
    )

In [10]:
# Remove the 'timestamp' column from df_full.
df_full = df_full.drop(columns='timestamp')

In [11]:
###########
# numeric #
###########
# Fill missing values in every numeric column with that column's mean.
# The previous selection called .isnull().any() on the column *labels*, which
# is a single False, so no column was selected and nothing was mean-filled.
numeric_cols = df_full.select_dtypes(include='number').columns
# Assign the result back instead of using fillna(inplace=True) on a column
# selection, which operates on a temporary object.
df_full[numeric_cols] = df_full[numeric_cols].fillna(df_full[numeric_cols].mean())

In [12]:
###############
# categorical #
###############
# Fill every column that still contains missing values with its most frequent
# value (the first entry of value_counts()).
for col in df_full.columns[df_full.isnull().any()]:
    most_frequent = df_full[col].value_counts().index[0]
    # Assign back explicitly: fillna(inplace=True) on df_full[col] works on a
    # temporary Series and is not guaranteed to update df_full.
    df_full[col] = df_full[col].fillna(most_frequent)

### Корреляция 

In [13]:
# Flag features of df_train whose correlation magnitude exceeds 0.7.
# No labels are passed to the selector.
fs = FeatureSelector(data=df_train, labels=None)
fs.identify_collinear(correlation_threshold=0.7)

# Feature names recorded by the selector under the 'collinear' key.
collinear_features = fs.ops['collinear']

No labels provided. Feature importance based methods are not available.
216 features with a correlation magnitude greater than 0.70.



In [14]:
# Drop all flagged collinear features in one call. Dropping them one by one
# copies the whole frame once per feature.
df_full = df_full.drop(columns=collinear_features)

### Кодирование

In [15]:
# Encode the remaining non-numeric columns as indicator (dummy) columns.
df_full = pd.get_dummies(data=df_full)

In [16]:
# Split df_full back by row position: the first len(df_train) rows are the
# training part, the remainder is the test part.
_n_train_rows = df_train.shape[0]
df_train, df_test = df_full.iloc[:_n_train_rows, :], df_full.iloc[_n_train_rows:, :]

###  Сохранение

In [17]:
# Write the processed frames to CSV (UTF-8, without the index column).
for _frame, _path in ((df_test, 'df_test.csv'), (df_train, 'df_train.csv')):
    _frame.to_csv(_path, encoding='utf-8', index=False)