In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the raw train and test tables from the shared raw-data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw/train.csv', '../data/raw/test.csv')
)

In [None]:
# Column name -> dtype mapping of the training frame.
train.dtypes.to_dict()

# 0. Data preprocessing + Feature Engineering

Cleaning `build_year` outliers (values are clipped to the 6th–99th percentile range)

In [None]:
# Winsorize build_year to its 6th-99th percentile range.
# The clipped values are assigned back explicitly: calling
# `train['build_year'].clip(..., inplace=True)` mutates a temporary Series
# and does not update `train` under pandas Copy-on-Write.
build_year_low = train['build_year'].quantile(0.06)
build_year_high = train['build_year'].quantile(0.99)
train['build_year'] = train['build_year'].clip(build_year_low, build_year_high)
train['build_year'].hist(bins=100)

Product_type (категориальная -> бинарная)

In [None]:
# Binary-encode product_type identically in both frames.
product_type_codes = {'Investment': 1, 'OwnerOccupier': 0}
for frame in (train, test):
    frame['product_type'] = frame['product_type'].replace(product_type_codes)

новая фича: Price m2

In [None]:
# Price per square metre; the +1 in the denominator guards against full_sq == 0.
price_m2_raw = train['price_doc'] / (train['full_sq'] + 1)

# Clip to the 1st-99th percentile range and assign the result back.
# `train['price_m2'].clip(..., inplace=True)` would mutate a temporary Series
# and leave `train` untouched under pandas Copy-on-Write.
train['price_m2'] = price_m2_raw.clip(
    price_m2_raw.quantile(0.01),
    price_m2_raw.quantile(0.99),
)

target encoding (sub area) // Каково среднее значение price_m2 в каждой из sub_area

In [None]:
def target_encoding(train, test, col_x, col_y):
    """Replace a categorical column with the per-category mean of a target.

    The category means are computed on ``train`` only and then applied to
    both frames. Both frames are modified in place and also returned.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames that both contain ``col_x``; ``train`` must also contain ``col_y``.
    col_x : str
        Name of the categorical column to encode.
    col_y : str
        Name of the numeric column whose per-category mean is the encoding.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        The same ``train`` and ``test`` objects with ``col_x`` encoded.

    Notes
    -----
    Missing values in ``col_x`` stay missing. Categories that do not occur in
    ``train`` are encoded with the overall mean of ``col_y`` instead of being
    left as raw labels/IDs, which would otherwise leak into the model on a
    completely different scale (or as strings).
    """
    category_means = train.groupby(col_x)[col_y].mean()
    global_mean = train[col_y].mean()

    def _encode(values):
        # map() is a plain lookup; anything not in the index becomes NaN.
        encoded = values.map(category_means)
        # Only non-missing categories that train never saw get the fallback.
        unseen = values.notna() & ~values.isin(category_means.index)
        return encoded.mask(unseen, global_mean)

    train[col_x] = _encode(train[col_x])
    test[col_x] = _encode(test[col_x])

    return train, test

In [None]:
# Encode every categorical column with its mean price_m2 (computed on train).
for categorical_col in ('sub_area', 'material', 'state', 'ID_metro'):
    train, test = target_encoding(train, test, categorical_col, 'price_m2')

timestamp encoding

In [None]:
def get_date_time_features(timestamp_series: pd.Series) -> pd.DataFrame:
    """Derive calendar features from a series of timestamps.

    Parameters
    ----------
    timestamp_series : pd.Series
        Anything ``pd.to_datetime`` can parse (strings or datetimes).

    Returns
    -------
    pd.DataFrame
        Frame indexed like the input with the columns ``timestamp`` (parsed
        datetimes), ``month``, ``day_week`` (Monday = 0), ``day_month_datetime``,
        ``year`` and ``kseconds`` (thousands of seconds since the Unix epoch).
    """
    timestamp_series = pd.to_datetime(timestamp_series)

    # Epoch with the same timezone-awareness as the data so the subtraction
    # below is valid for both naive and tz-aware input.
    epoch = pd.Timestamp(0, tz=timestamp_series.dt.tz)

    output = pd.DataFrame(index=timestamp_series.index)
    output['timestamp'] = timestamp_series
    output['month'] = timestamp_series.dt.month.astype(np.int8)
    output['day_week'] = timestamp_series.dt.dayofweek.astype(np.int8)
    output['day_month_datetime'] = timestamp_series.dt.day.astype(np.int8)
    output['year'] = timestamp_series.dt.year.astype(np.int16)
    # Timedelta arithmetic instead of Series.view(int): view() is deprecated,
    # depends on the platform's default int width and on the datetime unit,
    # and turns NaT into a huge negative number instead of NaN.
    output['kseconds'] = (timestamp_series - epoch) / pd.Timedelta(seconds=1000)

    return output

In [None]:
# get_date_time_features() returns its own parsed 'timestamp' column, so the
# original one is dropped first; otherwise each frame ends up with two columns
# named 'timestamp' and `frame['timestamp']` returns a DataFrame.
train = pd.concat(
    [train.drop(columns='timestamp'), get_date_time_features(train['timestamp'])],
    axis=1,
)
test = pd.concat(
    [test.drop(columns='timestamp'), get_date_time_features(test['timestamp'])],
    axis=1,
)

Additional ratio features: relative floor position, and living / kitchen area as a share of total area

In [None]:
# Relative position of the flat within its building (+1 avoids division by zero).
for frame in (train, test):
    frame['floor_height'] = frame['floor'] / (frame['max_floor'] + 1)

In [None]:
# Living and kitchen area as a share of the total area (+1 avoids division by zero).
for frame in (train, test):
    total_area = frame['full_sq'] + 1
    frame['life_part'] = frame['life_sq'] / total_area
    frame['kitch_part'] = frame['kitch_sq'] / total_area

In [None]:
# Columns inspected below. The last entry, 'price_m2', is the modelling target;
# it is listed here so it shows up in the correlation heatmap and NaN checks.
all_x_col = [
    'kseconds',
    'sub_area',
    'product_type',
    'build_year',
    'material',
    'num_room',
    'floor_height',
    'life_part',
    'kitch_part',
    'kremlin_km',
    'metro_km_walk',
    'state',
    'ID_metro',
    'price_m2',
]

Correlation heatmap of the features and the target (`price_m2`)

In [None]:
# Features plus target, used for the correlation heatmap.
XY_train = train.loc[:, all_x_col]

In [None]:
# Pairwise correlations of the selected columns, annotated with their values.
feature_corr = XY_train.corr()
ax = sns.heatmap(feature_corr, annot=True, cmap="YlGnBu")

Check NaNs

In [None]:
# Number of missing values per selected column, before imputation.
train.loc[:, all_x_col].isna().sum()

In [None]:
# Impute missing values with the TRAIN median of each column.
# The same train medians are applied to test so that both frames are
# preprocessed consistently; otherwise test NaNs in these columns are only
# caught by the generic fillna(0) at prediction time (e.g. num_room == 0).
median_imputed_cols = [
    'build_year',
    'material',
    'num_room',
    'floor_height',
    'life_part',
    'kitch_part',
    'metro_km_walk',
    'state',
]
for col in median_imputed_cols:
    train_median = train[col].median()
    train.loc[:, col] = train[col].fillna(train_median)
    test.loc[:, col] = test[col].fillna(train_median)

In [None]:
# Number of missing values per selected column, after imputation.
train.loc[:, all_x_col].isna().sum()

# 1. Cross validation function

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.linear_model import SGDRegressor

In [None]:
def cross_validation(df, x_col, y_col, n_splits=5, random_state=42):
    """Evaluate a default ``SGDRegressor`` with shuffled K-fold cross-validation.

    For every fold a fresh model is fitted on the training part, predictions
    for the validation part are clipped to ``[1, 10**9]`` and MSE / MSLE are
    printed. The averages over all folds are printed at the end.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing the feature columns and the target column.
    x_col : list of str
        Feature column names.
    y_col : str
        Target column name.
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default 42
        Seed for both the fold shuffling and the SGD optimiser, so repeated
        runs give the same scores. Pass ``None`` for non-deterministic runs.

    Returns
    -------
    tuple of (float, float)
        ``(mean MSLE, mean MSE)`` over the folds.
    """
    mse_list = []
    msle_list = []

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for fold, (train_idx, val_idx) in enumerate(kf.split(df), start=1):
        # KFold yields positional indices, so rows must be taken with .iloc;
        # .loc only works by accident when df has a default RangeIndex.
        train_part = df.iloc[train_idx]
        val_part = df.iloc[val_idx]

        model = SGDRegressor(random_state=random_state)
        model.fit(X=train_part[x_col], y=train_part[y_col])
        val_pred = model.predict(val_part[x_col]).clip(1, 10**9)

        # scikit-learn metrics take (y_true, y_pred) in that order.
        mse = mean_squared_error(val_part[y_col], val_pred)
        msle = mean_squared_log_error(val_part[y_col], val_pred)

        mse_list.append(mse)
        msle_list.append(msle)

        print(f'Fold: {fold}, MSLE: {msle}, MSE: {mse}')

    mean_msle = np.mean(msle_list)
    mean_mse = np.mean(mse_list)
    print(f'AV_MSLE: {mean_msle}, AV_MSE: {mean_mse}')

    return mean_msle, mean_mse

# 2. Test 5 models

1) Test 1

In [None]:
# Model 1: three raw (unscaled) numeric features, target is price per m2.
y_col = 'price_m2'
x_col1 = ['metro_km_walk', 'kremlin_km', 'num_room']

cross_validation(train, x_col1, y_col)

# Refit on the whole training set for the submission.
model1 = SGDRegressor().fit(X=train[x_col1], y=train[y_col])
model1

2) Test 2

In [None]:
# Model 2: same features as model 1, with the two distance columns z-scored.
y_col = 'price_m2'
x_col2 = ['metro_km_walk', 'kremlin_km', 'num_room']

# Standardisation overwrites the raw columns of `train` in place.
for col in ('metro_km_walk', 'kremlin_km'):
    raw_values = train[col]
    train.loc[:, col] = (raw_values - raw_values.mean()) / raw_values.std()

cross_validation(train, x_col2, y_col)

# Refit on the whole training set for the submission.
model2 = SGDRegressor().fit(X=train[x_col2], y=train[y_col])
model2

3) Test 3

In [None]:
# Model 3: adds the target-encoded categorical columns and product_type.
y_col = 'price_m2'
x_col3 = [
    'sub_area',
    'product_type',
    'state',
    'ID_metro',
    'material',
    'num_room',
    'kremlin_km',
    'metro_km_walk',
]

# Z-score the target-encoded columns; overwrites them in `train` in place.
for col in ('sub_area', 'state', 'ID_metro', 'material'):
    raw_values = train[col]
    train.loc[:, col] = (raw_values - raw_values.mean()) / raw_values.std()

cross_validation(train, x_col3, y_col)

# Refit on the whole training set for the submission.
model3 = SGDRegressor().fit(X=train[x_col3], y=train[y_col])
model3

4) Test 4

In [None]:
# Model 4: model 3 features plus the time feature and the build year.
y_col = 'price_m2'
x_col4 = [
    'sub_area',
    'product_type',
    'state',
    'ID_metro',
    'material',
    'num_room',
    'kremlin_km',
    'metro_km_walk',
    'kseconds',
    'build_year',
]

# Z-score the two new columns; overwrites them in `train` in place.
for col in ('kseconds', 'build_year'):
    raw_values = train[col]
    train.loc[:, col] = (raw_values - raw_values.mean()) / raw_values.std()

cross_validation(train, x_col4, y_col)

# Refit on the whole training set for the submission.
model4 = SGDRegressor().fit(X=train[x_col4], y=train[y_col])
model4

5) Test 5

In [None]:
# Model 5: same features as model 4, fitted on log1p of the target.
x_col5 = [
    'sub_area',
    'product_type',
    'state',
    'ID_metro',
    'material',
    'num_room',
    'kremlin_km',
    'metro_km_walk',
    'kseconds',
    'build_year',
]

y_col = 'log_price_m2'
train[y_col] = np.log1p(train['price_m2'])

# NOTE: the scores printed here are computed on the log-scale target, so they
# are not on the same scale as the scores of models 1-4.
cross_validation(train, x_col5, y_col)

# Refit on the whole training set for the submission.
model5 = SGDRegressor().fit(X=train[x_col5], y=train[y_col])
model5

# 3. CV + submit

In [None]:
def write_test(test, model, x_col, i):
    """Predict price per m2 for ``test`` and write a submission CSV.

    Parameters
    ----------
    test : pd.DataFrame
        Test data containing the columns in ``x_col`` and ``full_sq``.
    model : fitted estimator
        Object with a ``predict`` method returning price per m2.
    x_col : list of str
        Feature columns passed to ``model.predict``; remaining NaNs are
        replaced with 0 before predicting.
    i : int or str
        Suffix of the output file ``predict_hw_model{i}.csv``.

    The sample submission is read from ``../data/raw/sample_submission.csv``,
    its ``price_doc`` column is overwritten with the predictions converted to
    a total price (clipped to ``[1, 10**9]``), and the result is written to
    the current working directory.
    """
    price_m2_pred = np.asarray(model.predict(test[x_col].fillna(0)))
    sample_submission = pd.read_csv('../data/raw/sample_submission.csv')

    # The target was built as price_doc / (full_sq + 1), so the inverse has to
    # multiply by (full_sq + 1), not by full_sq. Plain arrays are used so rows
    # are matched by position rather than by index label.
    price_doc = price_m2_pred * (test['full_sq'].to_numpy() + 1)

    sample_submission['price_doc'] = np.clip(price_doc, 1, 10**9)
    sample_submission.to_csv(f'predict_hw_model{i}.csv', index=False)

In [None]:
def write_test_logtarget(test, model, x_col, i):
    """Write a submission CSV for a model fitted on ``log1p(price per m2)``.

    Parameters
    ----------
    test : pd.DataFrame
        Test data containing the columns in ``x_col`` and ``full_sq``.
    model : fitted estimator
        Object with a ``predict`` method returning ``log1p`` of price per m2.
    x_col : list of str
        Feature columns passed to ``model.predict``; remaining NaNs are
        replaced with 0 before predicting.
    i : int or str
        Suffix of the output file ``predict_hw_model{i}.csv``.

    The sample submission is read from ``../data/raw/sample_submission.csv``,
    its ``price_doc`` column is overwritten with the back-transformed
    predictions converted to a total price (clipped to ``[1, 10**9]``), and
    the result is written to the current working directory.
    """
    log_price_m2_pred = np.asarray(model.predict(test[x_col].fillna(0)))
    sample_submission = pd.read_csv('../data/raw/sample_submission.csv')

    # expm1 is the exact inverse of the log1p used to build the target.
    price_m2_pred = np.expm1(log_price_m2_pred)

    # The target was built as price_doc / (full_sq + 1), so the inverse has to
    # multiply by (full_sq + 1), not by full_sq. Plain arrays are used so rows
    # are matched by position rather than by index label.
    price_doc = price_m2_pred * (test['full_sq'].to_numpy() + 1)

    sample_submission['price_doc'] = np.clip(price_doc, 1, 10**9)
    sample_submission.to_csv(f'predict_hw_model{i}.csv', index=False)

In [None]:
# Z-score the test columns that were z-scored in train.
# NOTE: the mean and std used here are those of the TEST data, not the train
# statistics the models were fitted with.
scaled_cols = (
    'metro_km_walk',
    'kremlin_km',
    'sub_area',
    'state',
    'ID_metro',
    'material',
    'kseconds',
    'build_year',
)
for col in scaled_cols:
    raw_values = test[col]
    test.loc[:, col] = (raw_values - raw_values.mean()) / raw_values.std()

In [None]:
# One submission file per model.
# NOTE: model1 was fitted on the unscaled metro_km_walk / kremlin_km columns,
# but `test` has already been z-scored at this point, so model1 predicts on
# features that are on a different scale than the ones it was trained on.
price_models = [
    (model1, x_col1),
    (model2, x_col2),
    (model3, x_col3),
    (model4, x_col4),
]
for model_no, (fitted_model, feature_cols) in enumerate(price_models, start=1):
    write_test(test, fitted_model, feature_cols, model_no)

# Model 5 predicts log1p(price_m2) and needs the back-transforming writer.
write_test_logtarget(test, model5, x_col5, 5)

# 4. Plot submit_score(CV_score)

In [None]:
# Scores copied by hand from earlier runs: cross-validation MSLE per model and
# the corresponding leaderboard score of each submission.
models = [str(model_no) for model_no in range(1, 6)]
CV_scores = [
    0.28854592971311366,
    0.3465958289811607,
    0.22271291419476152,
    0.21919826596823824,
    0.21129017297138314,
]
submit_scores = [
    0.37263,
    0.36283,
    0.33540,
    0.33954,
    0.35458,
]

In [None]:
# One row per model with its CV score and its submission score.
df = pd.DataFrame({'model': models, 'CV': CV_scores, 'submit': submit_scores})

In [None]:
# CV score and submission score per model on one set of axes.
score_columns = ['CV', 'submit']
df.plot(x='model', y=score_columns)

# 5. Insights

1) Категориальные переменные сильно снизили ошибку<br>
2) Из-за использования DateTime переменных модель переобучается<br>
3) Логарифмирование таргета ухудшило результат на сабмите