In [90]:
# imports 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Default seaborn colour palette, kept for plotting.
# NOTE(review): `color` is not referenced by any of the cells visible here.
color = sns.color_palette()

# IPython magic: render matplotlib figures inline, below the cell that draws them.
%matplotlib inline

In [91]:
# load the data
#
# `timestamp` is parsed as a date in BOTH frames so that the column has one
# consistent dtype once train and test are concatenated (previously train held
# Timestamp objects while test held raw strings).

X_train = pd.read_csv("data/russian_house_market/train.csv", parse_dates=['timestamp'])
X_test = pd.read_csv("data/russian_house_market/test.csv", parse_dates=['timestamp'])
currency_convertor = pd.read_csv("data/russian_house_market/currency.csv")

In [92]:
# transform the currency df to a dictionary for easier access

# Move "date" into the index only while it is still a regular column, so that
# re-running this cell does not fail with KeyError on the second execution.
if "date" in currency_convertor.columns:
    currency_convertor = currency_convertor.set_index("date")

# Nested mapping {column name -> {date -> value}}; the helpers below read
# currency_convertor_dict["currency"][date].
currency_convertor_dict = currency_convertor.to_dict()

In [93]:
# currency conversion

def month_year(x):
    """Return the month and year of ``x`` formatted as ``"MM-YYYY"``.

    Parameters
    ----------
    x : str or date-like
        Either a string whose first two ``-``-separated fields are the year
        and the month (e.g. ``"2015-07-01"``), or any object that provides
        ``strftime`` (``datetime.date``, ``pandas.Timestamp``, ...).

    Returns
    -------
    str
        The month followed by the year, joined with ``-``.

    Raises
    ------
    IndexError
        If ``x`` is a string that contains no ``-`` separator.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which have no strftime method.
    if isinstance(x, str):
        parts = x.split("-")
        return parts[1] + "-" + parts[0]
    return x.strftime("%m-%Y")

def currency_convert(date, price):
    """Multiply ``price`` by the exchange rate stored for ``date``.

    ``date`` is the key looked up in ``currency_convertor_dict["currency"]``;
    the product of ``price`` and that rate is returned.
    """
    rates_by_date = currency_convertor_dict["currency"]
    return price * rates_by_date[date]
    
# Stack train and test so that every later transformation is applied to both.
# Test rows carry no price_doc, so they hold NaN in that column after the concat.
X_all = pd.concat(objs = [X_train, X_test], axis=0)
assert isinstance(X_all, pd.DataFrame)

# Reduce every timestamp to its "MM-YYYY" key, the format used to look up rates.
X_all["timestamp"] = X_all["timestamp"].apply(month_year)
df_currency_conversion = X_all[["timestamp", "price_doc"]].copy()

# Fail loudly (as the per-row dictionary lookup did) when a month has no rate.
rate_by_month = currency_convertor_dict["currency"]
unknown_months = sorted(set(df_currency_conversion["timestamp"]) - set(rate_by_month))
if unknown_months:
    raise KeyError("no exchange rate for month(s): {}".format(unknown_months))

# Vectorised conversion. The previous row-wise apply returned a two-column
# DataFrame and assigned it to a single column, which current pandas rejects
# with a ValueError. `.values` keeps the multiplication positional because the
# concatenated index contains duplicate labels.
df_currency_conversion["price_doc"] = (
    df_currency_conversion["price_doc"]
    * df_currency_conversion["timestamp"].map(rate_by_month).values
)
X_all["price_doc"] = df_currency_conversion["price_doc"]

In [96]:
# make a copy of test timestamp

# The test rows are everything after the first len(X_train) rows of X_all;
# keep their timestamps as a one-column DataFrame.
test_timestamp = X_all["timestamp"].iloc[len(X_train):].to_frame()

In [97]:
# fill the na and get dummy variables from categorical data

# numeric_only=True restricts the column means to numeric columns; without it
# recent pandas raises TypeError as soon as the frame contains string columns.
# Non-numeric columns are therefore left unfilled here and are one-hot encoded
# by get_dummies below.
X_all = X_all.fillna(X_all.mean(numeric_only=True))
X_all = pd.get_dummies(X_all)

In [98]:
# feature scaling

from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler

# Keep the id column; it is needed again when the submission file is written.
ids = X_all["id"]

# A single mapper entry that sends every column of X_all through StandardScaler.
mapper = DataFrameMapper(
    [
        (X_all.columns, StandardScaler()),
    ]
)
scaled_features = mapper.fit_transform(X_all.copy())

# Re-attach the original row index and column labels to the scaled array.
# NOTE(review): none of the later cells visible here read scaled_features_df;
# the train/test split below is taken from the unscaled X_all.
scaled_features_df = pd.DataFrame(
    data=scaled_features,
    columns=X_all.columns,
    index=X_all.index,
)
scaled_features_df.head()

Unnamed: 0,0_13_all,0_13_female,0_13_male,0_17_all,0_17_female,0_17_male,0_6_all,0_6_female,0_6_male,16_29_all,...,timestamp_11-2013,timestamp_11-2014,timestamp_11-2015,timestamp_12-2011,timestamp_12-2012,timestamp_12-2013,timestamp_12-2014,timestamp_12-2015,water_1line_no,water_1line_yes
0,1.201285,1.162229,1.235537,1.188467,1.149805,1.222768,1.153571,1.164018,1.141788,-0.223144,...,-0.156883,-0.18379,-0.139117,-0.079417,-0.121195,-0.155827,-0.216344,-0.155208,0.286628,-0.286628
1,0.523683,0.554055,0.494024,0.54843,0.565753,0.531123,0.445573,0.478837,0.413604,-0.26233,...,-0.156883,-0.18379,-0.139117,-0.079417,-0.121195,-0.155827,-0.216344,-0.155208,0.286628,-0.286628
2,0.182887,0.138965,0.223825,0.243104,0.199688,0.283535,0.1827,0.13706,0.225079,-0.191497,...,-0.156883,-0.18379,-0.139117,-0.079417,-0.121195,-0.155827,-0.216344,-0.155208,0.286628,-0.286628
3,2.065314,2.071514,2.055203,2.105747,2.095231,2.111937,2.075597,2.121536,2.029025,-0.460802,...,-0.156883,-0.18379,-0.139117,-0.079417,-0.121195,-0.155827,-0.216344,-0.155208,0.286628,-0.286628
4,0.235031,0.159379,0.305712,0.281378,0.213312,0.344959,0.137269,0.10451,0.167657,-0.429507,...,-0.156883,-0.18379,-0.139117,-0.079417,-0.121195,-0.155827,-0.216344,-0.155208,0.286628,-0.286628


In [99]:
# split the data

# The first `length` rows of X_all came from the training file, the rest from test.
length = len(X_train)
X_train = pd.DataFrame(X_all.iloc[:length])
X_test = pd.DataFrame(X_all.iloc[length:])

# Target vector, and the training features with the target column removed.
y = X_train["price_doc"]
X_train_transformed = X_train.drop("price_doc", axis=1).copy()

In [100]:
# Plain NumPy views of the training features and target.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0, and is
# available on every pandas version.
X_matrix = X_train_transformed.values
y_matrix = y.values

In [101]:
# extreme gradient boosting

from xgboost import XGBRegressor

# Regressor with the library's default hyper-parameters, fitted on the
# training features against the price target. The fit call is the last
# expression so the fitted estimator is shown as the cell output.
xgb = XGBRegressor()
xgb.fit(X=X_train_transformed, y=y)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [102]:
# drop the price_doc from test when we merged
# errors="ignore" plus reassignment keeps this cell re-runnable: a second
# execution no longer raises KeyError because the column is already gone.
X_test = X_test.drop("price_doc", axis=1, errors="ignore")

In [103]:
# Predict with the fitted regressor on the test features (price_doc was dropped above).
xgb_predictions = xgb.predict(X_test)

In [113]:
def currency_back_to_rubles(date, price):
    """Undo ``currency_convert``: divide ``price`` by the rate stored for ``date``.

    Parameters
    ----------
    date
        Key looked up in ``currency_convertor_dict["currency"]``.
    price
        Amount previously multiplied by that exchange rate.

    Returns
    -------
    ``price`` divided by the exchange rate.
    """
    exchange_rate = currency_convertor_dict["currency"][date]
    # True division is the exact inverse of the multiplication performed in
    # currency_convert. Floor division silently rounds every value down and,
    # with floating-point rates, can be off by a whole unit (1.0 // 0.1 == 9.0).
    final_price = price / exchange_rate
    return final_price

# Pair each test row's "MM-YYYY" timestamp with its predicted price.
currency_test = pd.DataFrame(index=test_timestamp.index)
currency_test["timestamp"] = test_timestamp["timestamp"]
currency_test["price"] = xgb_predictions

# Convert row by row through the shared helper. The earlier row-wise apply
# built a two-column DataFrame and assigned it to the single "price" column,
# which current pandas rejects with a ValueError.
currency_test["price"] = [
    currency_back_to_rubles(date, price)
    for date, price in zip(currency_test["timestamp"], currency_test["price"])
]

currency_test.head()

Unnamed: 0,timestamp,price
0,07-2015,6719880.0
1,07-2015,9353380.0
2,07-2015,6409460.0
3,07-2015,5944840.0
4,07-2015,5691000.0


In [114]:
# Submission frame: ids of the test rows (everything after the first `length`
# rows) next to the converted predictions, written without the index column.
xgb_results = pd.DataFrame({"id": ids[length:]}).assign(
    price_doc=currency_test["price"].values
)
xgb_results.to_csv("data/russian_house_market/xgb_conversion.csv", index=False)

# 0.3 score, which is worse than without the conversion to USD