In [5]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt



In [6]:
train_df = pd.read_csv("./train.csv")

In [7]:
train_df["datetime"] = pd.to_datetime(train_df["timestamp"])

In [8]:
train_df["year"] = train_df["datetime"].dt.year
train_df["month"] = train_df["datetime"].dt.month
train_df["day"] = train_df["datetime"].dt.day

In [9]:
train_df["year"].value_counts()

2014    13662
2013     7978
2012     4839
2015     3239
2011      753
Name: year, dtype: int64

In [10]:
temp = train_df['build_year'].copy()
temp[train_df["build_year"] > 1000] = train_df["year"][train_df["build_year"] > 1000] - train_df["build_year"][train_df["build_year"] > 1000]

In [11]:
train_df["build_year"] = temp

In [12]:
train_df["build_year"]

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
5         NaN
6         NaN
7         NaN
8         NaN
9         NaN
10        NaN
11        NaN
12        NaN
13        NaN
14        NaN
15        NaN
16        NaN
17        NaN
18        NaN
19        NaN
20        NaN
21        NaN
22        NaN
23        NaN
24        NaN
25        NaN
26        NaN
27        NaN
28        NaN
29        NaN
         ... 
30441     0.0
30442     0.0
30443     NaN
30444    -1.0
30445    -1.0
30446    -2.0
30447     NaN
30448     NaN
30449     0.0
30450     NaN
30451     0.0
30452     9.0
30453     0.0
30454    43.0
30455     NaN
30456    23.0
30457    36.0
30458     0.0
30459     NaN
30460    36.0
30461    35.0
30462    -1.0
30463    14.0
30464    -2.0
30465     NaN
30466    40.0
30467    80.0
30468     NaN
30469    12.0
30470    47.0
Name: build_year, Length: 30471, dtype: float64

In [13]:
train_df["product_type"] = train_df["product_type"].replace("Investment", -1)
train_df["product_type"] = train_df["product_type"].replace("OwnerOccupier", 1)
train_df["product_type"].value_counts()

-1    19448
 1    11023
Name: product_type, dtype: int64

In [14]:
test_df = pd.read_csv("./test.csv")
test_df["datetime"] = pd.to_datetime(test_df["timestamp"])
test_df["year"] = test_df["datetime"].dt.year
test_df["month"] = test_df["datetime"].dt.month
test_df["day"] = test_df["datetime"].dt.day
test_df["year"].value_counts()
temp2 = test_df['build_year'].copy()
temp2[test_df["build_year"] > 1000] = test_df["year"][test_df["build_year"] > 1000] - test_df["build_year"][test_df["build_year"] > 1000]
test_df["build_year"] = temp2
test_df["build_year"]
test_df["product_type"] = test_df["product_type"].replace("Investment", -1)
test_df["product_type"] = test_df["product_type"].replace("OwnerOccupier", 1)
test_df["product_type"].value_counts()

-1.0    4998
 1.0    2631
Name: product_type, dtype: int64

In [15]:
train_df = train_df.replace({"no": -1, "yes": 1})
test_df = test_df.replace({"no": -1, "yes": 1})

In [17]:
y_train = train_df['price_doc'].values
ylog_train_all = np.log1p(train_df['price_doc'].values)
id_test = test_df['id']

In [18]:
train_df.drop(['timestamp'], axis=1, inplace=True)

In [19]:
train_df.drop(['year'], axis=1, inplace=True)
train_df.drop(['month'], axis=1, inplace=True)
train_df.drop(['day'], axis=1, inplace=True)

In [20]:
test_df.drop(['timestamp'], axis=1, inplace=True)
test_df.drop(['year'], axis=1, inplace=True)
test_df.drop(['month'], axis=1, inplace=True)
test_df.drop(['day'], axis=1, inplace=True)

In [21]:
train_df.drop(['datetime'], axis=1, inplace=True)
test_df.drop(['datetime'], axis=1, inplace=True)

In [22]:
train_df["ecology"] = train_df["ecology"].replace("poor", -1)
train_df["ecology"] = train_df["ecology"].replace("no data", 0)
train_df["ecology"] = train_df["ecology"].replace("satisfactory", 1)
train_df["ecology"] = train_df["ecology"].replace("good", 2)
train_df["ecology"] = train_df["ecology"].replace("excellent", 3)
test_df["ecology"] = test_df["ecology"].replace("poor", -1)
test_df["ecology"] = test_df["ecology"].replace("no data", 0)
test_df["ecology"] = test_df["ecology"].replace("satisfactory", 1)
test_df["ecology"] = test_df["ecology"].replace("good", 2)
test_df["ecology"] = test_df["ecology"].replace("excellent", 3)

In [23]:
train_df.drop(['id', 'price_doc'], axis=1, inplace=True)
test_df.drop(['id'], axis=1, inplace=True)

# Build df_all = (df_train+df_test).join(df_macro)
num_train = len(train_df)
df_all = pd.concat([train_df, test_df])
print(df_all.shape)

(38133, 289)


In [24]:
# Convert to numpy values
factorize = lambda t: pd.factorize(t[1])[0]

df_obj = df_all.select_dtypes(include=['object'])

X_all = np.c_[
    df_all.select_dtypes(exclude=['object']).values,
    np.array(list(map(factorize, df_obj.iteritems()))).T
]
print(X_all.shape)

X_train = X_all[:num_train]
X_test = X_all[num_train:]

df_columns = df_all.columns

(38133, 289)


In [46]:
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 1.0,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}
dtrain = xgb.DMatrix(X_train, ylog_train_all, feature_names=df_columns)
dtest = xgb.DMatrix(X_test, feature_names=df_columns)

In [47]:
num_train = len(train_df)

In [48]:
num_val = int(num_train * 0.2)
X_val = X_all[-num_val:]
ylog_val = ylog_train_all[-num_val:]
X_train_train = X_all[:num_train-num_val]
ylog_train_train = ylog_train_all[:num_train-num_val]

In [49]:
dval = xgb.DMatrix(X_val, ylog_val, feature_names=df_columns)
dtrain_train = xgb.DMatrix(X_train_train, ylog_train_train, feature_names=df_columns)

In [50]:
partial_model = xgb.train(xgb_params, dtrain_train, num_boost_round=1000, evals=[(dval, 'val')],
                       early_stopping_rounds=20, verbose_eval=20)

num_boost_round = partial_model.best_iteration

[0]	val-rmse:14.4771
Will train until val-rmse hasn't improved in 20 rounds.
[20]	val-rmse:5.28766
[40]	val-rmse:2.04417
[60]	val-rmse:0.997515
[80]	val-rmse:0.743114
[100]	val-rmse:0.697207
[120]	val-rmse:0.690491
[140]	val-rmse:0.690864
Stopping. Best iteration:
[120]	val-rmse:0.690491



In [54]:
import math

In [55]:
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_round)
y_pred = model.predict(dtest)

In [62]:
for x in y_pred:
    x = math.exp(x)

In [65]:
y_pred2 = [math.exp(y_pred[i]) for i in range(y_pred.size)]

In [66]:
y_pred2

[5023123.798043842,
 7459442.415800787,
 4859009.3543700185,
 5546607.368489687,
 4940142.522121828,
 8321014.168511533,
 4297599.632108161,
 4275345.161938714,
 4366855.060824825,
 4268736.914961205,
 5901129.7474783845,
 4916848.4755081395,
 3582620.903454564,
 3874800.92271073,
 5800285.297750475,
 5993978.990286223,
 19555531.94505913,
 16423306.896399228,
 5336353.843504604,
 12945945.164555626,
 6587814.62195697,
 8408419.725799898,
 7928133.183588597,
 7825685.885627767,
 4319475.590516166,
 6333947.222185548,
 10174887.136763968,
 6090281.0045183785,
 3273375.5313154804,
 5678693.738514561,
 5195426.41452144,
 11001592.220731132,
 3325234.4574436825,
 4805478.877206648,
 5575330.265212289,
 6987104.525957837,
 8493073.484852212,
 5946953.475971885,
 5976751.988545726,
 4310911.676213666,
 3331563.827837843,
 9531368.618871102,
 5703421.0034829285,
 9788602.714849249,
 6752509.466279797,
 4344523.694474736,
 6203232.784511289,
 5481391.472231967,
 7096214.975205124,
 5589747.443

In [67]:
df_sub = pd.DataFrame({'id': id_test, 'price_doc': y_pred2})
df_sub.to_csv('TryAgain3.csv', index=False)