In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import math
from numpy import nan
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # ingore warnings



In [2]:
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")

In [3]:
train_df["datetime"] = pd.to_datetime(train_df["timestamp"])
train_df["year"] = train_df["datetime"].dt.year
train_df["month"] = train_df["datetime"].dt.month
train_df["day"] = train_df["datetime"].dt.day
test_df["datetime"] = pd.to_datetime(test_df["timestamp"])
test_df["year"] = test_df["datetime"].dt.year
test_df["month"] = test_df["datetime"].dt.month
test_df["day"] = test_df["datetime"].dt.day

In [6]:
temp = train_df["build_year"].copy() 
temp[train_df["build_year"] < 1000] = train_df["year"][train_df["build_year"] < 1000] + train_df["build_year"][train_df["build_year"] < 1000]
temp2 = test_df["build_year"].copy() 
temp2[test_df["build_year"] < 1000] = test_df["year"][test_df["build_year"] < 1000] + test_df["build_year"][test_df["build_year"] < 1000]

In [7]:
train_df["build_year"] = temp
test_df["build_year"] = temp2

In [8]:
train_df["build_age"] = train_df["year"] - train_df["build_year"]
test_df["build_age"] = test_df["year"] - test_df["build_year"]

In [9]:
train_df.drop("build_year", axis=1, inplace=True)

In [10]:
test_df.drop("build_year", axis=1, inplace=True)

In [11]:
train_df = train_df.replace({"no": -1, "yes": 1})
test_df = test_df.replace({"no": -1, "yes": 1})

In [12]:
train_df["product_type"].ix[train_df["product_type"] == "Investment"] = -1
train_df["product_type"].ix[train_df["product_type"] == "OwnerOccupier"] = 1
test_df["product_type"].ix[test_df["product_type"] == "Investment"] = -1
test_df["product_type"].ix[test_df["product_type"] == "OwnerOccupier"] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [13]:
train_df["ecology"].value_counts()

poor            8018
no data         7656
good            7174
excellent       3938
satisfactory    3685
Name: ecology, dtype: int64

In [14]:
train_df["ecology"].ix[train_df["ecology"] == "poor"] = 1
train_df["ecology"].ix[train_df["ecology"] == "good"] = 2
train_df["ecology"].ix[train_df["ecology"] == "satisfactory"] = 3
train_df["ecology"].ix[train_df["ecology"] == "excellent"] = 4
train_df["ecology"].ix[train_df["ecology"] == "no data"] = nan
test_df["ecology"].ix[test_df["ecology"] == "poor"] = 1
test_df["ecology"].ix[test_df["ecology"] == "good"] = 2
test_df["ecology"].ix[test_df["ecology"] == "satisfactory"] = 3
test_df["ecology"].ix[test_df["ecology"] == "excellent"] = 4
test_df["ecology"].ix[test_df["ecology"] == "no data"] = nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [15]:
train_df["floor_inverse"] = train_df["max_floor"] - train_df["floor"]
test_df["floor_inverse"] = test_df["max_floor"] - test_df["floor"]
train_df["floor_inverse"].ix[train_df["floor_inverse"] < 0] = nan
test_df["floor_inverse"].ix[test_df["floor_inverse"] < 0] = nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [17]:
y_train = train_df['price_doc'].values
id_test = test_df['id']
train_df.drop(['timestamp'], axis=1, inplace=True)
test_df.drop(['timestamp'], axis=1, inplace=True)
train_df.drop(['datetime'], axis=1, inplace=True)
test_df.drop(['datetime'], axis=1, inplace=True)
train_df.drop("id", axis=1, inplace=True)
train_df.drop("price_doc", axis=1, inplace=True)
test_df.drop("id", axis=1, inplace=True)
train_df.drop("sub_area", axis=1, inplace=True)
test_df.drop("sub_area", axis=1, inplace=True)

In [19]:
train_df.drop(['product_type'], axis=1, inplace=True)
test_df.drop(['product_type'], axis=1, inplace=True)
train_df.drop(['ecology'], axis=1, inplace=True)
test_df.drop(['ecology'], axis=1, inplace=True)

In [20]:
xgb_params = {
    'eta': 0.05,
    'max_depth': 4,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'min_child_weight':1,
    'silent': 1,
    'seed':0
}

dtrain = xgb.DMatrix(train_df, y_train)
dtest = xgb.DMatrix(test_df)

In [21]:
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round= 480)

In [22]:
y_predict = model.predict(dtest)
output = pd.DataFrame({'id': id_test, 'price_doc': y_predict})
output["price_doc"] = output["price_doc"] * 0.96
output.head()

Unnamed: 0,id,price_doc
0,30474,5665372.5
1,30475,7952311.0
2,30476,5419068.0
3,30477,5938990.5
4,30478,5246465.5


In [23]:
output.to_csv('NewStart2.csv', index=False)