In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse

cd ..

train = pd.read_csv('data/train_clean.csv')
test = pd.read_csv('data/test_clean.csv')

col = list(test.columns)[2:]
label = np.log1p(train['price_doc'])
dtrain = xgb.DMatrix(train[col],label)

params = {
   'eta': 0.05, ## Try 0.01,3,5
   'max_depth': 5,## Try 4,5,6
   'subsample': 0.7,
   'colsample_bytree': 0.7,
   'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
     verbose_eval=50, show_stdv=True,seed=42)

kf = KFold(n_splits=5,shuffle=False)
col = list(test.columns)[2:]
train_array = train[col].as_matrix()
label = np.log1p(train['price_doc']).as_matrix()
accuracy = []

for train_index, test_index in kf.split(train_array):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = train_array[train_index,:], train_array[test_index,:]
    y_train, y_test = label[train_index], label[test_index]
    dtrain = xgb.DMatrix(X_train,y_train)
    dtest = xgb.DMatrix(X_test)
    model = xgb.train(params,dtrain,num_boost_round=len(xgb_cvalid))
    print('training done')
    pred = model.predict(dtest)
    RMSLE = mse(y_test, pred)**0.5
    print('RMSLE: '+str(RMSLE))
    accuracy.append(RMSLE)
    
accuracy

print accuracy
print "mean\tstd"
print np.array(accuracy).mean(), np.array(accuracy).std()

In [33]:
train = pd.read_csv('data/train_clean_subsample_avg_price.csv')
test = pd.read_csv('data/test_clean_subsample_avg_price.csv')


# Use avg as feature

In [35]:
drop_col = ['moscow_avg_price_avg', 'moscow_avg_price_high', 'moscow_avg_price_low', 'sub_area_avg_price']
drop_col = ["{}_{}".format(col, currency) for currency in ['usd', 'eur'] for col in drop_col]
drop_col
train = train.drop(drop_col, axis=1)
test = test.drop(drop_col, axis=1)

['moscow_avg_price_avg_usd',
 'moscow_avg_price_high_usd',
 'moscow_avg_price_low_usd',
 'sub_area_avg_price_usd',
 'moscow_avg_price_avg_eur',
 'moscow_avg_price_high_eur',
 'moscow_avg_price_low_eur',
 'sub_area_avg_price_eur']

In [None]:
def get_avg_price_features(df):
    df['avg_subarea_moscow_avg_diff'] = df['sub_area_avg_price_rub'] - df['moscow_avg_price_avg_rub']
    df['avg_subarea_moscow_avg_ratio'] = (df['sub_area_avg_price_rub'] - df['moscow_avg_price_avg_rub'])/df['moscow_avg_price_avg_rub']

In [34]:
train.head().T

Unnamed: 0,0,1,2,3,4
id,1,2,3,4,5
timestamp,2011-08-20,2011-08-23,2011-08-27,2011-09-01,2011-09-05
full_sq,43,34,43,89,77
life_sq,27,19,29,50,77
floor,4,3,2,9,4
max_floor,,,,,
material,,,,,
build_year,,,,,
num_room,,,,,
kitch_sq,,,,,


In [22]:
params = {
   'eta': 0.05, ## Try 0.01,3,5
   'max_depth': 5,## Try 4,5,6
   'subsample': 0.7,
   'colsample_bytree': 0.7,
   'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
     verbose_eval=50, show_stdv=True,seed=42)

[0]	train-rmse:14.4049+0.00178946	test-rmse:14.405+0.00341215
[50]	train-rmse:1.16136+0.000284672	test-rmse:1.16777+0.0031281
[100]	train-rmse:0.307942+0.00119694	test-rmse:0.351326+0.0038784
[150]	train-rmse:0.274377+0.00103711	test-rmse:0.337475+0.00446533
[200]	train-rmse:0.258275+0.00151799	test-rmse:0.336864+0.00437301


In [23]:
kf = KFold(n_splits=5,shuffle=False)
col = list(test.columns)[2:]
train_array = train[col].as_matrix()
label = np.log1p(train['price_doc']).as_matrix()
accuracy = []

for train_index, test_index in kf.split(train_array):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = train_array[train_index,:], train_array[test_index,:]
    y_train, y_test = label[train_index], label[test_index]
    dtrain = xgb.DMatrix(X_train,y_train)
    dtest = xgb.DMatrix(X_test)
    model = xgb.train(params,dtrain,num_boost_round=len(xgb_cvalid))
    print('training done')
    pred = model.predict(dtest)
    RMSLE = mse(y_test, pred)**0.5
    print('RMSLE: '+str(RMSLE))
    accuracy.append(RMSLE)
    
accuracy

('TRAIN:', array([ 5853,  5854,  5855, ..., 29259, 29260, 29261]), 'TEST:', array([   0,    1,    2, ..., 5850, 5851, 5852]))
training done
RMSLE: 0.384004819482
('TRAIN:', array([    0,     1,     2, ..., 29259, 29260, 29261]), 'TEST:', array([ 5853,  5854,  5855, ..., 11703, 11704, 11705]))
training done
RMSLE: 0.318389721533
('TRAIN:', array([    0,     1,     2, ..., 29259, 29260, 29261]), 'TEST:', array([11706, 11707, 11708, ..., 17555, 17556, 17557]))
training done
RMSLE: 0.332483655834
('TRAIN:', array([    0,     1,     2, ..., 29259, 29260, 29261]), 'TEST:', array([17558, 17559, 17560, ..., 23407, 23408, 23409]))
training done
RMSLE: 0.322626565997
('TRAIN:', array([    0,     1,     2, ..., 23407, 23408, 23409]), 'TEST:', array([23410, 23411, 23412, ..., 29259, 29260, 29261]))
training done
RMSLE: 0.308389251332


[0.38400481948200516,
 0.31838972153259376,
 0.33248365583403999,
 0.32262656599650258,
 0.30838925133226203]

In [24]:
train_file = 'data/train_clean_subsample_avg_price.csv'
test_file = 'data/train_clean_subsample_avg_price.csv'

train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

In [25]:
train.columns.tolist()

['id',
 'timestamp',
 'full_sq',
 'life_sq',
 'floor',
 'max_floor',
 'material',
 'build_year',
 'num_room',
 'kitch_sq',
 'state',
 'product_type',
 'sub_area',
 'area_m',
 'raion_popul',
 'green_zone_part',
 'indust_part',
 'children_preschool',
 'preschool_quota',
 'preschool_education_centers_raion',
 'children_school',
 'school_quota',
 'school_education_centers_raion',
 'school_education_centers_top_20_raion',
 'hospital_beds_raion',
 'healthcare_centers_raion',
 'university_top_20_raion',
 'sport_objects_raion',
 'additional_education_raion',
 'culture_objects_top_25',
 'culture_objects_top_25_raion',
 'shopping_centers_raion',
 'office_raion',
 'thermal_power_plant_raion',
 'incineration_raion',
 'oil_chemistry_raion',
 'radiation_raion',
 'railroad_terminal_raion',
 'big_market_raion',
 'nuclear_reactor_raion',
 'detention_facility_raion',
 'full_all',
 'male_f',
 'female_f',
 'young_all',
 'young_male',
 'young_female',
 'work_all',
 'work_male',
 'work_female',
 'ekder_all'

In [37]:
import sys
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
from scipy import stats
from sklearn import preprocessing

"""
feature type:
0: direct use price
1. drop other currency
2. add different and ratio
3. add different and ratio, and statistics features
"""

train_file = 'data/train_clean_subsample_avg_price.csv'
test_file = 'data/test_clean_subsample_avg_price.csv'

train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

def drop_other_currency(df):
    drop_col = ['moscow_avg_price_avg', 'moscow_avg_price_high', 'moscow_avg_price_low', 'sub_area_avg_price']
    drop_col = ["{}_{}".format(col, currency) for currency in ['usd', 'eur'] for col in drop_col]
    return df.drop(drop_col, axis=1)

def get_avg_price_comparison_features(df):
    df['avg_subarea_moscow_avg_diff'] = df['sub_area_avg_price_rub'] - df['moscow_avg_price_avg_rub']
    df['avg_subarea_moscow_avg_ratio'] = (df['sub_area_avg_price_rub'] - df['moscow_avg_price_avg_rub'])/df['moscow_avg_price_avg_rub']
    return df


min_max_scaler = preprocessing.MinMaxScaler()

def get_slop(y):
    x = np.array(range(len(y)))
    slope, intercept, r_value, p_value, slope_std_error = stats.linregress(x, y)
    return slope

def get_avg_price_stat(df, columns):
    df_copy = pd.DataFrame()
    span_list = [7, 30, 90, 180]
    for t in span_list:
        for col_name in columns:
            df_copy['{}_mean_{}'.format(col_name, t)] = df[col_name].rolling(window=t).mean()
            df_copy['{}_std_{}'.format(col_name, t)] = df[col_name].rolling(window=t).std()
            df_copy['{}_norm'.format(col_name)] =  min_max_scaler.fit_transform(df[col_name])
            df_copy['{}_slope_{}'.format(col_name, t)] = df_copy['{}_norm'.format(col_name)].rolling(window=t).apply(get_slop)
            df_copy = df_copy.drop(['{}_norm'.format(col_name)], axis=1)
    return df_copy

feature_type = 3

if feature_type == 0:
    pass

if feature_type >= 1:
    train = drop_other_currency(train)
    test = drop_other_currency(test)

if feature_type >= 2:
    train = get_avg_price_comparison_features(train)
    test = get_avg_price_comparison_features(test)

if feature_type >= 3:
    train = get_avg_price_stat(train, ['sub_area_avg_price_rub', 'moscow_avg_price_avg_rub'])
    test = get_avg_price_stat(test, ['sub_area_avg_price_rub', 'moscow_avg_price_avg_rub'])



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [39]:
train = get_avg_price_stat(train, ['sub_area_avg_price_rub', 'moscow_avg_price_avg_rub'])



In [41]:
sum(pd.isnull(test ['sub_area_avg_price_rub']))
      #, 'moscow_avg_price_avg_rub']

4003

In [38]:
train[np.isfinite(train)]

TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
col = list(test.columns)[2:]
label = np.log1p(train['price_doc'])
dtrain = xgb.DMatrix(train[col],label)

params = {
'eta': 0.05, ## Try 0.01,3,5
'max_depth': 5,## Try 4,5,6
'subsample': 0.7,
'colsample_bytree': 0.7,
'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
    verbose_eval=50, show_stdv=True,seed=42, nfold=5)

print "xgboost round: {}".format(len(xgb_cvalid))

kf = KFold(n_splits=5,shuffle=False)
col = list(test.columns)[2:]
train_array = train[col].as_matrix()
label = np.log1p(train['price_doc']).as_matrix()
accuracy = []

for train_index, test_index in kf.split(train_array):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = train_array[train_index,:], train_array[test_index,:]
    y_train, y_test = label[train_index], label[test_index]
    dtrain = xgb.DMatrix(X_train,y_train)
    dtest = xgb.DMatrix(X_test)
    model = xgb.train(params,dtrain,num_boost_round=len(xgb_cvalid))
    print('training done')
    pred = model.predict(dtest)
    RMSLE = mse(y_test, pred)**0.5
    print('RMSLE: '+str(RMSLE))
    accuracy.append(RMSLE)

accuracy

print accuracy
print "mean\tstd"
print np.array(accuracy).mean(), np.array(accuracy).std()


def get_feature_importance(model):
    Importance = model.get_fscore()
    Importance = list(Importance.items())
    Feature= []
    Score = []
    for each in Importance:
        Feature.append(each[0])
        Score.append(each[1])
    df = pd.DataFrame({'Feature':Feature,'Score':Score}).sort_values(by=['Score'],ascending=[0])
    return df

f_imp = get_feature_importance(model)
print f_imp

In [46]:
import sys
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
from scipy import stats
from sklearn import preprocessing

"""
feature type:
0: direct use price
1. drop other currency
2. add different and ratio
3. add different and ratio, and avg statistics features
"""

train_file = 'data/train_clean_subsample_avg_price.csv'
test_file = 'data/test_clean_subsample_avg_price.csv'

train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

def drop_other_currency(df):
    drop_col = ['moscow_avg_price_avg', 'moscow_avg_price_high', 'moscow_avg_price_low', 'sub_area_avg_price']
    drop_col = ["{}_{}".format(col, currency) for currency in ['usd', 'eur'] for col in drop_col]
    return df.drop(drop_col, axis=1)

def get_avg_price_comparison_features(df):
    df['avg_subarea_moscow_avg_diff'] = df['sub_area_avg_price_rub'] - df['moscow_avg_price_avg_rub']
    df['avg_subarea_moscow_avg_ratio'] = (df['sub_area_avg_price_rub'] - df['moscow_avg_price_avg_rub'])/df['moscow_avg_price_avg_rub']
    return df


def add_moscow_price_stat(df):
    moscow_avg_price = pd.read_csv(prepath+'moscow_avg_price_with_stat.csv')
    df.timestamp = pd.to_datetime(df.timestamp, infer_datetime_format=True)
    moscow_avg_price = moscow_avg_price.rename(columns={'timestamp':'t_timestamp'})
    df['date'] = df['timestamp'].apply(lambda x: x.strftime('%Y-%m-%d'))
    df = df.merge(moscow_avg_price, left_on='date', right_on='t_timestamp').drop(['date', 't_timestamp'], axis=1)
    return df

3

if feature_type == 0:
    pass

if feature_type >= 1:
    train = drop_other_currency(train)
    test = drop_other_currency(test)

if feature_type >= 2:
    train = get_avg_price_comparison_features(train)
    test = get_avg_price_comparison_features(test)

if feature_type >= 3:
    train = add_moscow_price_stat(train)
    test = add_moscow_price_stat(test)

In [47]:
col = list(test.columns)[2:]
label = np.log1p(train['price_doc'])
dtrain = xgb.DMatrix(train[col],label)

params = {
'eta': 0.05, ## Try 0.01,3,5
'max_depth': 5,## Try 4,5,6
'subsample': 0.7,
'colsample_bytree': 0.7,
'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
    verbose_eval=50, show_stdv=True,seed=42, nfold=5)

print "xgboost round: {}".format(len(xgb_cvalid))

kf = KFold(n_splits=5,shuffle=False)
col = list(test.columns)[2:]
train_array = train[col].as_matrix()
label = np.log1p(train['price_doc']).as_matrix()
accuracy = []

for train_index, test_index in kf.split(train_array):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = train_array[train_index,:], train_array[test_index,:]
    y_train, y_test = label[train_index], label[test_index]
    dtrain = xgb.DMatrix(X_train,y_train)
    dtest = xgb.DMatrix(X_test)
    model = xgb.train(params,dtrain,num_boost_round=len(xgb_cvalid))
    print('training done')
    pred = model.predict(dtest)
    RMSLE = mse(y_test, pred)**0.5
    print('RMSLE: '+str(RMSLE))
    accuracy.append(RMSLE)

accuracy

print accuracy
print "mean\tstd"
print np.array(accuracy).mean(), np.array(accuracy).std()


def get_feature_importance(model):
    Importance = model.get_fscore()
    Importance = list(Importance.items())
    Feature= []
    Score = []
    for each in Importance:
        Feature.append(each[0])
        Score.append(each[1])
    df = pd.DataFrame({'Feature':Feature,'Score':Score}).sort_values(by=['Score'],ascending=[0])
    return df

f_imp = get_feature_importance(model)
print f_imp


"""
direct

[0]	train-rmse:14.427+0.00330067	test-rmse:14.427+0.00742354
[50]	train-rmse:1.1624+0.000801421	test-rmse:1.16658+0.00543451
[100]	train-rmse:0.317574+0.00357707	test-rmse:0.344896+0.00726594
[150]	train-rmse:0.288764+0.00380459	test-rmse:0.329274+0.00883175
[200]	train-rmse:0.276291+0.0036066	test-rmse:0.327643+0.00879625
[250]	train-rmse:0.266411+0.00323442	test-rmse:0.327148+0.00842217
[300]	train-rmse:0.257561+0.00340931	test-rmse:0.326993+0.00800119
('TRAIN:', array([ 5853,  5854,  5855, ..., 29259, 29260, 29261]), 'TEST:', array([   0,    1,    2, ..., 5850, 5851, 5852]))
training done
RMSLE: 0.383017101562
('TRAIN:', array([    0,     1,     2, ..., 29259, 29260, 29261]), 'TEST:', array([ 5853,  5854,  5855, ..., 11703, 11704, 11705]))
training done
RMSLE: 0.313228510103
('TRAIN:', array([    0,     1,     2, ..., 29259, 29260, 29261]), 'TEST:', array([11706, 11707, 11708, ..., 17555, 17556, 17557]))
training done
RMSLE: 0.330456779999
('TRAIN:', array([    0,     1,     2, ..., 29259, 29260, 29261]), 'TEST:', array([17558, 17559, 17560, ..., 23407, 23408, 23409]))
training done
RMSLE: 0.322327986334
('TRAIN:', array([    0,     1,     2, ..., 23407, 23408, 23409]), 'TEST:', array([23410, 23411, 23412, ..., 29259, 29260, 29261]))
training done
RMSLE: 0.320165030776
[0.38301710156174595, 0.31322851010289504, 0.33045677999895751, 0.32232798633397974, 0.32016503077598907]
mean	std
0.333839081755 0.0251960244843
"""


"""
1. drop other currency
[0]	train-rmse:14.427+0.00167013	test-rmse:14.427+0.0071428
[50]	train-rmse:1.16191+0.000549938	test-rmse:1.16602+0.00476997
[100]	train-rmse:0.320419+0.00202003	test-rmse:0.34418+0.00723171
[150]	train-rmse:0.293878+0.00229244	test-rmse:0.328318+0.009073
[200]	train-rmse:0.282967+0.002239	test-rmse:0.326741+0.00933618
[250]	train-rmse:0.274054+0.00228758	test-rmse:0.325941+0.00941562
[300]	train-rmse:0.265922+0.0025173	test-rmse:0.32563+0.00927432
[350]	train-rmse:0.258777+0.00253456	test-rmse:0.325466+0.00946576
xgboost round: 350
('TRAIN:', array([ 5853,  5854,  5855, ..., 29259, 29260, 29261]), 'TEST:', array([   0,    1,    2, ..., 5850, 5851, 5852]))
training done
RMSLE: 0.383769717317
('TRAIN:', array([    0,     1,     2, ..., 29259, 29260, 29261]), 'TEST:', array([ 5853,  5854,  5855, ..., 11703, 11704, 11705]))
training done
RMSLE: 0.313193736375
('TRAIN:', array([    0,     1,     2, ..., 29259, 29260, 29261]), 'TEST:', array([11706, 11707, 11708, ..., 17555, 17556, 17557]))
training done
RMSLE: 0.332085268764
('TRAIN:', array([    0,     1,     2, ..., 29259, 29260, 29261]), 'TEST:', array([17558, 17559, 17560, ..., 23407, 23408, 23409]))
training done
RMSLE: 0.321253921226
('TRAIN:', array([    0,     1,     2, ..., 23407, 23408, 23409]), 'TEST:', array([23410, 23411, 23412, ..., 29259, 29260, 29261]))
training done
RMSLE: 0.302037854381
[0.38376971731671189, 0.31319373637465714, 0.33208526876408434, 0.32125392122648716, 0.30203785438076636]
mean	std
0.330468099613 0.0284086650108
"""



"""
2. add different and ratio
[0]	train-rmse:14.427+0.00167013	test-rmse:14.427+0.0071428
[50]	train-rmse:1.16179+0.000576531	test-rmse:1.1655+0.00449047
[100]	train-rmse:0.320246+0.00201104	test-rmse:0.344001+0.00754858
[150]	train-rmse:0.293562+0.00193066	test-rmse:0.328398+0.00943655
[200]	train-rmse:0.282715+0.00206784	test-rmse:0.32672+0.00957286
[250]	train-rmse:0.273669+0.0022386	test-rmse:0.326108+0.009504
xgboost round: 271
('TRAIN:', array([ 5853,  5854,  5855, ..., 29259, 29260, 29261]), 'TEST:', array([   0,    1,    2, ..., 5850, 5851, 5852]))
training done
RMSLE: 0.38314368611
('TRAIN:', array([    0,     1,     2, ..., 29259, 29260, 29261]), 'TEST:', array([ 5853,  5854,  5855, ..., 11703, 11704, 11705]))
training done
RMSLE: 0.314605651939
('TRAIN:', array([    0,     1,     2, ..., 29259, 29260, 29261]), 'TEST:', array([11706, 11707, 11708, ..., 17555, 17556, 17557]))
training done
RMSLE: 0.331218417307
('TRAIN:', array([    0,     1,     2, ..., 29259, 29260, 29261]), 'TEST:', array([17558, 17559, 17560, ..., 23407, 23408, 23409]))
training done
RMSLE: 0.320997076729
('TRAIN:', array([    0,     1,     2, ..., 23407, 23408, 23409]), 'TEST:', array([23410, 23411, 23412, ..., 29259, 29260, 29261]))
training done
RMSLE: 0.304352344435
[0.3831436861101189, 0.31460565193924533, 0.33121841730725859, 0.32099707672850647, 0.30435234443486026]
mean	std
0.330863435304 0.0275602943639
"""

[0]	train-rmse:14.4271+0.00168388	test-rmse:14.4271+0.00692365


KeyboardInterrupt: 

In [44]:
train.head().T

Unnamed: 0,0,1,2,3,4
id,1,2,3,4,5
timestamp,2011-08-20 00:00:00,2011-08-23 00:00:00,2011-08-27 00:00:00,2011-09-01 00:00:00,2011-09-05 00:00:00
full_sq,43,34,43,89,77
life_sq,27,19,29,50,77
floor,4,3,2,9,4
max_floor,,,,,
material,,,,,
build_year,,,,,
num_room,,,,,
kitch_sq,,,,,
