In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc

In [6]:
# Параметры
XGB_WEIGHT = 0.6500
BASELINE_WEIGHT = 0.0056

BASELINE_PRED = 0.0115

In [7]:
print( "\nЧтение данных ...")
prop = pd.read_csv('drive/MyDrive/zillow-prize-1/properties_2016.csv')
train = pd.read_csv('drive/MyDrive/zillow-prize-1/train_2016_v2.csv')


Чтение данных ...


  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
print( "\nОбработка данных для LightGBM ..." )
for c, dtype in zip(prop.columns, prop.dtypes):	
    if dtype == np.float64:		
        prop[c] = prop[c].astype(np.float32)

#Объединение таблиц
df_train = train.merge(prop, how='left', on='parcelid')
# Заполение с помощью медианного значения
df_train.fillna(df_train.median(),inplace = True)

#Удаление столбцов
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 
                         'propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)


train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train; gc.collect()

x_train = x_train.values.astype(np.float32, copy=False)
# Создание датасета для LightGBM с данными x_train и названиями из y_train
d_train = lgb.Dataset(x_train, label=y_train)


Обработка данных для LightGBM ...
(90275, 53) (90275,)


In [9]:
#Задание парметров модели
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'          # or 'mae'
params['sub_feature'] = 0.5      # feature_fraction -- OK, back to .5, but maybe later increase this
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512        # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
params['verbose'] = 0

In [10]:
print("\nОбучение модели LightGBM ...")
#Обучение модели с укаанием количества итераций бустинга
clf = lgb.train(params, d_train, 430)

del d_train; gc.collect()
del x_train; gc.collect()


Обучение модели LightGBM ...


0

In [11]:
print("\nПодготовка для предсказания ...")
print("   Чтение файла ...")
sample = pd.read_csv('drive/MyDrive/zillow-prize-1/sample_submission.csv')
print("   ...")
sample['parcelid'] = sample['ParcelId']
print("   Объединенение с таблицей property ...")
#Объединение таблиц
df_test = sample.merge(prop, on='parcelid', how='left')
print("   ...")
del sample, prop; gc.collect()
print("   ...")
x_test = df_test[train_columns]
print("   ...")
del df_test; gc.collect()
print("   Подготовка x_test...")
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
print("   ...")
x_test = x_test.values.astype(np.float32, copy=False)


Подготовка для предсказания ...
   Чтение файла ...
   ...
   Объединенение с таблицей property ...
   ...
   ...
   ...
   Подготовка x_test...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


   ...


In [12]:
print("\nLightGBM предсказание ...")
# num_threads > 1 will predict very slow in kernal
# 
clf.reset_parameter({"num_threads":1})
p_test = clf.predict(x_test)

del x_test; gc.collect()


LightGBM предсказание ...


144

In [13]:
print( "\nНескорекктированные прогнозы LightGBM:" )
print( pd.DataFrame(p_test).head() )


Нескорекктированные прогнозы LightGBM:
          0
0  0.035296
1  0.038122
2  0.010894
3  0.008950
4  0.008910


In [14]:
print( "\nRe-reading properties file ...")
properties = pd.read_csv('drive/MyDrive/zillow-prize-1/properties_2016.csv')


Re-reading properties file ...


  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
##### Обработка данных для XGBOOST

print( "\nProcessing data for XGBoost ...")
for c in properties.columns:
    properties[c]=properties[c].fillna(-1)
    if properties[c].dtype == 'object':
#LabelEncoder присваиваtn порядковые номера категориальным данным 
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))

train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
x_test = properties.drop(['parcelid'], axis=1)
# Размер
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))


Processing data for XGBoost ...
Shape train: (90275, 57)
Shape test: (2985217, 57)


In [16]:
# Удалить лишнее
train_df=train_df[ train_df.logerror > -0.4 ]
train_df=train_df[ train_df.logerror < 0.418 ]
x_train=train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
y_train = train_df["logerror"].values.astype(np.float32)
y_mean = np.mean(y_train)

print('After removing outliers:')     
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))

After removing outliers:
Shape train: (88525, 57)
Shape test: (2985217, 57)


In [17]:
##### Запустить XGBOOST

print("\nSetting up data for XGBoost ...")
# xgboost параметры
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    'base_score': y_mean,
    'silent': 1
}


Setting up data for XGBoost ...


In [18]:
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

In [19]:
num_boost_rounds = 242
print("\nXGBoost tuned with CV in:")
print("   https://www.kaggle.com/aharless/xgboost-without-outliers-tweak ")
print("num_boost_rounds="+str(num_boost_rounds))



XGBoost tuned with CV in:
   https://www.kaggle.com/aharless/xgboost-without-outliers-tweak 
num_boost_rounds=242


In [20]:
# Обучени модели
print( "\nОбучение XGBoost ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


Обучение XGBoost ...


In [21]:
print( "\nПрогноз с XGBoost ...")
xgb_pred = model.predict(dtest)


Прогноз с XGBoost ...


In [22]:
print( "\nXGBoost предсказания:" )
print( pd.DataFrame(xgb_pred).head() )


XGBoost предсказания:
          0
0 -0.040072
1 -0.026636
2  0.018891
3  0.063797
4 -0.002863


In [23]:
##### Объединение предсказаний

print( "\nОбъединение XGBoost, LightGBM, и baseline predicitons ..." )
lgb_weight = 1 - XGB_WEIGHT - BASELINE_WEIGHT
pred = XGB_WEIGHT*xgb_pred + BASELINE_WEIGHT*BASELINE_PRED + lgb_weight*p_test

print( "\nОбъединенное прогнозирование:" )
print( pd.DataFrame(pred).head() )



Объединение XGBoost, LightGBM, и baseline predicitons ...

Объединенное прогнозирование:
          0
0 -0.013827
1 -0.004120
2  0.016095
3  0.044615
4  0.001272


In [24]:
##### Запись результатов

print( "\nПодготовка результатов..." )
y_pred=[]

for i,predict in enumerate(pred):
    y_pred.append(str(round(predict,4)))
y_pred=np.array(y_pred)

output = pd.DataFrame({'ParcelId': properties['parcelid'].astype(np.int32),
        '201610': y_pred, '201611': y_pred, '201612': y_pred,
        '201710': y_pred, '201711': y_pred, '201712': y_pred})
# Переместить 'ParceID' в первый столбец
cols = output.columns.tolist()
cols = cols[-1:] + cols[:-1]
output = output[cols]
from datetime import datetime

print( "\nЗапись результатов ..." )
output.to_csv('sub{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False)

print( "\nЗавершено..." )


Подготовка результатов...

Зпись результатов ...

Завершено...
