In [2]:
from mlbox.preprocessing import *
from mlbox.optimisation import *
from mlbox.prediction import *
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Name of the column to predict; reused below when reading the data and
# when building the submission file.
target_name = "SalePrice"

In [4]:
# Input files: the labelled training set and the unlabelled test set.
paths = ["train.csv", "test.csv"]

# Comma-separated reader.
rd = Reader(sep=",")

# Later cells index the result as df['train'] and df['test'].
df = rd.train_test_split(paths, target_name)


reading csv : train.csv ...
cleaning data ...
CPU time: 2.1471176147460938 seconds

reading csv : test.csv ...
cleaning data ...
CPU time: 0.18936872482299805 seconds

> Number of common features : 80

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 43
> Number of numerical features: 37
> Number of training samples : 1460
> Number of test samples : 1459

> Top sparse features (% missing values on train set):
PoolQC         99.5
MiscFeature    96.3
Alley          93.8
Fence          80.8
FireplaceQu    47.3
dtype: float64

> Task : regression
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64


In [5]:
# Estimate per-feature drift between the train and test sets and drop the
# features the thresholder rejects (the run captured below removed 'Id').
dft = Drift_thresholder()
# NOTE: `df` is rebound to the filtered data, so re-running this cell works on
# the already-filtered frames rather than on the Reader output.
df = dft.fit_transform(df)


computing drifts ...
CPU time: 0.5379059314727783 seconds

> Top 10 drifts

('Id', 1.0)
('Exterior2nd', 0.06692247765095138)
('TotRmsAbvGrd', 0.06244630217813274)
('Fireplaces', 0.04357188363693654)
('1stFlrSF', 0.03847960830314534)
('Condition1', 0.03844534950079037)
('BldgType', 0.03602915537598572)
('YrSold', 0.03556155580112552)
('MSZoning', 0.03215772151292695)
('Exterior1st', 0.03189699655661604)

> Deleted variables : ['Id']
> Drift coefficients dumped into directory : save


In [6]:
# Persist the drift-filtered training features (row index is not written).
df["train"].to_csv("preprocessed_train.csv", index=False)

In [7]:
# Persist the drift-filtered test features (row index is not written).
df["test"].to_csv("preprocessed_test.csv", index=False)

In [8]:
def rmse_metric(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; must have the same length as ``y_true``.

    Returns
    -------
    float
        ``sqrt(sum((y_true - y_pred) ** 2) / len(y_true))``.
    """
    return np.sqrt(np.sum((y_true - y_pred) ** 2) / len(y_true))


# greater_is_better=False marks this as a loss: the scorer reports the negated
# RMSE, which is why the optimiser output below shows negative scores.
# `needs_proba` is deliberately not passed: False is its default, and the
# keyword is deprecated in recent scikit-learn releases.
# NOTE(review): `np` and `make_scorer` are not imported explicitly in the
# visible cells; presumably they arrive via the mlbox star-imports. Consider
# importing them explicitly in the first cell.
rmse = make_scorer(rmse_metric, greater_is_better=False)

# Optimiser scored with the custom RMSE scorer, using 5 cross-validation folds.
opt = Optimiser(scoring=rmse, n_folds=5)

# LightGBM

In [9]:
# Hyper-parameter search space handed to the optimiser. Every entry names a
# "search" kind and the "space" it draws from; a single-element "choice" list
# pins that parameter to one value.
# NOTE(review): for "uniform" entries the two numbers are presumably the lower
# and upper bounds; confirm against the MLBox documentation.
space = {
    "est__strategy": dict(search="choice", space=["LightGBM"]),
    "est__n_estimators": dict(search="choice", space=[150, 300]),
    "est__colsample_bytree": dict(search="uniform", space=[0.8, 0.95]),
    "est__subsample": dict(search="uniform", space=[0.8, 0.95]),
    "est__max_depth": dict(search="choice", space=[5, 6, 7, 8, 9]),
    "est__learning_rate": dict(search="choice", space=[0.01, 0.07]),
    "est__n_jobs": dict(search="choice", space=[4]),
}

# Run the hyper-parameter search over `space` with a budget of 50 evaluations;
# `params` is what the predictor cell consumes next.
params = opt.optimise(space, df, max_evals=50)

##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': '<NULL>'}
>>> CA ENCODER :{'strategy': 'label_encoding'}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'colsample_bytree': 0.9390547369633679, 'learning_rate': 0.07, 'max_depth': 8, 'n_estimators': 150, 'n_jobs': 4, 'subsample': 0.831433746026626, 'boosting_type': 'gbdt', 'class_weight': None, 'importance_type': 'split', 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'nthread': -1, 'seed': 0}
MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -28200.42790032871
VARIANCE : 4459.585096200776 (fold 1 = -28119.99417067384, fold 2 = -29736.876106566782, fold 3 = -26574.073079035785, fold 4 

In [10]:
# Fit the pipeline with the selected hyper-parameters and predict on the test
# set. Per the log below, feature importances and predictions are dumped into
# the "save" directory, which the submission cell reads back.
prd = Predictor()
# Left as the cell's last expression, so the notebook also displays its repr.
prd.fit_predict(params, df)


fitting the pipeline ...
CPU time: 0.14815711975097656 seconds

> Feature importances dumped into directory : save

predicting...
CPU time: 0.0493316650390625 seconds

> Overview on predictions : 

   SalePrice_predicted
0        126515.771485
1        154357.757515
2        179185.033749
3        183548.464490
4        182008.667972
5        174669.024861
6        172871.038341
7        169968.292019
8        176905.029256
9        120890.240177

dumping predictions into directory : save ...


<mlbox.prediction.predictor.Predictor at 0x7f8da68c7d50>

In [11]:
# Load the submission template and the predictions dumped by the predictor
# into the "save" directory.
submit = pd.read_csv("sample_submission.csv", sep=",")
preds = pd.read_csv("save/{}_predictions.csv".format(target_name))

# Copy the predicted values into the template's target column. `.values`
# assigns positionally, so the prediction rows are assumed to be in the same
# order as sample_submission.csv (TODO confirm).
predicted_column = "{}_predicted".format(target_name)
submit[target_name] = preds[predicted_column].values

# Write the final submission without the row index.
submit.to_csv("mlbox.csv", index=False)