In [1]:
# Install LightGBM into the notebook environment through a shell escape.
# NOTE(review): the version is not pinned and no install log was saved for this
# cell, so the release the results below were produced with is unknown.
!pip install lightgbm



In [3]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-0.7.post3.tar.gz (450kB)
[K    100% |################################| 460kB 1.9MB/s ta 0:00:011
Building wheels for collected packages: xgboost
  Running setup.py bdist_wheel for xgboost ... [?25ldone
[?25h  Stored in directory: /home/nbuser/.cache/pip/wheels/ca/b3/02/d44d5e12c5c1eecff4a822555bac96b182551cd5e13c4795f6
Successfully built xgboost
Installing collected packages: xgboost
Successfully installed xgboost-0.7.post3


In [7]:
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [5]:
import zipfile

# Read the test set directly out of its zip archive. Both the archive and the
# member stream are opened with context managers so their file handles are
# released as soon as the frame has been loaded.
with zipfile.ZipFile('test.csv.zip', 'r') as archive:
    with archive.open('test.csv') as csv_file:
        # The file is semicolon-separated and uses a comma as decimal mark.
        test = pd.read_csv(csv_file, sep=";", decimal=",", parse_dates=True)

In [6]:
# Read the training set directly out of its zip archive. Both the archive and
# the member stream are opened with context managers so their file handles are
# released as soon as the frame has been loaded.
with zipfile.ZipFile('train.csv.zip', 'r') as archive:
    with archive.open('train.csv') as csv_file:
        # The file is semicolon-separated and uses a comma as decimal mark.
        train = pd.read_csv(csv_file, sep=";", decimal=",", parse_dates=True)

In [8]:
import datetime  # no longer used by this cell; kept in case other cells rely on it

# Convert the 'date' columns to datetime64 in one vectorised call each.
# The format is year-month-day separated by '-', the same field order the
# previous per-row split/int conversion read. Unlike that version, this cell
# can safely be re-run: pd.to_datetime accepts a column that is already
# datetime, whereas the `.str` accessor raised on the second execution.
test['date'] = pd.to_datetime(test['date'], format='%Y-%m-%d')
train['date'] = pd.to_datetime(train['date'], format='%Y-%m-%d')

In [9]:
# Derive calendar features from the parsed 'date' column. The same four
# columns are added, in the same order, to both the train and the test frame.
for frame in (train, test):
    date_parts = frame['date'].dt
    frame['dayofweek'] = date_parts.dayofweek
    frame['quarter'] = date_parts.quarter
    frame['week'] = date_parts.week
    frame['month'] = date_parts.month

In [10]:
## some more feature engineering
# Extract numeric hints from the free-text article name into integer columns.
# Each pattern has exactly one capture group; rows where the pattern does not
# match get 0. The patterns are raw strings so that `\d` reaches the regex
# engine as written instead of being an invalid Python string escape.
# NOTE(review): 'X ?(\d)' captures a single digit only, so "X 12" yields 1 —
# confirm that this is intended.
quantity_patterns = [
    ('qteG', r'(\d+)G'),       # digits directly followed by "G"
    ('qteX', r'X ?(\d)'),      # one digit after "X", optional space between
    ('qteMl', r'(\d+) ?Ml'),   # digits followed by "Ml", optional space between
]
for frame in (train, test):
    for column, pattern in quantity_patterns:
        frame[column] = (
            frame['article_nom'].str.extract(pattern, expand=True)
            .fillna(0)
            .astype(int)
        )

In [11]:
# Regression target: the 'qte_article_vendue' column, indexed by row 'id'.
ytrain = train.set_index('id')['qte_article_vendue']

In [12]:
# Columns that receive an integer label encoding (stored as '<name>_label').
cat_features = [
    "implant",
    "article_nom",
]

In [13]:
from sklearn import preprocessing

# One separate, not-yet-fitted LabelEncoder per categorical column, keyed by
# the column name.
label_encoders = {}
for cat in cat_features:
    label_encoders[cat] = preprocessing.LabelEncoder()

In [14]:
# Add an integer-encoded '<column>_label' column to both frames.
# Each encoder is fitted on the union of the train and test values: fitting on
# train alone makes `transform(test[...])` raise ValueError as soon as the test
# set contains a category that never occurs in train. When there is no such
# category the fitted classes, and therefore the codes, are unchanged.
for cat, le in label_encoders.items():
    cat_str = cat + '_label'
    le.fit(pd.concat([train[cat], test[cat]], ignore_index=True))
    train[cat_str] = le.transform(train[cat])
    test[cat_str] = le.transform(test[cat])

In [12]:
##aggregates
#data = pd.concat([train.set_index('id'),test.set_index('id')],axis=0)


In [1]:
#data.groupby(['article_nom','date','implant']).qte_article_vendue.rolling(2).mean().reset_index()

In [15]:
# Build the model inputs: index by 'id' and keep only float64 / int64 columns.
# The target column is removed from the training features.
numeric_dtypes = ['float64', 'int64']
trainingset = (
    train.set_index('id')
    .select_dtypes(include=numeric_dtypes)
    .drop('qte_article_vendue', axis=1)
)
testset = test.set_index('id').select_dtypes(include=numeric_dtypes)

In [16]:
# Feature Selection
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

# Fit an extra-trees regressor and let SelectFromModel keep the features it
# considers important. The regressor is seeded (42, the seed used by the other
# cells) because an unseeded forest can select a different feature subset on
# every run.
regressor = ExtraTreesRegressor(random_state=42).fit(trainingset, ytrain)
model = SelectFromModel(regressor, prefit=True)

# Reduced feature matrices for the training and the submission rows.
X = model.transform(trainingset)
Xpredict = model.transform(testset)

# Modeling

In [17]:
# Show the names of the columns kept by the feature selector.
selected_mask = model.get_support()
trainingset.columns[selected_mask]

Index(['vente_j_7', 'vente_j_8_14', 'vente_cat5_j_7', 'vente_cat5_j_8_14',
       'vente_cat4_j_7', 'vente_cat4_j_8_14', 'dayofweek', 'qteG',
       'implant_label', 'article_nom_label'],
      dtype='object')

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows for validation, with a fixed seed.
# The split is taken from `trainingset` (all numeric columns).
X_train, X_test, y_train, y_test = train_test_split(
    trainingset,
    ytrain,
    test_size=0.05,
    random_state=42,
)

In [30]:
print('Start training...')
# train
gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=60,
                        learning_rate=0.1,
                        n_estimators=150, random_state=42)
# Early stopping is requested through the callback API rather than the
# `early_stopping_rounds` keyword of fit(): the keyword was removed in
# LightGBM 4.0, and the install cell does not pin a version. Training stops
# once the validation RMSE has not improved for 5 consecutive rounds.
# NOTE(review): whether per-iteration scores are still printed depends on the
# installed LightGBM version.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

print('Start predicting...')
# predict with the best iteration found by early stopping
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval: RMSE on the hold-out rows (the same rows that drove early stopping)
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# feature importances, in the column order of the training matrix
print('Feature importances:', list(gbm.feature_importances_))

Start training...
[1]	valid_0's rmse: 0.824954
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's rmse: 0.794949
[3]	valid_0's rmse: 0.769341
[4]	valid_0's rmse: 0.748389
[5]	valid_0's rmse: 0.729549
[6]	valid_0's rmse: 0.714751
[7]	valid_0's rmse: 0.701308
[8]	valid_0's rmse: 0.690887
[9]	valid_0's rmse: 0.681769
[10]	valid_0's rmse: 0.67352
[11]	valid_0's rmse: 0.666433
[12]	valid_0's rmse: 0.660194
[13]	valid_0's rmse: 0.654688
[14]	valid_0's rmse: 0.65034
[15]	valid_0's rmse: 0.646665
[16]	valid_0's rmse: 0.64352
[17]	valid_0's rmse: 0.640808
[18]	valid_0's rmse: 0.638041
[19]	valid_0's rmse: 0.635866
[20]	valid_0's rmse: 0.634015
[21]	valid_0's rmse: 0.632183
[22]	valid_0's rmse: 0.630454
[23]	valid_0's rmse: 0.629355
[24]	valid_0's rmse: 0.627942
[25]	valid_0's rmse: 0.626457
[26]	valid_0's rmse: 0.625055
[27]	valid_0's rmse: 0.624297
[28]	valid_0's rmse: 0.623282
[29]	valid_0's rmse: 0.622281
[30]	valid_0's rmse: 0.621367
[31]	valid_0's rmse: 0.620757
[32

In [31]:
import numpy as np

# Print the names of the ten features with the highest LightGBM importance,
# most important first.
top_positions = np.argsort(gbm.feature_importances_)[::-1][:10]
for feature_name in trainingset.columns[top_positions]:
    print(feature_name)

vente_j_8_14
vente_j_7
week
vente_cat4_j_8_14
vente_cat5_j_7
qteG
dayofweek
article_nom_label
vente_cat4_j_7
vente_cat5_j_8_14


In [25]:
# Show the documentation of XGBRegressor.fit. The lookup goes through the class
# because the `xgbReg` instance is only created in a later cell, so asking the
# instance here fails with NameError when the notebook is run top to bottom.
help(xgb.XGBRegressor.fit)

Help on method fit in module xgboost.sklearn:

fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None) method of xgboost.sklearn.XGBRegressor instance
    Fit the gradient boosting model
    
    Parameters
    ----------
    X : array_like
        Feature matrix
    y : array_like
        Labels
    sample_weight : array_like
        instance weights
    eval_set : list, optional
        A list of (X, y) tuple pairs to use as a validation set for
        early-stopping
    eval_metric : str, callable, optional
        If a str, should be a built-in evaluation metric to use. See
        doc/parameter.md. If callable, a custom evaluation metric. The call
        signature is func(y_predicted, y_true) where y_true will be a
        DMatrix object such that you may need to call the get_label
        method. It must return a str, value pair where the str is a name
        for the evaluation and value is the value of the evalu

In [27]:
# Configure and fit the XGBoost regressor. The hold-out split is passed as the
# evaluation set, scored with RMSE, with early stopping after 5 rounds.
xgb_params = dict(nthread=-1, min_child_weight=4, subsample=0.9, max_depth=5)
xgbReg = xgb.XGBRegressor(**xgb_params)
xgbReg.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    early_stopping_rounds=5,
)



[0]	validation_0-rmse:0.826859
Will train until validation_0-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:0.798098
[2]	validation_0-rmse:0.772668
[3]	validation_0-rmse:0.751844
[4]	validation_0-rmse:0.734243
[5]	validation_0-rmse:0.719077
[6]	validation_0-rmse:0.706491
[7]	validation_0-rmse:0.696243
[8]	validation_0-rmse:0.68749
[9]	validation_0-rmse:0.679917
[10]	validation_0-rmse:0.673718
[11]	validation_0-rmse:0.66858
[12]	validation_0-rmse:0.663864
[13]	validation_0-rmse:0.660004
[14]	validation_0-rmse:0.656571
[15]	validation_0-rmse:0.653245
[16]	validation_0-rmse:0.65026
[17]	validation_0-rmse:0.647687
[18]	validation_0-rmse:0.645823
[19]	validation_0-rmse:0.644358
[20]	validation_0-rmse:0.642686
[21]	validation_0-rmse:0.641419
[22]	validation_0-rmse:0.63989
[23]	validation_0-rmse:0.638882
[24]	validation_0-rmse:0.6377
[25]	validation_0-rmse:0.6365
[26]	validation_0-rmse:0.635653
[27]	validation_0-rmse:0.634763
[28]	validation_0-rmse:0.634005
[29]	validation_0-rmse:0.6

TypeError: predict() got an unexpected keyword argument 'num_iteration'

In [33]:
print('Start predicting...')
# predict on the hold-out rows
y_pred2 = xgbReg.predict(X_test)
# eval: root of the mean squared error
xgb_rmse = mean_squared_error(y_test, y_pred2) ** 0.5
print('The rmse of prediction is:', xgb_rmse)

# feature importances, in the column order of the training matrix
xgb_importances = xgbReg.feature_importances_
print('Feature importances:', list(xgb_importances))

import numpy as np
# names of the ten most important features, most important first
top_positions = np.argsort(xgb_importances)[::-1][:10]
for feature_name in trainingset.columns[top_positions]:
    print(feature_name)

Start predicting...
The rmse of prediction is: 0.614503423079
Feature importances: [0.016853932, 0.015449438, 0.0066713481, 0.028441012, 0.014396068, 0.0056179776, 0.018258426, 0.024227528, 0.019662922, 0.0070224721, 0.016151685, 0.015098315, 0.0056179776, 0.0049157306, 0.013693821, 0.020365169, 0.0014044944, 0.0056179776, 0.0098314611, 0.022120787, 0.001755618, 0.0, 0.0052668541, 0.0010533708, 0.0045646066, 0.025632022, 0.0056179776, 0.0, 0.0, 0.0024578653, 0.0049157306, 0.11341292, 0.18820225, 0.04985955, 0.031601124, 0.023174157, 0.051966291, 0.047752809, 0.0038623596, 0.066011235, 0.0070224721, 0.024578651, 0.0003511236, 0.0080758426, 0.024929775, 0.036516853]
vente_j_8_14
vente_j_7
week
vente_cat4_j_8_14
vente_cat5_j_7
dayofweek
article_nom_label
vente_cat5_j_8_14
t_9h_rouen
retour_zone_1


In [34]:
# RMSE of the equal-weight average of the LightGBM and XGBoost predictions
# on the hold-out rows.
blend_pred = 0.5 * (y_pred + y_pred2)
blend_rmse = mean_squared_error(y_test, blend_pred) ** 0.5
print('The rmse of prediction is:', blend_rmse)

The rmse of prediction is: 0.603037683074


In [35]:
# LightGBM predictions for the submission rows, limited to the best iteration
# recorded on the fitted model.
best_round = gbm.best_iteration_
y_sub = gbm.predict(testset, num_iteration=best_round)

In [36]:
# XGBoost predictions for the same submission rows (`testset`).
y_sub2 = xgbReg.predict(testset)

In [37]:
# Average the two models' predictions with equal weights and write them to
# 'sub.csv', indexed like `testset`, using ';' as separator and ',' as
# decimal mark.
submission = pd.DataFrame(
    0.5 * (y_sub + y_sub2),
    index=testset.index,
    columns=['quantite_vendue'],
)
submission.to_csv('sub.csv', sep=';', decimal=',')