In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np

In [2]:
# Load the labelled training split and report its (rows, columns) shape.
train = pd.read_csv('../input/train_MLWARE2.csv')
print(train.shape)

# Load the test split to be scored and report its shape as well.
test = pd.read_csv('../input/test_MLWARE2.csv')
print(test.shape)

(958529, 4)
(641015, 3)


In [3]:
# Stack train and test rows so the frequency features below are computed over
# every observed (userId, itemId) pair, not just the labelled ones.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with append, the original row labels are kept
# (no ignore_index); columns present in only one frame are filled with NaN.
train_test = pd.concat([train, test], sort=False)

In [4]:
# Per-user frequency table: one row per userId with the number of rows in
# train+test that reference it, most frequent first (value_counts order).
# The index and value names are set explicitly rather than renaming the
# 'index' / 'userId' columns produced by reset_index: those default names
# differ between pandas versions (pandas >= 2.0 yields 'userId' / 'count'),
# which would leave the id column mislabelled and break the later merge.
user = (
    train_test['userId']
    .value_counts()
    .rename_axis('userId')
    .reset_index(name='user_count')
)

In [5]:
# Preview the first five rows of the per-user count table.
user.head(n=5)

Unnamed: 0,userId,user_count
0,1319,120
1,46189,120
2,31590,120
3,32088,120
4,30288,120


In [6]:
# Per-item frequency table: one row per itemId with the number of rows in
# train+test that reference it, most frequent first (value_counts order).
# The index and value names are set explicitly rather than renaming the
# 'index' / 'itemId' columns produced by reset_index: those default names
# differ between pandas versions (pandas >= 2.0 yields 'itemId' / 'count'),
# which would leave the id column mislabelled and break the later merge.
item = (
    train_test['itemId']
    .value_counts()
    .rename_axis('itemId')
    .reset_index(name='item_count')
)

In [7]:
# Preview the first five rows of the per-item count table.
item.head(n=5)

Unnamed: 0,itemId,item_count
0,1,40000
1,129,40000
2,47,40000
3,28,40000
4,25,40000


In [8]:
# Attach the frequency features to both splits: first the per-user count
# (joined on userId), then the per-item count (joined on itemId).
# Left joins keep every row of the split being merged into.
# Note: this cell rebinds `train` and `test`, so it is meant to run once per
# fresh load of the data.
train = (
    train
    .merge(user, how='left', on='userId')
    .merge(item, how='left', on='itemId')
)
test = (
    test
    .merge(user, how='left', on='userId')
    .merge(item, how='left', on='itemId')
)

In [9]:
# Regression label taken from the training frame.
target = train['rating']
# Row identifiers of the test frame, kept for building the submission file.
test_ids = test['ID']

In [10]:
# Show the column names of the training frame after the count features
# have been merged in.
train.columns.values

array(['ID', 'userId', 'itemId', 'rating', 'user_count', 'item_count'], dtype=object)

In [13]:
# Columns passed to the model: the two raw identifiers followed by the
# frequency features derived from them.
cols_for_model = [
    'userId',
    'itemId',
    'user_count',
    'item_count',
]

In [14]:
# Wrap the selected feature columns in XGBoost's DMatrix container.
# NaN is declared as the missing-value marker for both matrices.
# Only the training matrix carries labels; the test matrix is features only.
dtrain = xgb.DMatrix(
    train[cols_for_model],
    label=target,
    missing=np.nan,
)
dtest = xgb.DMatrix(
    test[cols_for_model],
    missing=np.nan,
)

In [15]:
# Booster configuration handed to xgb.train: a tree booster fitted with the
# 'reg:linear' regression objective and monitored with RMSE.
# NOTE(review): newer XGBoost releases are understood to rename 'reg:linear'
# to 'reg:squarederror' and to replace 'silent' with 'verbosity' -- confirm
# against the installed version before changing either key.
params = dict(
    objective='reg:linear',
    booster='gbtree',
    eval_metric='rmse',
    nthread=8,
    silent=1,
    max_depth=4,
    subsample=0.9,
    eta=0.5,
    seed=2017,
    tree_method='exact',
)

In [16]:
# Number of boosting rounds to run.
num_rounds = 5000

# Datasets whose eval metric is logged after every round.
# NOTE: only the training matrix is monitored, so the logged rmse is the
# training error rather than a held-out score.
watchlist = [(dtrain, 'dtrain')]

# Fit the booster; verbose_eval=True prints the metric for each round.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=watchlist,
    verbose_eval=True,
)

[0]	dtrain-rmse:3.70864
[1]	dtrain-rmse:2.84765
[2]	dtrain-rmse:2.58728
[3]	dtrain-rmse:2.51743
[4]	dtrain-rmse:2.49957
[5]	dtrain-rmse:2.49394
[6]	dtrain-rmse:2.49214
[7]	dtrain-rmse:2.49085
[8]	dtrain-rmse:2.49017
[9]	dtrain-rmse:2.48972
[10]	dtrain-rmse:2.48883
[11]	dtrain-rmse:2.48774
[12]	dtrain-rmse:2.48753
[13]	dtrain-rmse:2.48662
[14]	dtrain-rmse:2.48584
[15]	dtrain-rmse:2.48503
[16]	dtrain-rmse:2.48483
[17]	dtrain-rmse:2.48396
[18]	dtrain-rmse:2.48351
[19]	dtrain-rmse:2.48277
[20]	dtrain-rmse:2.48176
[21]	dtrain-rmse:2.48141
[22]	dtrain-rmse:2.4805
[23]	dtrain-rmse:2.48033
[24]	dtrain-rmse:2.47934
[25]	dtrain-rmse:2.47874
[26]	dtrain-rmse:2.47807
[27]	dtrain-rmse:2.47729
[28]	dtrain-rmse:2.47639
[29]	dtrain-rmse:2.47592
[30]	dtrain-rmse:2.47556
[31]	dtrain-rmse:2.47531
[32]	dtrain-rmse:2.47496
[33]	dtrain-rmse:2.47478
[34]	dtrain-rmse:2.47388
[35]	dtrain-rmse:2.47335
[36]	dtrain-rmse:2.47307
[37]	dtrain-rmse:2.47288
[38]	dtrain-rmse:2.47212
[39]	dtrain-rmse:2.47158
[40]	dtrain

In [None]:
# Score the test matrix with the trained booster.
xgb_preds = bst.predict(dtest)

# Pair each test row ID with its predicted rating and write the submission
# file without the DataFrame index column.
submit = pd.DataFrame(dict(ID=test_ids, rating=xgb_preds))
submit.to_csv(
    '../submissions/submit_5k.csv',
    index=False,
)