In [8]:
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
#import xgboost as xgb
import random
import zipfile
import time
import shutil
from sklearn.metrics import log_loss

print('Read events...')
events = pd.read_csv("../data/events.csv", dtype={'device_id': np.str})
events['counts'] = events.groupby(['device_id'])['event_id'].transform('count')
events_small = events[['device_id', 'counts']].drop_duplicates('device_id', keep='first')
print(events)

Read events...
         event_id             device_id            timestamp  longitude  \
0               1     29182687948017175  2016-05-01 00:55:25     121.38   
1               2  -6401643145415154744  2016-05-01 00:54:12     103.65   
2               3  -4833982096941402721  2016-05-01 00:08:05     106.60   
3               4  -6815121365017318426  2016-05-01 00:06:40     104.27   
4               5  -5373797595892518570  2016-05-01 00:07:18     115.88   
5               6   1476664663289716375  2016-05-01 00:27:21       0.00   
6               7   5990807147117726237  2016-05-01 00:15:13     113.73   
7               8   1782450055857303792  2016-05-01 00:15:35     113.94   
8               9  -2073340001552902943  2016-05-01 00:15:33       0.00   
9              10  -8195816569128397698  2016-05-01 00:41:31     119.34   
10             11   8663743929678393765  2016-05-01 00:44:13     106.71   
11             12   8663743929678393765  2016-05-01 00:45:30     106.71   
12        

In [10]:
print(events_small)

                    device_id  counts
0           29182687948017175     256
1        -6401643145415154744      73
2        -4833982096941402721     248
3        -6815121365017318426      47
4        -5373797595892518570     525
5         1476664663289716375      82
6         5990807147117726237    1137
7         1782450055857303792      66
8        -2073340001552902943      22
9        -8195816569128397698      48
10        8663743929678393765     240
12        8640114685470534007     293
14        2504414082456157897    3804
15        9070651185984875886     124
16        -460438353776005388     200
17        2271670507584822423     296
18        9161556264947841692      98
19        6130108008013735751      26
20        8546251726684589793     156
21       -1663840927569383079      28
22        1186608308763918427   33426
23       -6672877056143385791     283
24       -4984460214693442980     116
25        4830593356246717684     105
26       -2668732172869127974     806
27       -77

In [13]:
def map_column(table, f):
    labels = sorted(table[f].unique())
    mappings = dict()
    for i in range(len(labels)):
        mappings[labels[i]] = i
    table = table.replace({f: mappings})
    return table

In [16]:
# Phone brand
print('Read brands...')
devices = pd.read_csv("../data/phone_brand_device_model.csv", dtype={'device_id': np.str})
devices.drop_duplicates('device_id', keep='first', inplace=True)
devices = map_column(devices, 'phone_brand')
devices = map_column(devices, 'device_model')
print(devices)

Read brands...
                   device_id  phone_brand  device_model
0       -8890648629457979026           51          1517
1        1277779817574759137           51           749
2        5137427614288105724           15           560
3        3669464369358936369            9          1503
4       -5019277647504317457           15           536
5        3238009352149731868           31           774
6       -3883532755183027260           51           752
7       -2972199645857147708           31           432
8       -5827952925479472594           51           758
9       -8262508968076336275           13           982
10       5840378295166286440           15           557
11      -5776341595519925628           51           750
12       3437705102632680210           15           557
13      -3568334676360016285           15           557
14        556978549708484782           51           749
15       5694497738268412307           51           753
16       6327116114284382479     

In [17]:
# Training set
print('Read training data...')
train_data = pd.read_csv("../data/gender_age_train.csv", dtype={'device_id': np.str})
train_data = map_column(train_data, 'group')
train_data = train_data.drop(['age'], axis=1)
train_data = train_data.drop(['gender'], axis=1)
train_data = pd.merge(train_data, devices, how='left', on='device_id', left_index=True)
train_data = pd.merge(train_data, events_small, how='left', on='device_id', left_index=True)
train_data.fillna(-1, inplace=True)
print(train_data)

Read training data...
                    device_id  group  phone_brand  device_model  counts
3251685  -8076087639492063270     10           51           749    -1.0
3251685  -2897161552818060146     10           51           749    -1.0
2479655  -8260683887967679142     10           51           749     1.0
3251685  -4938849341048082022      9           51          1524    -1.0
3251685    245133531816851882      9           51           753    -1.0
3251685  -1297074871525174196      1            7           908    -1.0
3251685    236877999787307864     10          117           396    -1.0
3251685  -8098239495777311881     10           51          1524    -1.0
3251685    176515041953473526     10           13          1246    -1.0
3251685   1596610250680140042      4           15           560    -1.0
3251685   9032155484127182494      9           31           776    -1.0
280933    7477216237379271436      4           31          1546     7.0
3251685   2478205222798310601      2      

In [18]:
# Test data
print('Read test...')
test_data = pd.read_csv("../data/gender_age_test.csv", dtype={'device_id': np.str})
test_data = pd.merge(test_data, devices, how='left', on='device_id', left_index=True)
test_data = pd.merge(test_data, events_small, how='left', on='device_id', left_index=True)
test_data.fillna(-1, inplace=True)

# Features
features = list(test_data.columns.values)
features.remove('device_id')
print(features)

Read test...
['phone_brand', 'device_model', 'counts']


In [None]:
def run_xgb(train, test, features, target, random_state=0):
    eta = 0.1
    max_depth = 3
    subsample = 0.7
    colsample_bytree = 0.7
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "multi:softprob",
        "num_class": 12,
        "booster" : "gbtree",
        "eval_metric": "mlogloss",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state,
    }
    num_boost_round = 500
    early_stopping_rounds = 50
    test_size = 0.3

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration)
    score = log_loss(y_valid.tolist(), check)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_iteration)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score
