# Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV 
import xgboost as xgb
import random
from sklearn import preprocessing
from itertools import groupby

# Read data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sampleSub = pd.read_csv('sample_submission.csv')

In [3]:
train.head()
from copy import deepcopy
ScanCountTrain = deepcopy(train.ScanCount)

train.DepartmentDescription = train.DepartmentDescription.fillna('Null')
test.DepartmentDescription = test.DepartmentDescription.fillna('Null')

train.FinelineNumber = train.FinelineNumber.fillna(-1)
test.FinelineNumber = test.FinelineNumber.fillna(-1)

In [4]:
trainLabels = [3, 4, 5, 6, 7, 8, 9, 12, 14, 15,
               18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
               28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
               38, 39, 40, 41, 42, 43, 44, 999]
print trainLabels, len(trainLabels)

[3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 999] 38


In [5]:
train.TripType = train.TripType.map(lambda x:trainLabels.index(x))
weekdays = list(set(train.Weekday))
train.Weekday = train.Weekday.map(lambda x:weekdays.index(x))

In [6]:
train.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,37,5,2,68113152929,-1,FINANCIAL SERVICES,1000
1,22,7,2,60538815980,1,SHOES,8931
2,22,7,2,7410811099,1,PERSONAL CARE,4504
3,18,8,2,2238403510,2,PAINT AND ACCESSORIES,3565
4,18,8,2,2006613744,2,PAINT AND ACCESSORIES,1017


## Create dict for visitnumber

In [7]:
trainVisit = train.VisitNumber.as_matrix()
rowNum = np.array(range(train.shape[0]))
visitNum = zip(rowNum, trainVisit)

from collections import OrderedDict
trainDict = OrderedDict()
for r, v in visitNum:
    if v in trainDict: 
        trainDict[v].append(r)
    else: trainDict[v] = [r]
        
testVisit = test.VisitNumber.as_matrix()
rowNum = np.array(range(test.shape[0]))
visitNum = zip(rowNum, testVisit)

testDict = OrderedDict()
for r, v in visitNum:
    if v in testDict: 
        testDict[v].append(r)
    else: testDict[v] = [r]

## default Logodds: distribution of trip types

In [8]:
trainTargetAgg = train.groupby(['VisitNumber'])['TripType'].mean()
trainTarget = trainTargetAgg.as_matrix()
trainTarget

array([37, 22, 18, ..., 31, 31,  5])

In [9]:
b = {}
for item in trainTarget:
    b[item] = b.get(item, 0) + 1

In [10]:
tripDist = np.array(b.values())/float(len(trainTarget))
tripDist

array([  3.80772206e-02,   3.61644752e-03,   4.80067730e-02,
         1.33474089e-02,   6.01208270e-02,   1.27108723e-01,
         9.89192466e-02,   2.81163116e-03,   4.18086418e-05,
         1.02222129e-02,   5.73823609e-03,   3.91956017e-03,
         6.65802621e-03,   6.69983486e-03,   9.69960491e-03,
         1.45285030e-03,   2.72696866e-02,   3.86520894e-02,
         5.25743671e-03,   8.20494596e-03,   5.14246295e-03,
         4.52578548e-03,   1.12987855e-02,   6.20858331e-03,
         2.07370864e-02,   1.37445910e-02,   7.51510337e-03,
         2.12178857e-02,   3.14087422e-02,   2.91406234e-02,
         3.04366913e-02,   1.03434580e-01,   6.40717436e-02,
         6.09360955e-03,   1.94201141e-02,   9.11428392e-03,
         1.24067145e-02,   8.82580429e-02])

In [24]:
scanCountTrain = train

In [25]:
scanCountTrain.ScanCount = map(lambda x: 1 if x > 0 else 0, scanCountTrain.ScanCount)

In [26]:
scanTripCount = scanCountTrain.groupby(['ScanCount', 'TripType']).size()

In [27]:
tripCount = scanCountTrain.groupby(['TripType']).size()
scanCount = scanCountTrain.groupby(['ScanCount']).size()

In [28]:
scanCount

ScanCount
0     15458
1    631596
dtype: int64

In [17]:
defaultLogodds = tripDist
# np.log(tripCount/len(train) * 1.0) - np.log(1.0 - tripCount/len(train) * 1.0)

In [18]:
scanCountLogodds = {}

In [19]:
from copy import deepcopy
scanCountLogodds[0] = deepcopy(defaultLogodds)
scanCountLogodds[1] = deepcopy(defaultLogodds)

In [20]:
categories = sorted(train['TripType'].unique())
for trip in scanTripCount[0].keys():
    PA = scanTripCount[0][trip]/float(scanCount[0])
    scanCountLogodds[0][categories.index(trip)] = PA
    # np.log(PA) - np.log(1.0 - PA)
scanCountLogodds[0] = pd.Series(scanCountLogodds[0])
scanCountLogodds[0].index = range(len(categories))

In [21]:
for trip in scanTripCount[1].keys():
    PA = scanTripCount[1][trip]/float(scanCount[1])
    scanCountLogodds[1][categories.index(trip)] = PA
    # np.log(PA) - np.log(1.0 - PA)
scanCountLogodds[1] = pd.Series(scanCountLogodds[1])
scanCountLogodds[1].index = range(len(categories))

In [22]:
# scanCountLogodds

In [23]:
# categories
# tripCount
departTripCount = train.groupby(['DepartmentDescription', 'TripType']).size()
departCount = train.groupby(['DepartmentDescription']).size()
departLogodds = {}
departTrain = sorted(train['DepartmentDescription'].unique())

In [51]:
# departLogodds
for dp in departTrain:
    departLogodds[dp] = deepcopy(defaultLogodds)
    for trip in departTripCount[dp].keys():
        PA = departTripCount[dp][trip]/float(departCount[dp])
        departLogodds[dp][categories.index(trip)] = PA
        # np.log(PA) - np.log(1.0 - PA)
    departLogodds[dp] = pd.Series(departLogodds[dp])
    departLogodds[dp].index = range(len(categories))

In [52]:
# categories
# tripCount
flTripCount = train.groupby(['FinelineNumber', 'TripType']).size()
flCount = train.groupby(['FinelineNumber']).size()
flLogodds = {}
flTrain = sorted(train['FinelineNumber'].unique())

for fl in flTrain:
    fl = int(fl)
    flLogodds[fl] = deepcopy(defaultLogodds)
    for trip in flTripCount[fl].keys():
        PA = flTripCount[fl][trip]/float(flCount[fl])
        flLogodds[fl][categories.index(trip)] = PA
        # np.log(PA) - np.log(1.0 - PA)
    flLogodds[fl] = pd.Series(flLogodds[fl])
    flLogodds[fl].index = range(len(categories))

In [53]:
weekdayTripCount = train.groupby(['Weekday', 'TripType']).size()
wdCount = train.groupby(['Weekday']).size()
wdLogodds = {}
wdTrain = sorted(train['Weekday'].unique())

for wd in wdTrain:
    wdLogodds[wd] = deepcopy(defaultLogodds)
    for trip in weekdayTripCount[wd].keys():
        PA = weekdayTripCount[wd][trip]/float(wdCount[wd])
        wdLogodds[wd][categories.index(trip)] = PA
        # np.log(PA) - np.log(1.0 - PA)
    wdLogodds[wd] = pd.Series(wdLogodds[wd])
    wdLogodds[wd].index = range(len(categories))

# Train target

In [54]:
# trainTargetAgg = train.groupby(['VisitNumber'])['TripType'].mean()
# trainTarget = trainTargetAgg.as_matrix()
# trainTarget

In [55]:
finelineReturn = list(set(train[train.ScanCount < 0].FinelineNumber).union(set(test[test.ScanCount < 0].FinelineNumber)))
finelineReturn = map(lambda x:int(x), finelineReturn)
finelineBuy = list(set(train[train.ScanCount > 0].FinelineNumber).union(set(test[test.ScanCount > 0].FinelineNumber)))
finelineBuy = map(lambda x:int(x), finelineBuy)
print len(finelineBuy), len(finelineReturn)

5333 3246


In [56]:
Weekdays = list(set(train.Weekday))

# Create feature matrix

In [58]:
Depart = list(set(train.DepartmentDescription).union(set(test.DepartmentDescription)))

length = len(Weekdays) + len(Depart) * 2 + len(finelineBuy) + len(trainLabels) * 5
print length

5668


In [59]:
trainFeat = np.zeros((len(set(train.VisitNumber)), length))
testFeat = np.zeros((len(set(test.VisitNumber)), length))

In [60]:
trainFeat.shape

(95674, 5668)

In [62]:
# 0 -> 6, 7 -> 44, 45 -> 82, 83 -> 120, 121 -> 158, 159 -> 196, 197 -> 265, 266 -> 334, 335 -> 5667

### Weekday

In [61]:
keys = trainDict.keys()
for i in range(len(trainDict)):
    key = keys[i]
    for val in trainDict[key]:
        idx = Weekdays.index(train.Weekday[val])
        trainFeat[i][idx] = 1

In [63]:
trainDict[key]

[647052, 647053]

In [64]:
keys = testDict.keys()
for i in range(len(testDict)):
    key = keys[i]
    for val in testDict[key]:
        idx = Weekdays.index(test.Weekday[val])
        testFeat[i][idx] = 1

# Add Logodds

In [66]:
## tripDist Logodds
keys = trainDict.keys()
for i in range(len(trainDict)):
    trainFeat[i][7 : 45] = tripDist
        
keys = testDict.keys()
for i in range(len(testDict)):
    testFeat[i][7 : 45] = tripDist

## weekday Logodds

In [None]:
keys = trainDict.keys()
for i in range(len(trainDict)):
    key = keys[i]
    for val in trainDict[key]:
        wd = train.Weekday[val]
        trainFeat[i][7 : 45] += wdLogodds[wd]
        
keys = testDict.keys()
for i in range(len(testDict)):
    key = keys[i]
    for val in testDict[key]:
        wd = test.Weekday[val]
        testFeat[i][7 : 45] += wdLogodds[wd]

## ScanCount Logodds

In [65]:
train.ScanCount = ScanCountTrain
train.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,37,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,22,7,Friday,60538815980,1,SHOES,8931
2,22,7,Friday,7410811099,1,PERSONAL CARE,4504
3,18,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,18,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


In [37]:
# trainDict

In [38]:
keys = trainDict.keys()
for i in range(len(trainDict)):
    key = keys[i]
    for val in trainDict[key]:
        scancount = train.ScanCount[val]
        if scancount > 0:
            flag = 1
        else:
            flag = 0
            
        trainFeat[i][7 : 45] += scanCountLogodds[flag]

In [39]:
keys = testDict.keys()
for i in range(len(testDict)):
    key = keys[i]
    for val in testDict[key]:
        scancount = test.ScanCount[val]
        if scancount > 0:
            flag = 1
        else:
            flag = 0
            
        testFeat[i][7 : 45] += scanCountLogodds[flag]

## Depart Logodds

In [40]:
keys = trainDict.keys()
for i in range(len(trainDict)):
    key = keys[i]
    for val in trainDict[key]:
        dp = train.DepartmentDescription[val]
        trainFeat[i][45 : 83] += departLogodds[dp]

In [41]:
newDP = sorted(test.DepartmentDescription.unique())

In [42]:
newDPCount = test.groupby(["DepartmentDescription"]).size()
newDPCount

DepartmentDescription
1-HR PHOTO                      394
ACCESSORIES                    1420
AUTOMOTIVE                     5459
BAKERY                         7165
BATH AND SHOWER                4665
BEAUTY                        15223
BEDDING                        2203
BOOKS AND MAGAZINES             909
BOYS WEAR                      3886
BRAS & SHAPEWEAR               2018
CAMERAS AND SUPPLIES            247
CANDY, TOBACCO, COOKIES       10310
CELEBRATION                    8635
COMM BREAD                    15688
CONCEPT STORES                   49
COOK AND DINE                  7788
DAIRY                         44824
DSD GROCERY                   68860
ELECTRONICS                    3361
FABRICS AND CRAFTS             5170
FINANCIAL SERVICES            10857
FROZEN FOODS                  21890
FURNITURE                       444
GIRLS WEAR, 4-6X  AND 7-14     4628
GROCERY DRY GOODS             72335
HARDWARE                       4885
HOME DECOR                     4069
HOME M

In [43]:
onlyNewDP = set(newDP + departTrain) - set(departTrain)
onlyNewDP

set()

In [44]:
keys = testDict.keys()
for i in range(len(testDict)):
    key = keys[i]
    for val in testDict[key]:
        dp = test.DepartmentDescription[val]
        testFeat[i][45 : 83] += departLogodds[dp]

## FL Logodds

In [45]:
keys = trainDict.keys()
for i in range(len(trainDict)):
    key = keys[i]
    for val in trainDict[key]:
        fl = train.FinelineNumber[val]
        trainFeat[i][83 : 121] += flLogodds[fl]

In [46]:
newFL = sorted(test.FinelineNumber.unique())
newFLCount = test.groupby(["FinelineNumber"]).size()
onlyNewFL = set(newFL + flTrain) - set(flTrain)
# onlyNewFL

In [47]:
for fl in onlyNewFL:
    flLogodds[fl] = deepcopy(defaultLogodds)
    flLogodds[fl].index = range(len(categories))

In [48]:
keys = testDict.keys()
for i in range(len(testDict)):
    key = keys[i]
    for val in testDict[key]:
        fl = test.FinelineNumber[val]
        testFeat[i][83 : 121] += flLogodds[fl]

# FinelineNumber for buying/Returning

In [49]:
keys = trainDict.keys()
for i in range(len(trainDict)):
    key = keys[i]
    for val in trainDict[key]:
        scancount = train.ScanCount[val]
        # finelineDepart = (train.DepartmentDescription[val], train.FinelineNumber[val])
        depart = train.DepartmentDescription[val]
        fineline = int(train.FinelineNumber[val])
        
        if scancount < 0:
            idxDepart = Depart.index(depart) + 121
            trainFeat[i][idxDepart] += 1
            # idxFineline = finelineReturn.index(fineline) + 7 + len(Depart)
        else:
            idxDepart = Depart.index(depart) + 190
            idxFineline = finelineBuy.index(fineline) + 259
            trainFeat[i][idxDepart] += 1
            trainFeat[i][idxFineline] += scancount

In [50]:
keys = testDict.keys()
for i in range(len(testDict)):
    key = keys[i]
    for val in testDict[key]:
        scancount = test.ScanCount[val]
        # finelineDepart = (test.DepartmentDescription[val], test.FinelineNumber[val])
        
        depart = test.DepartmentDescription[val]
        fineline = int(test.FinelineNumber[val])
        
        if scancount < 0:
            idxDepart = Depart.index(depart) + 121
            testFeat[i][idxDepart] += 1
            # idxFineline = finelineReturn.index(fineline) + 7 + len(Depart)
        else:
            idxDepart = Depart.index(depart) + 190
            idxFineline = finelineBuy.index(fineline) + 259
            testFeat[i][idxDepart] += 1
            testFeat[i][idxFineline] += scancount

In [51]:
sum(trainFeat[0])

5.2270737836409342

## Model 1: Train xgboost classifier

In [38]:
trainFeat.shape

(95674L, 11156L)

In [94]:
'''
cvIdx = random.sample(xrange(trainFeat.shape[0]), 10000)
cvX = trainFeat[cvIdx,]
cvY = trainTarget[cvIdx]
trainIdx = [x for x in xrange(trainFeat.shape[0]) if x not in cvIdx]
np.random.shuffle(trainIdx)
trainX = trainFeat[trainIdx,]
trainY = trainTarget[trainIdx,]
dtrain = xgb.DMatrix(trainX, label = trainY)
dtest = xgb.DMatrix(cvX, label = cvY)

param = {'max_depth': 6, 
         'eta': 0.25, 
         'silent': 1, 
         'objective': 'multi:softprob',
         'eval_metric': "mlogloss",
         'min_child_weight': 1,
         'subsample': 0.6,
         'colsample_bytree': 0.5,
         'num_class': 38
        }
        
watchlist  = [(dtrain,'train'), (dtest,'eval')]

numRound = 1000000

xgbModel = xgb.train(param, dtrain, numRound, watchlist, early_stopping_rounds = 30)

'''

'\ncvIdx = random.sample(xrange(trainFeat.shape[0]), 10000)\ncvX = trainFeat[cvIdx,]\ncvY = trainTarget[cvIdx]\ntrainIdx = [x for x in xrange(trainFeat.shape[0]) if x not in cvIdx]\nnp.random.shuffle(trainIdx)\ntrainX = trainFeat[trainIdx,]\ntrainY = trainTarget[trainIdx,]\ndtrain = xgb.DMatrix(trainX, label = trainY)\ndtest = xgb.DMatrix(cvX, label = cvY)\n\nparam = {\'max_depth\': 6, \n         \'eta\': 0.25, \n         \'silent\': 1, \n         \'objective\': \'multi:softprob\',\n         \'eval_metric\': "mlogloss",\n         \'min_child_weight\': 1,\n         \'subsample\': 0.6,\n         \'colsample_bytree\': 0.5,\n         \'num_class\': 38\n        }\n        \nwatchlist  = [(dtrain,\'train\'), (dtest,\'eval\')]\n\nnumRound = 1000000\n\nxgbModel = xgb.train(param, dtrain, numRound, watchlist, early_stopping_rounds = 30)\n\n'

In [95]:
'''
testPred = model.predict(xgb.DMatrix(testFeat))
mySampleSub = SampleSub
mySampleSub.ix[:,1:39] = testPred
mySampleSub.VisitNumber = map(lambda x: int(x), mySampleSub.VisitNumber)
mySampleSub.to_csv("xgbSub.csv", index = False)
'''

'\ntestPred = model.predict(xgb.DMatrix(testFeat))\nmySampleSub = SampleSub\nmySampleSub.ix[:,1:39] = testPred\nmySampleSub.VisitNumber = map(lambda x: int(x), mySampleSub.VisitNumber)\nmySampleSub.to_csv("xgbSub.csv", index = False)\n'

## Model 2: Lasagne Neural Networks

In [3]:
from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from lasagne.layers import DropoutLayer
from lasagne.nonlinearities import softmax
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet
from nolearn.lasagne import TrainSplit

In [53]:
layers0 = [('input', InputLayer),
           ('dense', DenseLayer),
           ('dropout', DropoutLayer),
           ('output', DenseLayer)]

numClasses = 38
numFeatures = trainFeat.shape[1]

In [63]:
# 200, 0.5, 0.012, 0.7, 0.1/862
# 92       0.48677       0.65550      0.74258      0.77797  33.23s
# 200, 0.5, 0.015, 0.7, 0.1/862
# 78       0.47839       0.65376      0.73175      0.77477  40.53s


testPred = []

for x in xrange(1):
    print "--------" + str(x) + "---------"
    random.seed(6)
    net0 = NeuralNet(layers = layers0,
               input_shape = (None, numFeatures),
               dense_num_units = 256,
               dropout_p = 0.3,
               output_num_units = numClasses,
               output_nonlinearity = softmax,
               
               update = nesterov_momentum,
               update_learning_rate = 0.01,
               update_momentum = 0.75,
               
               train_split = TrainSplit(0.1),
               verbose = 1,
               max_epochs = 63)
    
    trainTarget = trainTarget.astype(np.int32)
    net0.fit(trainFeat, trainTarget)
    
    testPred.append(net0.predict_proba(testFeat))

--------0---------
# Neural Network with 1441574 learnable parameters

## Layer information

  #  name       size
---  -------  ------
  0  input      5592
  1  dense       256
  2  dropout     256
  3  output       38

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  ------
      1       [36m2.17978[0m       [32m1.51802[0m      1.43594      0.61405  33.15s
      2       [36m1.40150[0m       [32m1.23487[0m      1.13494      0.65651  19.64s
      3       [36m1.20957[0m       [32m1.11737[0m      1.08251      0.67193  20.12s
      4       [36m1.10959[0m       [32m1.04809[0m      1.05868      0.68477  19.67s
      5       [36m1.04749[0m       [32m1.00231[0m      1.04508      0.69091  19.36s
      6       [36m1.00255[0m       [32m0.96602[0m      1.03781      0.69823  19.16s
      7       [36m0.96500[0m       [32m0.94182[0m      1.02462      0.70384  20.25s
      8       [36m0.93359

In [55]:
testPredAvg = sum(testPred)/5.0

In [60]:
testPredAvg
testPred[0]

array([[  6.32291812e-05,   2.11360421e-04,   1.17973341e-04, ...,
          9.86548891e-04,   1.06986887e-04,   4.13904760e-04],
       [  4.65699936e-04,   5.60867046e-04,   3.03647903e-04, ...,
          6.64058002e-04,   1.08562736e-05,   6.88549100e-01],
       [  5.82690140e-05,   3.12188332e-05,   2.97532317e-05, ...,
          4.04316308e-07,   1.41511625e-08,   9.85141032e-01],
       ..., 
       [  5.78649879e-05,   6.57875608e-05,   2.58685635e-05, ...,
          2.54678114e-06,   2.28945880e-08,   4.71636761e-03],
       [  4.89935357e-09,   2.12032193e-08,   4.24912144e-07, ...,
          2.61271040e-03,   3.41455435e-03,   7.22595079e-07],
       [  1.92729637e-07,   6.65004886e-10,   4.19563460e-09, ...,
          5.12321960e-06,   4.19637742e-06,   8.42255474e-06]])

In [58]:
sum(testPredAvg[0,])

1.0

In [64]:
mySampleSub = sampleSub
mySampleSub.ix[:,1:39] = testPred[0]
mySampleSub.VisitNumber = map(lambda x: int(x), mySampleSub.VisitNumber)
mySampleSub.to_csv("nnSub.csv", index = False)