In [1]:
%matplotlib inline
import pandas as pd 
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from time import time
import gc
from scipy.sparse import csr_matrix
import scipy.sparse as sp
import cPickle as pickle

In [2]:
with open('stored/train_vect_with_meta.pkl','rb') as fp:
    train_vect = pickle.load(fp)

In [3]:
train_vect = sp.csr_matrix(train_vect)

In [4]:
train = pd.read_csv('raw/train.csv')
target = train[['TripType','VisitNumber']].copy()
target = target.drop_duplicates(['VisitNumber'])

target.index = target['VisitNumber']
target = target.drop('VisitNumber',1)
target = target.reset_index(drop=True)

In [5]:
target

Unnamed: 0,TripType
0,999
1,30
2,26
3,8
4,8
5,35
6,41
7,21
8,6
9,42


In [26]:
from sklearn.cross_validation import train_test_split
y = target['TripType']
classes = sorted(set(y))
inc = 0

X_train, X_test, y_train_main, y_test_main = train_test_split(train_vect, y_label, test_size = 0.15, train_size = 0.85, stratify = y)

for cl in classes:
    print cl
    replace_labels = dict(zip(classes, np.zeros(len(classes), dtype = int)))
    replace_labels[cl] = 1
    y_train = y_train_main.replace(replace_labels)
    y_test = y_test_main.replace(replace_labels)
    
    to_libfm(X_train, y_train, cl, True)
    to_libfm(X_test, y_test, cl)
    

    inc+=1

3
4
5
6
7
8
9
12
14
15
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
999


In [11]:
def to_libfm(df, y, cl, train=False):
    lines = []
    for i in  range(df.shape[0]): 
        item = df[i]
        result = str(y.values[i])
        nums = ''
        for la in sorted(zip(item.indices,item.data)):
            nums += ' '+str(la[0])+':'+str(la[1])
        result+=nums
        lines.append(result)
    
    name = 'test'
    if train:
        name = 'train'
        
    with open('libfm_files/for_fm_'+name+'_'+str(cl)+'.txt','w') as f:
        f.write('\n'.join(lines))
    

In [27]:
# time to play with console
import subprocess
for cl in classes:
    print cl
    latent = 10
    iters = 100
    cmd = "./libfm/bin/libFM -task c -train libfm_files/for_fm_train_{cl}.txt -test libfm_files/for_fm_test_{cl}.txt -rlog libfm_files/log.txt -dim ’1,1,{latent_num}’ -iter {iter_num}  -out libfm_files/predictions_{cl}.txt"
    return_code = subprocess.call(cmd.format(cl=cl,
                                            latent_num = latent,
                                            iter_num=iters)
                                  , shell=True)  

3
4
5
6
7
8
9
12
14
15
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
999


In [28]:
result = pd.DataFrame()
for cl in classes:
    
    temp = pd.read_csv('libfm_files/predictions_{cl}.txt'.format(cl=cl), header=None)
    result['TripType_'+str(cl)] = pd.Series(temp[0].values)
    
result

Unnamed: 0,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,TripType_15,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,0.019505,3.147660e-02,3.215430e-02,0.031530,1.945270e-02,1.804270e-02,2.633690e-02,1.532150e-02,4.412710e-02,4.282610e-02,...,3.342420e-02,1.417030e-02,1.999390e-02,8.086750e-03,1.458700e-02,2.074190e-02,1.868100e-02,3.288880e-02,1.262240e-02,3.722930e-02
1,0.004600,1.179660e-01,4.906910e-02,0.000000,7.095360e-11,8.929280e-01,3.593530e-09,7.070620e-04,0.000000e+00,0.000000e+00,...,1.217590e-01,8.077960e-01,0.000000e+00,2.166490e-14,2.647640e-01,3.408720e-01,2.786330e-01,6.262670e-01,1.000000e+00,7.156340e-01
2,0.004801,2.260900e-04,3.401550e-05,0.000234,5.511060e-02,3.144710e-02,3.466350e-02,2.938890e-02,5.593420e-05,3.898710e-07,...,3.296640e-02,1.686420e-02,8.363260e-05,5.540310e-02,2.115310e-02,3.149680e-03,4.532870e-02,1.415060e-02,4.122660e-02,3.992430e-02
3,0.284393,1.379250e-01,3.284570e-02,0.019834,0.000000e+00,0.000000e+00,9.880510e-03,0.000000e+00,3.376210e-01,4.443390e-01,...,0.000000e+00,0.000000e+00,7.237120e-01,9.988720e-03,1.940350e-03,2.242200e-01,0.000000e+00,9.561150e-07,0.000000e+00,0.000000e+00
4,0.013525,1.965100e-02,3.363930e-02,0.027817,3.994380e-04,1.175250e-08,8.593100e-03,7.169910e-06,6.657540e-02,4.928550e-02,...,8.164580e-04,1.768950e-03,1.736610e-02,1.498250e-15,8.006050e-03,2.667590e-02,9.935480e-06,8.100330e-03,1.397840e-06,1.732470e-03
5,0.010291,5.913730e-09,1.124390e-06,0.000040,3.168890e-01,4.514960e-01,2.422740e-02,3.214570e-01,1.116480e-05,3.916060e-07,...,4.492400e-01,4.480320e-01,2.845680e-10,5.832070e-02,1.188410e-01,4.013920e-04,4.322950e-01,6.228960e-02,5.929000e-01,3.045100e-01
6,0.000050,8.079190e-11,4.402480e-14,0.000410,3.133890e-02,3.377260e-01,1.394080e-05,1.257110e-02,2.564060e-15,1.669800e-13,...,8.720610e-02,5.753080e-03,1.075920e-10,1.017650e-02,1.361060e-03,2.201420e-04,3.737310e-02,7.315660e-03,3.441080e-02,2.863110e-02
7,0.013624,2.215330e-03,7.007460e-04,0.136791,5.993860e-07,1.331390e-03,3.850580e-03,4.642800e-04,6.337570e-03,2.000920e-03,...,9.473790e-03,6.739480e-03,1.064640e-02,2.206920e-02,1.091870e-01,1.450480e-02,2.440560e-02,2.972270e-01,5.802440e-03,6.117030e-03
8,0.011839,7.762680e-03,9.479740e-03,0.021561,3.045250e-02,1.886720e-02,1.220330e-02,1.155930e-02,1.329040e-02,1.398320e-02,...,3.205530e-02,1.405030e-02,6.906820e-03,2.645680e-03,1.728170e-02,2.128190e-02,2.171240e-02,1.388440e-02,1.577560e-02,1.888130e-02
9,0.002217,3.890650e-02,4.967680e-02,0.040017,1.093170e-05,3.745840e-08,1.150980e-02,4.890810e-07,7.722240e-02,8.874030e-02,...,3.311730e-03,1.093930e-07,6.890220e-02,9.603430e-14,3.605680e-03,1.201300e-02,2.018580e-05,3.380780e-03,6.196370e-09,2.543600e-03


In [95]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    idea from this post:
    http://www.kaggle.com/c/emc-data-science/forums/t/2149/is-anyone-noticing-difference-betwen-validation-and-leaderboard-error/12209#post12209

    Parameters
    ----------
    y_true : array, shape = [n_samples]
    y_pred : array, shape = [n_samples, n_classes]

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    print y_pred[0]
    predictions /= predictions.sum(axis=1)[:, np.newaxis]
    print np.sum(y_pred[0])

    actual = np.zeros(y_pred.shape)
    rows = actual.shape[0]
    actual[np.arange(rows), y_true.astype(int)] = 1
    vsota = np.sum(actual * np.log(predictions))
    return -1.0 / rows * vsota

In [96]:
multiclass_log_loss(test_values.values,xgb_test)

[  4.74178814e-05   1.77983065e-06   4.51594009e-04   4.54888796e-05
   5.50692203e-03   5.20983785e-02   4.60251570e-01   6.26269468e-07
   4.31125081e-05   2.70635792e-04   1.87858641e-05   2.60534489e-05
   2.24761357e-06   3.19408136e-05   1.51174131e-03   8.13623046e-06
   3.09136370e-03   1.78801827e-03   5.01660616e-05   8.38870474e-05
   8.72518103e-06   4.19545868e-05   8.42839072e-05   8.28065731e-06
   1.25716979e-04   3.87539971e-04   4.30214732e-06   2.90078053e-04
   3.65396496e-04   1.80305506e-04   3.86268104e-04   6.30025286e-04
   1.61708071e-04   1.06857442e-04   8.11186444e-04   1.02377708e-04
   1.20039396e-04   4.70855057e-01]
1.0


4.4332731445171047

In [103]:
y_test.index

Int64Index([54233, 29942, 57458, 68910, 81324, 69466, 29547, 15484, 34025, 90761, 64151, 461, 72026, 19737, 56113, 38809, 53334, 57054, 8700, 81160, 31342, 70584, 24507, 10682, 60179, 14589, 26548, 3658, 53544, 33103, 68720, 10839, 39526, 14165, 63020, 41605, 62827, 48458, 95467, 22178, 44755, 19517, 60064, 11547, 94233, 73697, 35219, 88280, 69141, 29727, 79854, 31749, 31603, 17770, 68848, 13935, 37586, 16491, 27743, 43708, 8224, 81613, 17810, 30735, 79694, 75274, 10544, 63800, 64878, 2887, 11498, 61595, 28120, 66285, 93910, 36043, 4150, 13521, 66413, 59871, 67276, 1097, 19290, 42143, 212, 71591, 23713, 76262, 18275, 91172, 65817, 40941, 60368, 26208, 65781, 52876, 89419, 69322, 91870, 2122, ...], dtype='int64')

In [33]:
test_values = target.loc[y_test_main.index,'TripType']

In [34]:
# Replacement
replace_labels = dict(zip(classes, range(0,len(classes))))
test_values = test_values.replace(replace_labels)

In [72]:
multiclass_log_loss(test_values.values,xgb_test)

4.4332731445171047

In [58]:
y_test

54233    0
29942    0
57458    0
68910    0
81324    0
69466    0
29547    0
15484    0
34025    0
90761    0
64151    0
461      0
72026    0
19737    0
56113    0
...
778      0
59622    0
15093    0
37256    0
31578    0
73974    1
56155    0
5774     0
62059    0
26408    0
88130    0
9003     0
45384    0
86908    0
71713    0
Name: TripType, Length: 14353, dtype: int64

In [48]:
import xgboost as xgb
with open('stored/xgboost.pkl','rb') as fp:
    bst = pickle.load(fp)

In [54]:
xgb_test = bst.predict( xgb.DMatrix(train_vect[y_test_main.index] ))

In [55]:
xgb_test

array([[  4.74178814e-05,   1.77983065e-06,   4.51594009e-04, ...,
          1.02377708e-04,   1.20039396e-04,   4.70855057e-01],
       [  6.24987479e-06,   7.36643074e-07,   1.53590762e-03, ...,
          7.83966016e-03,   1.43481970e-01,   5.41590303e-02],
       [  2.55771124e-06,   1.89327602e-06,   6.14607241e-03, ...,
          7.91820465e-04,   3.93305294e-04,   4.11242433e-02],
       ..., 
       [  3.82730423e-06,   2.57679676e-06,   2.18181405e-04, ...,
          5.06745419e-04,   1.33445676e-04,   6.57905787e-02],
       [  3.76473804e-06,   3.88897706e-06,   1.11472979e-03, ...,
          1.49682416e-02,   6.25399277e-02,   1.27496377e-01],
       [  2.22981225e-06,   9.97142979e-06,   1.50654872e-03, ...,
          3.02925531e-04,   1.66538593e-04,   1.52148947e-01]], dtype=float32)

In [70]:
test_values.values

array([ 0, 32, 34, ...,  6, 16,  5])