In [2]:
%matplotlib inline
import pandas as pd 
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from time import time
import gc
from scipy.sparse import csr_matrix
import scipy.sparse as sp
import cPickle as pickle
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold, train_test_split

In [2]:
train = pd.read_csv('raw/train.csv')
test = pd.read_csv('raw/test.csv')

For this competition, you are tasked with categorizing shopping trip types based on the items that customers purchased. To give a few hypothetical examples of trip types: a customer may make a small daily dinner trip, a weekly large grocery trip, a trip to buy gifts for an upcoming holiday, or a seasonal trip to buy clothes.

Walmart has categorized the trips contained in this data into 38 distinct types using a proprietary method applied to an extended set of data. You are challenged to recreate this categorization/clustering with a more limited set of features. This could provide new and more robust ways to categorize trips.

The training set (train.csv) contains a large number of customer visits with the TripType included. You must predict the TripType for each customer visit in the test set (test.csv). Each visit may only have one TripType. You will not be provided with more information than what is given in the data (e.g. what the TripTypes represent or more product information).

The test set file is encrypted. You must complete this brief survey to receive the password.

## Data fields

* TripType - a categorical id representing the type of shopping trip the customer made. This is the ground truth that you are predicting. TripType_999 is an "other" category.
* VisitNumber - an id corresponding to a single trip by a single customer
* Weekday - the weekday of the trip
* Upc - the UPC number of the product purchased
* ScanCount - the number of the given item that was purchased. A negative value indicates a product return.
* DepartmentDescription - a high-level description of the item's department
* FinelineNumber - a more refined category for each of the products, created by Walmart

In [None]:
train.head(5)

In [None]:
train[train['TripType'] == 14]
train[train['Upc'].notnull()]['Upc'].apply(int).apply(str).apply(len).value_counts()

In [None]:
train[train['TripType'] == 14] #['Upc'].apply(int).apply(str)

In [None]:
len(train[train['Upc'].notnull()]['Upc'].apply(int).apply(str).apply(lambda x: ''.join(x[0:5])).value_counts())
    

In [None]:
# check data consistency
print len(train[['VisitNumber','Weekday']].groupby(['VisitNumber','Weekday'])) == len(train[['VisitNumber']].groupby(['VisitNumber']))
print len(test[['VisitNumber','Weekday']].groupby(['VisitNumber','Weekday'])) == len(test[['VisitNumber']].groupby(['VisitNumber']))

In [None]:
train['TripType'].hist(bins=100)

In [None]:
test['ScanCount'].hist(bins=1000)

In [3]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    idea from this post:
    http://www.kaggle.com/c/emc-data-science/forums/t/2149/is-anyone-noticing-difference-betwen-validation-and-leaderboard-error/12209#post12209

    Parameters
    ----------
    y_true : array, shape = [n_samples]
    y_pred : array, shape = [n_samples, n_classes]

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    rows = actual.shape[0]
    actual[np.arange(rows), y_true.astype(int)] = 1
    vsota = np.sum(actual * np.log(predictions))
    return -1.0 / rows * vsota
    
        

In [None]:
train['TripType'].value_counts(dropna=False)

In [None]:
visits_count = train[['TripType','VisitNumber']].groupby('VisitNumber').agg(len)
visits_count = visits_count.reset_index(drop=False)
visits_count.columns = ['VisitNumber','VisitCounts']
visits_count = pd.merge(visits_count, train[['TripType','VisitNumber']],left_on = 'VisitNumber', 
                        right_on = 'VisitNumber',how = 'inner').drop_duplicates()
visits_count = visits_count.reset_index(drop=True)
visits_count

In [None]:
visits_count = visits_count[visits_count['TripType']!=999]
plt.scatter(visits_count['VisitCounts'],visits_count['TripType'])

In [None]:
for name in test.keys():
    print name
    print test[name].value_counts(dropna=False)
    print train[name].value_counts(dropna=False)

In [None]:
valuezz = pd.Series(train['Upc'].value_counts(dropna=False))
print len(valuezz[valuezz > 10])
valuezz = pd.Series(test['Upc'].value_counts(dropna=False))
print len(valuezz[valuezz > 10])


In [None]:
for name in test.keys():
    print name
    print len(test[name].value_counts(dropna=False).keys())
    print len(train[name].value_counts(dropna=False).keys())

In [None]:
train.shape

In [None]:
df = train[['VisitNumber','ScanCount']].groupby('VisitNumber').agg(np.sum)
np.log(df[df>=0]+1).hist(bins=50)

# Submission

In [4]:
# create DataFrame
train_clear = train.copy()
test_clear = test.copy()
target = train_clear[['TripType','VisitNumber']].copy()
train_clear = train_clear.drop('TripType',1)

# create target
target = target.drop_duplicates(['VisitNumber'])

target.index = target['VisitNumber']
target = target.drop('VisitNumber',1)
print len(target)

# Fill na
train_clear['Upc'].fillna(0,inplace=True)
train_clear['DepartmentDescription'].fillna('NoValueDept',inplace=True)
train_clear['FinelineNumber'].fillna(0,inplace=True)

test_clear['Upc'].fillna(0,inplace=True)
test_clear['DepartmentDescription'].fillna('NoValueDept',inplace=True)
test_clear['FinelineNumber'].fillna(0,inplace=True)


# set types
train_clear['Upc'] = train_clear['Upc'].apply(str)
train_clear['FinelineNumber'] = train_clear['FinelineNumber'].apply(int).apply(str)

test_clear['Upc'] = test_clear['Upc'].apply(str)
test_clear['FinelineNumber'] = test_clear['FinelineNumber'].apply(int).apply(str)

95674


In [5]:
# create upc len features
# create upc hash features
# create Fineline hash features
train_clear['Upc_Len'] = train_clear['Upc'].apply(len).apply(str)
train_clear['Upc_Hash'] = train_clear['Upc'].apply(lambda x: ''.join(x[0:5]))
train_clear['Upc_Hash_2'] = train_clear['Upc'].apply(lambda x: ''.join(x[0:2]))
train_clear['Upc_Hash_1'] = train_clear['Upc'].apply(lambda x: ''.join(x[0:1]))
train_clear['FineLine_div_100'] = train_clear['FinelineNumber'].apply(int).apply(lambda x: x // 100 ).apply(str)
train_clear['FineLine_mod_100'] = train_clear['FinelineNumber'].apply(int).apply(lambda x: x % 100 ).apply(str)
train_clear['FineLine_first_2'] = train_clear['FinelineNumber'].apply(lambda x: ''.join(x[0:2]))
train_clear['FineLine_Len'] = train_clear['FinelineNumber'].apply(len).apply(str)

test_clear['Upc_Len'] = test_clear['Upc'].apply(len).apply(str)
test_clear['Upc_Hash'] = test_clear['Upc'].apply(lambda x: ''.join(x[0:5]))
test_clear['Upc_Hash_2'] = test_clear['Upc'].apply(lambda x: ''.join(x[0:2]))
test_clear['Upc_Hash_1'] = test_clear['Upc'].apply(lambda x: ''.join(x[0:1]))
test_clear['FineLine_div_100'] = test_clear['FinelineNumber'].apply(int).apply(lambda x: x // 100 ).apply(str)
test_clear['FineLine_mod_100'] = test_clear['FinelineNumber'].apply(int).apply(lambda x: x % 100 ).apply(str)
test_clear['FineLine_first_2'] = test_clear['FinelineNumber'].apply(lambda x: ''.join(x[0:2]))
test_clear['FineLine_Len'] = test_clear['FinelineNumber'].apply(len).apply(str)

In [6]:
# keys intersection
train_upc = pd.Series(train_clear['Upc'].value_counts(dropna=False))
test_upc = pd.Series(test_clear['Upc'].value_counts(dropna=False))

upc_set = set(train_upc[train_upc > 100].keys())& set(test_upc[test_upc > 100].keys())
print len(upc_set)

upc_set_low = set(set(train_upc[train_upc <= 100].keys())& set(test_upc[test_upc <= 100].keys()))
print len(upc_set_low)

dept_set = set(train_clear['DepartmentDescription']) & set(test_clear['DepartmentDescription'])
print len(dept_set)

train_fine = pd.Series(train_clear['FinelineNumber'].value_counts(dropna=False))
test_fine = pd.Series(test_clear['FinelineNumber'].value_counts(dropna=False))
fine_set = set(train_fine.keys()) & set(test_fine.keys())
print len(fine_set)

upc_len = set(train_clear['Upc_Len']) & set(test_clear['Upc_Len'])
print len(upc_len)

fineline_len = set(train_clear['FineLine_Len']) & set(test_clear['FineLine_Len'])
print len(fineline_len)

upc_hash2 = set(train_clear['Upc_Hash_2']) & set(test_clear['Upc_Hash_2'])
print len(upc_hash2)

upc_hash1 = set(train_clear['Upc_Hash_1']) & set(test_clear['Upc_Hash_1'])
print len(upc_hash1)

train_upc_hash = pd.Series(train_clear['Upc_Hash'].value_counts(dropna=False))
test_upc_hash  = pd.Series(test_clear['Upc_Hash'].value_counts(dropna=False))

upc_hash = set(train_upc_hash[train_upc_hash > 10].keys())& set(test_upc_hash[test_upc_hash > 10].keys())
print len(upc_hash)

upc_hash_rare = set(train_upc_hash[train_upc_hash <= 10].keys())| set(test_upc_hash[test_upc_hash <= 10].keys())
print len(upc_hash_rare)

fine_div_100 = set(train_clear['FineLine_div_100']) & set(test_clear['FineLine_div_100'])
print len(fine_div_100)

fine_mod_100 = set(train_clear['FineLine_mod_100']) & set(test_clear['FineLine_mod_100'])
print len(fine_mod_100)

fine_first_2 = set(train_clear['FineLine_first_2']) & set(test_clear['FineLine_first_2'])
print len(fine_first_2)

len(fine_set)

492
68
5046
10
4
91
10
2986
100
100
100


5046

In [None]:
train_index = train_clear[train_clear['Upc_Hash'].isin(upc_hash_rare)].index
train_clear.loc[train_index,'Upc_Hash'] = np.repeat('LowValue',len(train_index))

test_index = test_clear[test_clear['Upc_Hash'].isin(upc_hash_rare)].index
test_clear.loc[test_index,'Upc_Hash'] = np.repeat('LowValue',len(test_index))

upc_hash= set(train_clear['Upc_Hash'].value_counts().keys())& set(test_clear['Upc_Hash'].value_counts().keys())
print len(upc_hash)

In [None]:
train_index = train_clear[train_clear['Upc'].isin(upc_set_low)].index
train_clear.loc[train_index,'Upc'] = np.repeat('LowValue',len(train_index))

test_index = test_clear[test_clear['Upc'].isin(upc_set_low)].index
test_clear.loc[test_index,'Upc'] = np.repeat('LowValue',len(test_index))

upc_set = set(train_clear['Upc'].value_counts().keys())& set(test_clear['Upc'].value_counts().keys())
print len(upc_set)

In [None]:
# NULL COLUMNS
# Upc - 4130
# DepartmentDescription - 1361
# FinelineNumber - 4129
train_clear

In [7]:
def log_scaler(num):
    if num >=0:
        return np.log(1+num)
    else:
        return -np.log(1-num)

In [8]:
def vectorize_df(df, dicts):
    print time()
    result = None
    inc = 0
    length = {name: [] for name in dicts.keys()}
    total_rows = []
    categorial_info = []
    sum_of_sales = []
    for name, group in df.groupby(['VisitNumber','Weekday']):
        field_dicts = []
        total_rows.append(len(group))
        
        sum_of_sales.append(log_scaler(np.sum(group['ScanCount'])))
        
        for key, value in dicts.iteritems():
            count_dict = dict.fromkeys(value, 0)
            sliced = group[['ScanCount',key]].groupby(key).agg(sum).to_dict()['ScanCount']
            length[key].append(len(sliced))
            
            for key_sl, val_sl in sliced.iteritems():
                if key_sl in count_dict.keys():# and val_sl > 0: # val_sl
                    count_dict[key_sl]+= val_sl
            
            for key in count_dict.keys():
                count_dict[key]= log_scaler(count_dict[key])
            
            field_dicts.extend(count_dict.values())
        
        categorial_info.append([name[0],name[1]])
        row = field_dicts
        
        if result is not None:
            result = sp.vstack([result,row], dtype=np.float64)
        else:
            result = csr_matrix(row)
        
        field_dicts = []
    

        # print index
        inc +=1
        if inc % 3000 == 0:
            print inc
            print time()
            gc.collect()
    
    # create names
    names = ['VisitNumber','Weekday']
        
    meta_info = pd.DataFrame(categorial_info, columns = names)
    #meta_info.columns = names 
    
    for key, value in dicts.iteritems():
        meta_info[key+'_length'] = pd.Series(length[key])
    
    meta_info['Total'] = pd.Series(total_rows)
    meta_info['Total_sales'] = pd.Series(sum_of_sales)
    gc.collect()
    
    return meta_info, result  

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer

def tfidf_vect(df, tfidf, fit_flag=False):
    
    if fit_flag:
        X_tfidf = tfidf.fit_transform(df)
    else:
        X_tfidf = tfidf.transform(df)
      
    
    return X_tfidf

tfidf = TfidfTransformer(norm='l2')

In [10]:
def add_negative(df):
    unique_visits = set(df['VisitNumber'])
    result = []
    inc = 0
    for item in unique_visits:
        split = df[df['VisitNumber'] == item]
        end_indicator = 1
        for i in split.index:
            if split.loc[i,'ScanCount'] < 0:
                result.append(1)
                break
            else:
                end_indicator += 1
                
        if end_indicator > len(split.index):
            result.append(0)
        
        inc +=1
        if inc % 10000 == 0:
            print inc
    gc.collect()
        
    return pd.Series(result)

In [11]:
all_dicts = {'DepartmentDescription': dept_set, 'Upc_Len': upc_len,  
             'FineLine_div_100': fine_div_100,'FineLine_mod_100':fine_mod_100,
             'Upc_Hash': upc_hash, 'Upc': upc_set, 'FinelineNumber': fine_set,
            'FineLine_Len':fineline_len} 
#Upc_Hash_2': upc_hash2,'Upc_Hash_1':upc_hash1}) # 

In [12]:
common_keys = ['VisitNumber','Weekday', 'Total']
for key in all_dicts.keys():
        common_keys.append(key+'_length')
common_keys

['VisitNumber',
 'Weekday',
 'Total',
 'FineLine_Len_length',
 'Upc_Hash_length',
 'FineLine_div_100_length',
 'FineLine_mod_100_length',
 'FinelineNumber_length',
 'DepartmentDescription_length',
 'Upc_length',
 'Upc_Len_length']

In [26]:
# train cycle
meta_train_vect, train_vect = vectorize_df(train_clear, all_dicts)
with open('stored/train_vect.pkl','wb') as fp:
    pickle.dump(train_vect, fp, pickle.HIGHEST_PROTOCOL)
with open('stored/train_vect_meta.pkl','wb') as fp:
    pickle.dump(meta_train_vect, fp, pickle.HIGHEST_PROTOCOL)
train_tf = tfidf_vect(train_vect, tfidf, True)
gc.collect()
train_vect = sp.hstack([train_tf,train_vect])
gc.collect()
train_tf = []
meta_train_vect['NegativeIndicator'] = add_negative(train_clear[['VisitNumber','ScanCount']])
gc.collect()
train_days = pd.get_dummies(meta_train_vect['Weekday']) # DELETE VISIT NUMBER!
gc.collect()
meta_train_vect = meta_train_vect.drop(['Weekday','VisitNumber'],1)
meta_train_vect = pd.concat([meta_train_vect, train_days],axis=1)
meta_train_vect = csr_matrix(meta_train_vect.values)
train_vect = sp.hstack([meta_train_vect,train_vect])
with open('stored/train_vect_with_meta.pkl','wb') as fp:
    pickle.dump(train_vect, fp, pickle.HIGHEST_PROTOCOL)
gc.collect()

NameError: name 'meta_train_vect' is not defined

In [None]:
meta_train_vect

In [None]:
train_vect = csr_matrix(train_vect)

In [25]:
with open('stored/train_vect.pkl','rb') as fp:
    train_vect = pickle.load(fp)

In [None]:
train_vect.shape

In [None]:
with open('stored/tfidf.pkl','wb') as fp:
    pickle.dump(tfidf, fp)

In [None]:
gc.collect()

In [27]:
# test cycle
#meta_test_vect, test_vect = vectorize_df(test_clear, all_dicts)
with open('stored/test_vect.pkl','wb') as fp:
    pickle.dump(test_vect, fp, pickle.HIGHEST_PROTOCOL)
with open('stored/test_vect_meta.pkl','wb') as fp:
    pickle.dump(meta_test_vect, fp, pickle.HIGHEST_PROTOCOL)
test_tf = tfidf_vect(test_vect, tfidf)
gc.collect()
test_vect = sp.hstack([test_tf,test_vect])
gc.collect()
test_tf = []
meta_test_vect['NegativeIndicator'] = add_negative(test_clear[['VisitNumber','ScanCount']])
gc.collect()
test_days = pd.get_dummies(meta_test_vect['Weekday'])
gc.collect()
meta_test_vect = meta_test_vect.drop(['Weekday','VisitNumber'],1)
meta_test_vect = pd.concat([meta_test_vect, test_days],axis=1)
meta_test_vect = csr_matrix(meta_test_vect.values) # add only VALUES
test_vect = sp.hstack([meta_test_vect,test_vect])
with open('stored/test_vect_with_meta.pkl','wb') as fp:
    pickle.dump(test_vect, fp, pickle.HIGHEST_PROTOCOL)
gc.collect()

10000
20000
30000
40000
50000
60000
70000
80000
90000


14

In [None]:
# here add variativity of purchases:
## single type only
## 2 type
## 3 type
## Max amount of sales
# Hierarchy approach from top to bottom
# groups of depts with conjunction1 up to 4th grade
# merge small categories & check amazon competition
# value of total sales
# closiness class prediction -> svd + KNN :) 
# play with 2Dec lection from dyakonov about categorial variables
# Use whole dataset in xgboost
# levels in some features 
# add fineline as numbers
# fineline mod 10
# 2 fisrt numbers of upc,3,4
# sorted by time - first buy
# rare items

# Numbers length -> Work
# 14 - Cannot deside and always back to buy more?
# USE LOGARITHM! and for NEGATIVE also(1+x)
# Small fractions
# TOTAL SUM OF ITEMS :)

In [14]:
classes = sorted(set(target['TripType']))
classes

[3,
 4,
 5,
 6,
 7,
 8,
 9,
 12,
 14,
 15,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 999]

In [15]:
target['TripType'].value_counts()

8      12161
39      9896
9       9464
999     8444
40      6130
7       5752
5       4593
25      3698
3       3643
36      3005
38      2912
37      2788
24      2609
35      2030
32      1984
42      1858
33      1315
6       1277
44      1187
30      1081
15       978
22       928
43       872
27       785
34       719
21       641
20       637
31       594
41       583
18       549
26       503
28       492
29       433
19       375
4        346
12       269
23       139
14         4
dtype: int64

In [16]:
# Replacement
replace_labels = dict(zip(classes, range(0,len(classes))))
target['TripType'] = target['TripType'].replace(replace_labels)

In [20]:
param = {}

# use softmax multi-class classification
param['objective'] = 'multi:softprob'
param['eval_metric'] = 'mlogloss'

# scale weight of positive examples
param['eta'] = 0.02
param['max_depth'] = 4
param['silent'] = 0
param['num_class'] = len(classes)
param['subsample'] = 0.5 # 0.5
param['colsample_bytree'] = 0.5 # 0.5

#0.1  < 500
#0.01 < 5000
def eta_f(i, num_boost_rounds):
    if i < 300:
        return 0.01
    if i < 5000:
        return 0.01 

num_round = 4000
y = target['TripType']
skf = StratifiedKFold(y.values, n_folds=4)
#print len(set(target.index)|set(train_vect.index))
for train_index, test_index in skf:
    X_train, X_test, y_train, y_test = train_test_split(train_vect, y, test_size = 0.15, train_size = 0.85, stratify = y)
    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_test = xgb.DMatrix(X_test, label=y_test)
    #xg_train = xgb.DMatrix(train_vect[train_index], label=y.values[train_index])
    #xg_test = xgb.DMatrix(train_vect[test_index], label=y.values[test_index]) 
    gc.collect()
    watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
    bst = xgb.train(param, xg_train, num_round, watchlist)#, learning_rates = eta_f)
    break
    
# get prediction

[0]	train-mlogloss:3.488158	test-mlogloss:3.490518
[1]	train-mlogloss:3.358528	test-mlogloss:3.363027
[2]	train-mlogloss:3.252251	test-mlogloss:3.257989
[3]	train-mlogloss:3.157242	test-mlogloss:3.164095
[4]	train-mlogloss:3.067469	test-mlogloss:3.075339
[5]	train-mlogloss:2.991246	test-mlogloss:3.000201
[6]	train-mlogloss:2.921841	test-mlogloss:2.932157
[7]	train-mlogloss:2.855986	test-mlogloss:2.867268
[8]	train-mlogloss:2.795435	test-mlogloss:2.807869
[9]	train-mlogloss:2.737471	test-mlogloss:2.751219
[10]	train-mlogloss:2.684099	test-mlogloss:2.698518
[11]	train-mlogloss:2.635593	test-mlogloss:2.650405
[12]	train-mlogloss:2.588908	test-mlogloss:2.604569
[13]	train-mlogloss:2.542938	test-mlogloss:2.559266
[14]	train-mlogloss:2.500414	test-mlogloss:2.517498
[15]	train-mlogloss:2.459640	test-mlogloss:2.477278
[16]	train-mlogloss:2.421522	test-mlogloss:2.439816
[17]	train-mlogloss:2.386061	test-mlogloss:2.405148
[18]	train-mlogloss:2.350486	test-mlogloss:2.370234
[19]	train-mlogloss:2.

In [None]:
saved_bst = bst

In [None]:
import cPickle as pickle
import xgboost as xgb
gc.collect()
with open('stored/xgboost.pkl','rb') as fp:
    bst = pickle.load(fp)

In [None]:
import cPickle as pickle
with open('stored/xgboost.pkl','wb') as fp:
    pickle.dump(saved_bst, fp)

In [None]:
with open('stored/target.pkl','wb') as fp:
    pickle.dump(y[ts_in], fp)

In [None]:
with open('stored/test_index.pkl','wb') as fp:
    pickle.dump(ts_in, fp)

In [28]:
# 0.620606 5000, 4, 0.01 eta
# 0.620502 7000, 4, 0.01 eta 0.61477 leaderboard
pred = bst.predict( xgb.DMatrix(test_vect ))

In [29]:
pred.shape

(95674, 38)

# Draw importance

In [None]:
%matplotlib inline
import matplotlib as mpl
mpl.use("Agg") #Needed to save figures

In [None]:
import operator
def create_feature_map(features):
    outfile = open('stored/xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

In [None]:
create_feature_map(train_vect.keys())

In [None]:
importance = bst.get_fscore(fmap='stored/xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))

In [None]:
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

featp = df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
fig_featp = featp.get_figure()
fig_featp.savefig('stored/feature_importance_xgb.png', bbox_inches='tight', pad_inches=1)

In [None]:
df.sort(['fscore'], ascending=False)

# Submission

In [30]:
sample_submission = pd.read_csv('raw/sample_submission.csv')

In [31]:
columns = sample_submission.columns
shape = sample_submission.shape

In [32]:
submission = pd.DataFrame(sample_submission['VisitNumber'])
values = pd.DataFrame(pred)
submission = pd.concat([submission, values], axis=1)
submission.columns = columns
submission

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,6.999092e-08,3.824025e-08,0.000009,0.000003,0.000105,0.006237,0.000585,2.742181e-06,1.390970e-06,...,0.000026,0.000092,0.124688,0.001978,0.000007,7.780486e-04,0.002475,0.000181,0.000020,0.000169
1,2,1.326306e-05,4.105256e-05,0.001150,0.000391,0.059379,0.035925,0.069978,4.851886e-03,6.408397e-05,...,0.003358,0.002041,0.025921,0.472007,0.000820,1.122571e-03,0.013054,0.014937,0.000714,0.059852
2,3,2.434523e-08,1.648730e-08,0.000021,0.000002,0.000055,0.000155,0.000224,1.561130e-07,5.516747e-07,...,0.000009,0.000007,0.000007,0.000010,0.000002,2.521432e-06,0.000019,0.000002,0.000002,0.999380
3,4,3.002121e-07,5.623010e-07,0.000192,0.000085,0.001538,0.015870,0.964210,2.048316e-06,7.307193e-06,...,0.000092,0.000065,0.000084,0.000123,0.000010,2.099848e-05,0.000296,0.000115,0.000031,0.009592
4,6,2.434523e-08,1.648730e-08,0.000021,0.000002,0.000055,0.000155,0.000224,1.561130e-07,5.516747e-07,...,0.000009,0.000007,0.000007,0.000010,0.000002,2.521432e-06,0.000019,0.000002,0.000002,0.999380
5,13,1.099991e-06,2.042432e-06,0.000630,0.000479,0.587704,0.314346,0.064179,5.188235e-05,1.154747e-05,...,0.000314,0.000408,0.001088,0.000923,0.000063,3.227876e-05,0.000142,0.000199,0.000032,0.019154
6,14,6.765831e-07,3.873610e-07,0.000055,0.000043,0.005473,0.000040,0.000147,3.596474e-05,1.274935e-05,...,0.000076,0.000116,0.000472,0.840400,0.000680,1.635786e-03,0.013803,0.015255,0.035773,0.000242
7,16,5.538409e-08,1.524395e-06,0.000136,0.000180,0.003553,0.000079,0.000080,1.083605e-04,1.115301e-05,...,0.000312,0.000928,0.000704,0.088169,0.001534,5.911106e-04,0.001641,0.002563,0.000435,0.000347
8,18,1.002586e-06,2.048449e-06,0.000376,0.000056,0.028053,0.000568,0.000169,2.594925e-05,1.883908e-05,...,0.001240,0.020449,0.005414,0.731137,0.001184,1.863703e-04,0.000551,0.002863,0.001502,0.000468
9,21,1.959575e-05,3.628069e-04,0.243078,0.000541,0.000726,0.000766,0.000320,3.426031e-05,3.097169e-05,...,0.255292,0.000405,0.000790,0.313147,0.000554,8.363704e-04,0.002089,0.018279,0.005043,0.004723


In [33]:
submission.to_csv('submissions/9_xgb_3675_eta0_02_new_split_dept_4_0.5_0.5.csv',index=False)

# Playing with probabilities

In [None]:
new_sub = pd.read_csv('submissions/7_xgb_fixed_more_upc_hash_count_vars_neg_flag_tfidf_originial_700_dept_4_0.5_0.5.csv')

In [None]:
new_sub.keys()[4]

In [None]:
np.log(new_sub['TripType_6']).hist(bins=50)

In [None]:
new_sub[np.log(new_sub['TripType_25']) > -5]

In [None]:
new_sub.loc[np.log(new_sub['TripType_6']) < -5,'TripType_6' ] = 0
#new_sub.loc[np.log(new_sub['TripType_23']) < -5,'TripType_23' ] = 0
#new_sub.loc[np.log(new_sub['TripType_25']) < -4,'TripType_25' ] = 0
#new_sub.loc[np.log(new_sub['TripType_26']) < -5,'TripType_26' ] = 0

In [None]:
new_sub.to_csv('submissions/8_nullify_xgb_fixed_more_upc_hash_count_vars_neg_flag_tfidf_originial_700_dept_4_0.5_0.5.csv',index=False)

In [None]:
new_sub

# Small averaging

In [37]:
sub1 = pd.read_csv('submissions/11_xgb_4000_eta0_02_rare_new_split_dept_4_0.5_0.5.csv')
sub2 = pd.read_csv('submissions/9_xgb_8000_eta0_01_new_split_dept_4_0.5_0.5.csv')
sub3 = pd.read_csv('submissions/9_xgb_3675_eta0_02_new_split_dept_4_0.5_0.5.csv')
sub4 = pd.read_csv('submissions/8_xgb_7000_eta0_01_new_split_dept_4_0.5_0.5.csv')

In [38]:
new_sub = sub1+sub2+sub3+sub4

In [40]:
new_sub

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,4,4.229215e-07,1.983208e-07,0.000029,0.000009,0.000332,0.018449,0.001335,9.780690e-06,0.000003,...,0.000088,0.000263,0.321613,0.005431,0.000019,0.002203,0.006222,0.000533,0.000061,0.000536
1,8,5.821540e-05,1.816263e-04,0.004727,0.002563,0.211870,0.191585,0.192336,1.369435e-02,0.000214,...,0.019906,0.008629,0.112093,1.812599,0.005078,0.006459,0.071254,0.065140,0.003077,0.204575
2,12,2.665553e-07,1.112653e-07,0.000078,0.000007,0.000219,0.001819,0.005990,9.232272e-07,0.000002,...,0.000040,0.000023,0.000033,0.000041,0.000009,0.000009,0.000072,0.000008,0.000006,3.991248
3,16,2.481412e-06,3.848062e-06,0.000798,0.000314,0.008007,0.073320,3.834106,7.402999e-06,0.000019,...,0.000603,0.000270,0.000499,0.000631,0.000061,0.000133,0.001726,0.000601,0.000134,0.046959
4,24,2.665553e-07,1.112653e-07,0.000078,0.000007,0.000219,0.001819,0.005990,9.232272e-07,0.000002,...,0.000040,0.000023,0.000033,0.000041,0.000009,0.000009,0.000072,0.000008,0.000006,3.991248
5,52,7.125340e-06,9.865157e-06,0.002226,0.001737,2.279946,1.361881,0.229888,1.675468e-04,0.000026,...,0.001676,0.001556,0.005924,0.003797,0.000346,0.000207,0.000559,0.000664,0.000100,0.060935
6,56,2.068424e-06,2.151327e-06,0.000206,0.000149,0.017194,0.000127,0.000440,1.063780e-04,0.000042,...,0.000445,0.000555,0.001671,3.354596,0.003039,0.005057,0.070206,0.063165,0.098999,0.000857
7,64,1.707397e-07,5.760051e-06,0.000558,0.000720,0.018661,0.000256,0.000255,2.006495e-04,0.000042,...,0.001823,0.005066,0.002797,0.373692,0.006219,0.002085,0.006188,0.009024,0.002021,0.001388
8,72,4.727238e-06,7.485650e-06,0.001294,0.000278,0.087740,0.002628,0.000610,1.457953e-04,0.000059,...,0.004662,0.069332,0.020725,3.009036,0.004614,0.001076,0.002187,0.010333,0.005076,0.001909
9,84,9.589896e-05,1.051674e-03,1.364658,0.002978,0.003171,0.002156,0.001079,1.441211e-04,0.000245,...,0.909206,0.001166,0.002503,1.137431,0.002701,0.003385,0.007241,0.073288,0.025356,0.011743


In [7]:
sub2

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,9.911663e-08,7.065865e-09,0.000005,9.239739e-07,0.000049,0.004298,0.000172,1.685083e-06,1.884764e-07,...,0.000008,0.000032,0.072276,0.000736,0.000002,4.041894e-04,0.000508,0.000065,0.000010,0.000054
1,2,1.385892e-05,1.985672e-05,0.001047,9.281239e-04,0.046395,0.040435,0.031008,4.408424e-03,1.290698e-05,...,0.005871,0.001849,0.029276,0.470795,0.000971,1.644556e-03,0.014619,0.013286,0.000346,0.045108
2,3,6.825236e-09,1.240950e-08,0.000016,2.251088e-06,0.000052,0.000120,0.000201,1.904640e-07,2.479078e-07,...,0.000008,0.000005,0.000006,0.000007,0.000002,1.740525e-06,0.000017,0.000002,0.000001,0.999476
3,4,1.911384e-07,6.032569e-07,0.000182,7.303879e-05,0.001941,0.018577,0.956238,2.368706e-06,2.074254e-06,...,0.000136,0.000055,0.000112,0.000136,0.000011,3.163833e-05,0.000412,0.000140,0.000022,0.012408
4,6,6.825236e-09,1.240950e-08,0.000016,2.251088e-06,0.000052,0.000120,0.000201,1.904640e-07,2.479078e-07,...,0.000008,0.000005,0.000006,0.000007,0.000002,1.740525e-06,0.000017,0.000002,0.000001,0.999476
5,13,4.723933e-07,8.708735e-07,0.000425,4.651172e-04,0.686858,0.232634,0.052898,2.676685e-05,1.791388e-06,...,0.000256,0.000313,0.001314,0.000621,0.000061,2.501749e-05,0.000100,0.000159,0.000015,0.009352
6,14,2.608193e-07,3.558978e-07,0.000043,1.896002e-05,0.002772,0.000033,0.000113,3.110301e-05,1.908771e-06,...,0.000078,0.000103,0.000289,0.857500,0.000457,5.255149e-04,0.013576,0.007496,0.017166,0.000170
7,16,1.290438e-08,7.250496e-07,0.000122,8.171216e-05,0.003700,0.000052,0.000046,3.917797e-05,2.545762e-06,...,0.000419,0.001025,0.000522,0.073483,0.000831,3.552476e-04,0.000759,0.001161,0.000390,0.000227
8,18,1.119848e-06,8.665147e-07,0.000281,4.363305e-05,0.012254,0.000617,0.000097,5.091607e-05,3.458619e-06,...,0.000758,0.013770,0.003854,0.711344,0.001025,1.475279e-04,0.000309,0.001332,0.000899,0.000330
9,21,3.221198e-05,3.190127e-04,0.382485,7.165641e-04,0.000689,0.000535,0.000339,3.084995e-05,2.501233e-05,...,0.141643,0.000188,0.000582,0.308466,0.000839,5.310532e-04,0.001625,0.016975,0.007027,0.002762


In [41]:
new_sub = new_sub/4.0

In [33]:
new_sub

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,1.539094e-07,9.066952e-08,0.000009,0.000003,0.000099,0.004473,0.000323,2.256853e-06,8.278864e-07,...,0.000029,0.000068,0.080887,0.001503,0.000006,0.000691,0.001748,0.000146,0.000018,0.000164
1,2,1.743330e-05,6.041937e-05,0.001306,0.000726,0.054994,0.054218,0.047950,2.867764e-03,6.205819e-05,...,0.005265,0.002308,0.028657,0.439310,0.001294,0.001843,0.018901,0.017182,0.000859,0.051536
2,3,1.141983e-07,3.791434e-08,0.000021,0.000002,0.000056,0.000672,0.002349,2.131080e-07,5.312312e-07,...,0.000011,0.000006,0.000009,0.000012,0.000002,0.000003,0.000020,0.000002,0.000002,0.996726
3,4,1.164288e-06,1.402542e-06,0.000235,0.000084,0.002310,0.020074,0.954306,1.851898e-06,5.990906e-06,...,0.000184,0.000083,0.000158,0.000190,0.000020,0.000045,0.000526,0.000167,0.000042,0.012578
4,6,1.141983e-07,3.791434e-08,0.000021,0.000002,0.000056,0.000672,0.002349,2.131080e-07,5.312312e-07,...,0.000011,0.000006,0.000009,0.000012,0.000002,0.000003,0.000020,0.000002,0.000002,0.996726
5,13,2.492355e-06,2.716161e-06,0.000553,0.000412,0.567083,0.340141,0.060931,3.606760e-05,7.201164e-06,...,0.000457,0.000413,0.001594,0.001038,0.000097,0.000068,0.000165,0.000170,0.000029,0.014702
6,14,6.546552e-07,7.354293e-07,0.000063,0.000042,0.004827,0.000032,0.000109,2.454740e-05,1.236457e-05,...,0.000134,0.000174,0.000473,0.828586,0.000908,0.001521,0.018437,0.017923,0.029037,0.000257
7,16,6.506768e-08,1.703223e-06,0.000146,0.000185,0.004816,0.000074,0.000069,4.254808e-05,1.183474e-05,...,0.000479,0.001541,0.000761,0.093478,0.001586,0.000631,0.001734,0.002469,0.000560,0.000388
8,18,1.433217e-06,2.421320e-06,0.000333,0.000069,0.023832,0.000717,0.000141,3.245825e-05,1.673893e-05,...,0.001222,0.019061,0.005531,0.737894,0.001183,0.000340,0.000660,0.002897,0.001352,0.000549
9,21,2.629673e-05,2.794401e-04,0.331270,0.000778,0.000931,0.000585,0.000254,3.185109e-05,6.446341e-05,...,0.238111,0.000324,0.000704,0.280629,0.000741,0.001206,0.002033,0.019320,0.006822,0.003225


In [44]:
new_sub.to_csv('submissions/12_avg_4_with_weights_best.csv',index=False)

In [42]:
new_sub['VisitNumber'] = sub1['VisitNumber']

In [43]:
new_sub['VisitNumber'].values

array([     1,      2,      3, ..., 191340, 191341, 191348])