In [1]:
import time
import numpy as np
import pandas as pd

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from xgboost.sklearn import XGBClassifier

from tianchi.user_location_prediction.pysrc.construct_train_test_data import DataGenerator



In [2]:
def cv_test(x, y, cv, model, n_jobs=-1):
    """Cross-validate ``model`` on ``(x, y)`` and print the average wall time per fold.

    Parameters
    ----------
    x, y
        Features and labels, forwarded unchanged to ``cross_val_score``.
    cv
        Forwarded unchanged to ``cross_val_score``. It no longer has to be an
        int: the fold count used for the timing report is taken from the
        number of scores returned, so ``None``, a splitter object or an
        iterable of splits work as well.
    model
        Estimator forwarded unchanged to ``cross_val_score``.
    n_jobs : int, default -1
        Forwarded to ``cross_val_score``. The default keeps the previous
        behaviour; pass a smaller value to limit the number of parallel
        workers (NOTE(review): possibly relevant to the MemoryError seen in
        this notebook -- confirm where that error is actually raised).

    Returns
    -------
    numpy.ndarray
        The accuracy scores returned by ``cross_val_score``.
    """
    started = time.time()
    scores = np.asarray(
        cross_val_score(model, x, y, cv=cv, scoring='accuracy', n_jobs=n_jobs)
    )
    elapsed_min = (time.time() - started) / 60
    # Divide by the number of scores actually produced rather than by `cv`,
    # which may legitimately be a non-numeric object; guard against an empty
    # result so the report itself can never raise ZeroDivisionError.
    n_folds = max(scores.size, 1)
    print('per cv cost:%.2f min' % (elapsed_min / n_folds))
    return scores

In [3]:
# Instantiate the project's DataGenerator; later cells read its
# .sample, .sample_wifi and .shop attributes.
data_generator = DataGenerator()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
# Preview the first rows of the sample table.
data_generator.sample.head()

Unnamed: 0,sample_id,row_id,mall_id,user_id,lgt,ltt,time,wifi_infos,shop_id
0,0,,m_1409,u_376,122.308291,32.08804,2017-08-06 21:20:00,b_6396480|-67|false;b_41124514|-86|false;b_287...,s_2871718
1,1,,m_1409,u_376,122.308162,32.08797,2017-08-06 21:20:00,b_6396480|-67|false;b_56328155|-73|false;b_411...,s_2871718
2,2,,m_4079,u_1041,117.365255,40.638214,2017-08-02 13:10:00,b_8006367|-78|false;b_2485110|-52|false;b_3005...,s_181637
3,3,,m_6587,u_1158,121.134451,31.197416,2017-08-13 12:30:00,b_26250579|-73|false;b_26250580|-64|false;b_26...,s_609470
4,4,,m_3005,u_1654,122.255867,31.35132,2017-08-25 19:50:00,b_39004150|-66|false;b_39004148|-58|false;b_21...,s_3816766


In [16]:
# Keep only the wifi rows whose signal_flag equals 1, and only the two columns
# used to build the sample_id -> wifi_id lookup.
# A single .loc call filters rows and selects columns in one step, so the
# intermediate all-column copy produced by chained indexing is avoided.
sample_wifi_flag = data_generator.sample_wifi.loc[
    data_generator.sample_wifi.signal_flag == 1, ['sample_id', 'wifi_id']
]

In [18]:
# Lookup table: sample_id -> wifi_id taken from the flagged wifi rows.
# If a sample_id occurs more than once, the last row wins.
d = dict(
    zip(sample_wifi_flag['sample_id'], sample_wifi_flag['wifi_id'])
)

In [19]:
# Spot-check the lookup: wifi id stored for sample_id 76.
d[76]

'b_26244816'

In [20]:
# Attach the flagged wifi id to every sample; samples without an entry in the
# lookup get the placeholder string 'None' (a str, not the None object).
data_generator.sample['wifi_connect'] = data_generator.sample['sample_id'].apply(
    lambda sid: d.get(sid, 'None')
)

In [22]:
# Persist the sample table to ./sample.csv without writing the row index.
# `index` is documented as a bool, so pass False explicitly instead of relying
# on None being treated as falsy.
data_generator.sample.to_csv('./sample.csv', index=False)

In [25]:
# Number of distinct wifi_connect values per mall.
# NOTE(review): wifi_connect is filled with the placeholder string 'None' for
# samples without a flagged wifi, and that string counts as one distinct value
# here -- confirm whether it should be excluded.
data_generator.sample.groupby('mall_id')['wifi_connect'].nunique()

mall_id
m_1021     448
m_1085     216
m_1089     405
m_1175     693
m_1263     426
m_1293    1018
m_1375     450
m_1377     697
m_1409     735
m_1621     666
m_1790     427
m_1831     444
m_1920     463
m_1950     607
m_2009     471
m_2058     185
m_2123     190
m_2182     497
m_2224     938
m_2267     485
m_2270     336
m_2333     449
m_2415     711
m_2467     716
m_2578     485
m_2715     330
m_2878     497
m_2907     479
m_3005    1087
m_3019     393
          ... 
m_5767     450
m_5810     308
m_5825     523
m_5892     585
m_615      380
m_6167     257
m_622      485
m_623      330
m_625      616
m_626      507
m_6337     722
m_6587     408
m_6803     260
m_690     1358
m_7168     682
m_7374     510
m_7523     530
m_7601     303
m_7800     403
m_7973     415
m_7994     316
m_8093     452
m_822      721
m_8344     710
m_8379     364
m_9054     368
m_9068     381
m_909     1127
m_968      549
m_979      431
Name: wifi_connect, Length: 97, dtype: int64

In [4]:
# Complete list of feature-group names.
feat_all = [
    'weekday', 'hour', 'holiday',
    'space_loc', 'space_dist',
    'wifi_power', 'wifi_power_dist',
    'wifi_flag', 'wifi_flag_dist',
    'user_cate', 'user_price',
]

# Feature groups evaluated under the "submission 1" heading.
feat_submission_1 = [
    'weekday', 'hour',
    'space_loc',
    'wifi_power',
    'wifi_flag',
]

# Feature groups evaluated under the "submission 2" heading.
feat_submission_2 = [
    'weekday', 'hour',
    'space_loc', 'space_dist',
    'wifi_power', 'wifi_power_dist',
    'wifi_flag', 'wifi_flag_dist',
]

# Feature groups evaluated under the "submission 3" heading.
feat_submission_3 = [
    'weekday', 'hour', 'holiday',
    'space_loc',
    'wifi_power',
    'wifi_flag',
    'user_cate', 'user_price',
]

In [5]:
# List the distinct mall ids present in the shop table.
data_generator.shop.mall_id.unique()

array(['m_690', 'm_6587', 'm_5892', 'm_625', 'm_3839', 'm_3739', 'm_1293',
       'm_1175', 'm_2182', 'm_2058', 'm_3871', 'm_3005', 'm_822', 'm_2467',
       'm_4406', 'm_909', 'm_4923', 'm_2224', 'm_2333', 'm_4079', 'm_5085',
       'm_2415', 'm_4543', 'm_7168', 'm_2123', 'm_4572', 'm_1790',
       'm_3313', 'm_4459', 'm_1409', 'm_979', 'm_7973', 'm_1375', 'm_4011',
       'm_1831', 'm_4495', 'm_1085', 'm_3445', 'm_626', 'm_8093', 'm_4828',
       'm_6167', 'm_3112', 'm_4341', 'm_622', 'm_4422', 'm_2267', 'm_615',
       'm_4121', 'm_9054', 'm_4515', 'm_1950', 'm_3425', 'm_3501',
       'm_4548', 'm_5352', 'm_3832', 'm_1377', 'm_1621', 'm_1263',
       'm_2578', 'm_2270', 'm_968', 'm_1089', 'm_7374', 'm_2009', 'm_6337',
       'm_7601', 'm_623', 'm_5154', 'm_5529', 'm_4168', 'm_3916', 'm_2878',
       'm_9068', 'm_3528', 'm_4033', 'm_3019', 'm_1920', 'm_8344',
       'm_6803', 'm_3054', 'm_8379', 'm_1021', 'm_2907', 'm_4094',
       'm_4187', 'm_5076', 'm_3517', 'm_2715', 'm_5810', 'm

In [6]:
# Mall passed to the gen_sub_train_test_data calls in the submission cells.
mall_id = 'm_690'

In [11]:
# Inspect which values signal_flag takes in the per-sample wifi table.
# NOTE(review): the recorded output shows 0.1 and 1.0 -- confirm that 0.1
# (rather than 0) is the intended encoding for the non-flagged case.
data_generator.sample_wifi.signal_flag.unique()

array([ 0.1,  1. ])

In [12]:
# Debug print of the wifi table's shape.
# NOTE(review): 'asdasd:' is a placeholder label left over from debugging;
# consider a descriptive label or displaying the bare `.shape` expression.
print('asdasd:',data_generator.sample_wifi.shape)

asdasd: (15793864, 4)


### submission 1 LB score 0.84

In [13]:
# Feature set 1: unpack the train features/labels and the test features/row ids
# generated for the selected mall.
train_x, train_y, test_x, test_row_id = data_generator.gen_sub_train_test_data(
    feat_submission_1, mall_id
)
# 5-fold cross-validated accuracy of a default LogisticRegression.
feat1_lr_score = cv_test(train_x, train_y, cv=5, model=LogisticRegression())
print(feat1_lr_score)
print(feat1_lr_score.mean())

MemoryError: 

### submission 2 LB score 0.85

In [None]:
# Feature set 2: unpack the train features/labels and the test features/row ids
# generated for the selected mall.
train_x, train_y, test_x, test_row_id = data_generator.gen_sub_train_test_data(
    feat_submission_2, mall_id
)
# 5-fold cross-validated accuracy of a default LogisticRegression.
# NOTE(review): the scores are stored under `feat1_lr_score` although this cell
# evaluates feature set 2; the name is kept so other cells that may read it
# keep working.
feat1_lr_score = cv_test(train_x, train_y, cv=5, model=LogisticRegression())
print(feat1_lr_score)
print(feat1_lr_score.mean())

### submission 3 LB score 0.81

In [14]:
# Feature set 3: unpack the train features/labels and the test features/row ids
# generated for the selected mall.
train_x, train_y, test_x, test_row_id = data_generator.gen_sub_train_test_data(
    feat_submission_3, mall_id
)
# 5-fold cross-validated accuracy of a default LogisticRegression.
# NOTE(review): the scores are stored under `feat1_lr_score` although this cell
# evaluates feature set 3; the name is kept so other cells that may read it
# keep working.
feat1_lr_score = cv_test(train_x, train_y, cv=5, model=LogisticRegression())
print(feat1_lr_score)
print(feat1_lr_score.mean())

MemoryError: 