In [11]:
import warnings
import os
import sys

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.linear_model import LogisticRegression


from tianchi.user_location_prediction.pycode.load_data import *

In [2]:
def load_preprosessing_data():
    shop = pd.read_csv('./pre_processing_data/shop.csv')
    user = pd.read_csv('./pre_processing_data/user.csv')
    wifi = pd.read_csv('./pre_processing_data/wifi.csv')

    shop = shop[shop.mall_id=='m_1409'].copy().reset_index(drop=True)
    user = user[user.mall_id=='m_1409'].copy().reset_index(drop=True)
    wifi = wifi[wifi.mall_id=='m_1409'].copy().reset_index(drop=True)

    return shop, user, wifi

In [14]:
abtest = load_original_abtest()

In [16]:
user = load_original_user()

In [4]:
shop, user, wifi = load_preprosessing_data()

In [5]:
abtest.head(3)

Unnamed: 0,row_id,user_id,mall_id,time_stamp,lgt,ltt,wifi_infos
0,118742,u_30097142,m_3916,2017-09-05 13:00,122.141011,39.818847,b_34366982|-82|false;b_37756289|-53|false;b_41...
1,118743,u_30097803,m_5085,2017-09-06 13:10,118.191907,32.855858,b_36722251|-81|false;b_10537579|-75|false;b_43...
2,118744,u_30097889,m_4033,2017-09-06 17:40,119.19211,32.424667,b_30026291|-74|false;b_30026290|-74|false;b_36...


In [17]:
user.head(3)

Unnamed: 0,user_id,shop_id,time_stamp,longitude,latitude,wifi_infos
0,u_376,s_2871718,2017-08-06 21:20,122.308291,32.08804,b_6396480|-67|false;b_41124514|-86|false;b_287...
1,u_376,s_2871718,2017-08-06 21:20,122.308162,32.08797,b_6396480|-67|false;b_56328155|-73|false;b_411...
2,u_1041,s_181637,2017-08-02 13:10,117.365255,40.638214,b_8006367|-78|false;b_2485110|-52|false;b_3005...


### 空间距离最近

In [6]:
def space_distance_calc(lgt, ltt):
    idx = np.argmin((shop.lgt-lgt)**2 + (shop.ltt-ltt)**2)
    return shop.iloc[idx, 0]

In [7]:
pre = [space_distance_calc(i[0], i[1]) for i in zip(user.lgt, user.ltt)]

In [8]:
accuracy_score(user.shop_id, pre)

0.33680805176132278

## 空间+wifi建模

In [9]:
wifi.head()

Unnamed: 0,user_id,wifi_id,shop_id,mall_id,lgt,ltt,time,signal_power,signal_flag
0,u_376,b_6396480,s_2871718,m_1409,122.308291,32.08804,2017-08-06 21:20,-67.0,0
1,u_376,b_41124514,s_2871718,m_1409,122.308291,32.08804,2017-08-06 21:20,-86.0,0
2,u_376,b_28723327,s_2871718,m_1409,122.308291,32.08804,2017-08-06 21:20,-90.0,0
3,u_376,b_6396479,s_2871718,m_1409,122.308291,32.08804,2017-08-06 21:20,-55.0,0
4,u_376,b_8764723,s_2871718,m_1409,122.308291,32.08804,2017-08-06 21:20,-90.0,0


In [10]:
def construct_train(wifi_df):
    signal_min = wifi_df.signal_power.min()
    wifi_power = pd.pivot_table(wifi[['user_id','wifi_id','shop_id','signal_power','time','lgt','ltt']], 
                                index=['user_id','shop_id','time','lgt','ltt'], 
                                columns='wifi_id', values='signal_power').reset_index().fillna(signal_min - 1)
    wifi_flag = pd.pivot_table(wifi[['user_id','wifi_id','shop_id','signal_flag','time','lgt','ltt']], 
                                index=['user_id','shop_id','time','lgt','ltt'], 
                                columns='wifi_id', values='signal_flag').reset_index().fillna(0)
    wifi_t = pd.concat([wifi_power, wifi_flag.drop(['user_id','shop_id','time','lgt','ltt'], axis=1)], axis=1)
    return wifi_t

In [11]:
wifi_t = construct_train(wifi)

In [12]:
wifi_t.head()

wifi_id,user_id,shop_id,time,lgt,ltt,b_10004588,b_10063227,b_10066720,b_10071970,b_10083821,...,b_9726478,b_9739374,b_9761636,b_9809138,b_9837892,b_9893403,b_9926786,b_9935522,b_9964950,b_9983657
0,u_10001808,s_9554,2017-08-13 14:00,122.306404,32.086921,-108.0,-108.0,-108.0,-108.0,-108.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,u_10022915,s_1426013,2017-08-02 21:10,122.3083,32.088127,-108.0,-108.0,-108.0,-108.0,-108.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,u_10028547,s_1465966,2017-08-27 14:20,122.308005,32.087519,-108.0,-108.0,-108.0,-108.0,-108.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,u_10032014,s_43525,2017-08-26 21:00,122.307549,32.087632,-108.0,-108.0,-108.0,-108.0,-108.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,u_10034935,s_580441,2017-08-22 20:50,122.3082,32.088059,-108.0,-108.0,-108.0,-108.0,-108.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
x = wifi_t.iloc[:,3:].values
y = wifi_t.iloc[:,1].values

In [21]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegressionCV

In [18]:
sclaer = MinMaxScaler()
x = sclaer.fit_transform(x)

In [24]:
lrcv = LogisticRegression()

In [25]:
lrcv.fit(x[:1000], y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
lrcv.score(x,y)

0.968325519894218

In [31]:
x.shape

(16638, 11418)

In [35]:
from sklearn import cross_validation

In [36]:
model = LogisticRegression()

In [38]:
scores = cross_validation.cross_val_score(model, x, y, cv=5, scoring='accuracy', n_jobs=-1)

In [39]:
scores

array([ 0.93933988,  0.94022714,  0.92844257,  0.93297101,  0.92038894])