In [1]:
import gc
import os
import random
import sys

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
# Data directory. Set the USER_LOCATION_DATA_DIR environment variable to run the
# notebook on another machine; the default is the path the notebook was written with.
path = os.environ.get(
    'USER_LOCATION_DATA_DIR',
    '/home/zhouzr/data/tianchi/user_location_predict/original_data/',
)
os.chdir(path)  # the relative CSV paths below are resolved against this directory

train = pd.read_csv('./训练数据-ccf_first_round_user_shop_behavior.csv')
shop = pd.read_csv('./训练数据-ccf_first_round_shop_info.csv')
test = pd.read_csv('./AB榜测试集-evaluation_public.csv')

# Give every training row the mall_id of its shop, then stack train and test
# into one frame so both go through the same feature code.
train = pd.merge(train, shop[['shop_id', 'mall_id']], on='shop_id', how='left')
# concat keeps the index labels of both inputs; sample_id (added below) is the
# unique row key to use instead of the index.
sample = pd.concat([train, test])
sample['time_stamp'] = pd.to_datetime(sample.time_stamp)
sample.insert(0, 'sample_id', range(sample.shape[0]))
mall_id_list = shop.mall_id.unique()

# Release the per-source frames; only `sample` and `shop` are used from here on.
del train, test
gc.collect()



42

In [2]:
def outlier_filter(df, col, min_filter=0.005, max_filter=0.995):
    """Keep only the rows whose ``col`` value lies strictly between two percentiles.

    ``min_filter`` and ``max_filter`` are fractions in [0, 1]; they are turned
    into percentiles of ``df[col]``.  Rows whose value equals either percentile
    value are dropped as well.  The returned frame has a fresh 0..n-1 index.
    """
    values = df[col]
    lower_bound = np.percentile(values, min_filter * 100)
    upper_bound = np.percentile(values, max_filter * 100)
    inside = (values > lower_bound) & (values < upper_bound)
    return df[inside].reset_index(drop=True)

In [55]:
def wifi_filter_dict(df, filter_rate=None, filter_num=None):
    """Pick the wifi identifiers that occur often enough to be used as features.

    Every entry of ``df['wifi_infos']`` is a ``;``-separated list of records;
    the first ``|``-separated field of each record is taken as the wifi
    identifier.  Occurrences are counted over all rows.

    Parameters
    ----------
    df : DataFrame
        Must have a string column ``wifi_infos``.
    filter_rate : float, optional
        Fraction in [0, 1].  The threshold is the ``filter_rate * 100``-th
        percentile of the occurrence counts.  Takes precedence over
        ``filter_num`` when given.
    filter_num : number, optional
        Absolute occurrence threshold.

    Returns
    -------
    dict
        ``{identifier: 0}`` for every identifier whose count is strictly
        greater than the threshold.  When neither threshold is given, every
        observed identifier is kept.  An input without any wifi record yields
        an empty dict.
    """
    wifi_count = {}
    for wifi_infos_i in df['wifi_infos']:
        for record in wifi_infos_i.split(';'):
            wifi_id = record.split('|')[0]
            wifi_count[wifi_id] = wifi_count.get(wifi_id, 0) + 1

    # Nothing observed: a percentile of an empty list is undefined.
    if not wifi_count:
        return {}

    # Compare against None explicitly so a legitimate 0 is not mistaken for
    # "not provided".
    if filter_rate is not None:
        threshold = np.percentile(list(wifi_count.values()), filter_rate * 100)
    elif filter_num is not None:
        threshold = filter_num
    else:
        threshold = 0  # every count is >= 1, so nothing is filtered out

    return {wifi_id: 0 for wifi_id, count in wifi_count.items() if count > threshold}

def wifi_power_ext(df, wifi_dict):
    """Expand ``df.wifi_infos`` into one numeric column per selected wifi identifier.

    Parameters
    ----------
    df : DataFrame
        Must have a string column ``wifi_infos`` of ``;``-separated records
        whose first two ``|``-separated fields are the identifier and a numeric
        value (presumably the signal strength -- confirm against the data
        description).
    wifi_dict : dict
        Identifiers to extract, mapped to the fill value used when a row does
        not contain that identifier.  It is not modified.

    Returns
    -------
    DataFrame
        One row per input row and one column per key of ``wifi_dict``, in the
        dict's key order.  A cell holds ``value + 120`` when the identifier
        occurs in the row (the last occurrence wins), otherwise the fill value.
        The column set is the same even when ``df`` has no rows.
    """
    columns = list(wifi_dict)
    rows = []
    for wifi_infos_i in df.wifi_infos:
        row = wifi_dict.copy()
        for record in wifi_infos_i.split(';'):
            fields = record.split('|')
            if fields[0] in wifi_dict:
                # Shift the raw value by +120 before storing it.
                row[fields[0]] = float(fields[1]) + 120
        rows.append(row)
    # Pin the columns explicitly: with no rows, the constructor would otherwise
    # return a frame without columns and the feature matrix would lose its
    # wifi block.
    return pd.DataFrame(rows, columns=columns)

def gen_train_test(df, mall_id, test=True, test_rate=0.2, seed=None):
    """Build feature matrices and encoded labels for a single mall.

    Parameters
    ----------
    df : DataFrame
        Combined sample frame with at least the columns ``sample_id``,
        ``mall_id``, ``shop_id``, ``row_id``, ``user_id``, ``time_stamp``
        (datetime), ``latitude``, ``longitude`` and ``wifi_infos``.
    mall_id : str
        Only rows of this mall are used.
    test : bool, default True
        True: hold out a random ``test_rate`` share of the labelled rows as the
        test split.  False: train on all labelled rows and use the rows without
        a ``shop_id`` as the test split.
    test_rate : float, default 0.2
        Share of labelled rows held out when ``test`` is True.
    seed : int, optional
        Seed for the hold-out draw.  When omitted, the global ``random`` module
        state is used, as before.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y, test_row_id)``.  The feature
        columns are: min-max scaled latitude and longitude, weekday, hour,
        label-encoded user id, then one column per selected wifi identifier.
        Labels come from one encoder fitted on the train and test shop ids
        together (missing test shop ids become ``'Null'``), so the training
        labels are not necessarily contiguous.
    """
    df_mall = df[df.mall_id == mall_id].reset_index(drop=True)
    labelled = df_mall[df_mall.shop_id.notnull()].reset_index(drop=True)
    unlabelled = df_mall[df_mall.shop_id.isnull()].reset_index(drop=True)
    drop_cols = ['sample_id', 'mall_id']

    if test:
        sampler = random if seed is None else random.Random(seed)
        holdout_ids = sampler.sample(labelled.sample_id.tolist(),
                                     int(test_rate * labelled.shape[0]))
        in_holdout = labelled.sample_id.isin(holdout_ids)
        # `~` is the boolean inversion operator; numpy rejects unary `-` on
        # boolean arrays.
        train_df = labelled[~in_holdout].reset_index(drop=True).drop(drop_cols, axis=1)
        test_df = labelled[in_holdout].reset_index(drop=True).drop(drop_cols, axis=1)
    else:
        train_df = labelled.drop(drop_cols, axis=1)
        test_df = unlabelled.drop(drop_cols, axis=1)

    # latitude/longitude outlier processing (training rows only)
    train_df = outlier_filter(train_df, 'latitude')
    train_df = outlier_filter(train_df, 'longitude')
    scaler = MinMaxScaler()
    space_train = scaler.fit_transform(train_df[['latitude', 'longitude']])
    space_test = scaler.transform(test_df[['latitude', 'longitude']])

    # time -- reshape the underlying arrays on both sides; calling reshape on
    # the Series itself fails on current pandas.
    weekday_train = train_df.time_stamp.dt.weekday.values.reshape(-1, 1)
    weekday_test = test_df.time_stamp.dt.weekday.values.reshape(-1, 1)
    hour_train = train_df.time_stamp.dt.hour.values.reshape(-1, 1)
    hour_test = test_df.time_stamp.dt.hour.values.reshape(-1, 1)

    # user_id -- fitted on both splits so every test user has a code
    le = LabelEncoder()
    le.fit(np.concatenate([train_df.user_id, test_df.user_id]))
    userid_train = le.transform(train_df.user_id).reshape(-1, 1)
    userid_test = le.transform(test_df.user_id).reshape(-1, 1)

    # wifi -- identifiers are selected on the training rows only
    wifi_dict = wifi_filter_dict(train_df, filter_rate=0.95)
    wifi_train = wifi_power_ext(train_df, wifi_dict)
    wifi_test = wifi_power_ext(test_df, wifi_dict)

    train_x = np.concatenate(
        [space_train, weekday_train, hour_train, userid_train, wifi_train], axis=1)
    test_x = np.concatenate(
        [space_test, weekday_test, hour_test, userid_test, wifi_test], axis=1)

    # label encode -- one encoder for both splits
    class_le = LabelEncoder()
    test_shop = test_df.shop_id.fillna('Null')
    class_le.fit(np.concatenate([train_df.shop_id.values, test_shop.values]))
    train_y = class_le.transform(train_df.shop_id)
    test_y = class_le.transform(test_shop)

    test_row_id = test_df.row_id.values
    return train_x, train_y, test_x, test_y, test_row_id

In [56]:
# gen_train_test draws the hold-out rows with the `random` module; seed it so
# the split, and every score computed from it, is the same on each run.
random.seed(0)
train_x, train_y, test_x, test_y, test_row_id = gen_train_test(sample, mall_id_list[5])



In [57]:
# Encoded shop labels that occur in the training split. The encoder is fitted on
# train and test together, so a gap means that label has no training rows.
np.unique(train_y)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83])

In [58]:
# Encoded shop labels that occur in the test split (same encoder as train_y).
np.unique(test_y)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52,
       53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
       70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83])

In [59]:
dtrain = xgb.DMatrix(train_x, train_y)
dtest = xgb.DMatrix(test_x)
# train_y and test_y come from one encoder fitted on both splits, so the class
# count is the largest encoded label + 1. A literal count is only right for one mall.
num_class = int(max(train_y.max(), test_y.max())) + 1
param = {'objective': 'multi:softmax', 'num_class': num_class}
model = xgb.train(param, dtrain)

In [60]:
# Accuracy of the model's predictions for the test DMatrix against test_y.
accuracy_score(test_y, model.predict(dtest))

0.8854886475814413

In [61]:
# Rebind `model` to an XGBClassifier created with its default settings.
model = xgb.XGBClassifier()

In [62]:
# Fit the classifier on the training split.
# NOTE(review): no eval_set is passed, so eval_metric='auc' and verbose=1
# presumably have nothing to evaluate or print -- confirm against the installed
# xgboost version.
model.fit(train_x, train_y, eval_metric='auc',verbose=1)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [21]:
# Baseline: logistic regression with default settings on the same training split.
lr = LogisticRegression()
lr.fit(train_x, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
# Baseline: random forest on the same training split, with n_jobs=-1.
# NOTE(review): no random_state is set, so its scores can differ between runs.
rf = RandomForestClassifier(n_jobs=-1)
rf.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [63]:
# Score of `model` on the test split via the estimator's own score method.
model.score(test_x, test_y)

0.90621915103652517

In [23]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [64]:
# Accuracy of `model`'s predictions on the test split.
accuracy_score(test_y, model.predict(test_x))

0.90621915103652517

In [27]:
# Random-forest accuracy on the rows it was fitted on.
accuracy_score(train_y, rf.predict(train_x))

0.99806446183623665

In [25]:
# Random-forest accuracy on the test split.
accuracy_score(test_y, rf.predict(test_x))

0.88910825929582105

In [None]:
# Booster parameters for a deeper multi-class model.
# num_class must be larger than every encoded label. The labels come from an
# encoder fitted on train and test together, so the training labels can have
# gaps and counting the distinct training labels can come out too small. Use the
# largest encoded label + 1 instead.
params = {
    'booster': 'gbtree',
    'eta': 0.1,
    'objective': 'multi:softmax',
    'max_depth': 10,
    'num_class': int(max(train_y.max(), test_y.max())) + 1,
    # Further tuning knobs, currently disabled:
    # 'subsample': 1.0,
    # 'min_child_weight': 5,
    # 'colsample_bytree': 0.2,
    # 'scale_pos_weight': 0.1,
    # 'eval_metric': 'auc',
    # 'gamma': 0.2,
    # 'lambda': 300,
}
model = xgb.train(params, dtrain)