In [1]:
import os
import sys
import gc
import random
import datetime
import time

import random
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Seed both the stdlib `random` module and NumPy's global generator with the
# same value so anything drawn from those two generators is repeatable.
seed = 3
random.seed(seed)
np.random.seed(seed)



### Outlier filter

### Extract features

1. Wifi
  * Wifi Signal Power Matrix ### param: drop_num, drop_rate
  * Wifi Signal Connect Wifi-Id ### fill NA with 'Null'
2. Time
  * Weekday & Hourmap ### Param: one_hot
3. Location
  * longitude
  * latitude
4. Time & Category
  * Weekday-Category shopping_num rate
  * hourmap-Category shopping_num rate
5. Time & Price
  * Weekday-Hourmap shopping 0% price
  * Weekday-Hourmap shopping 25% price
  * Weekday-Hourmap shopping 50% price
  * Weekday-Hourmap shopping 75% price
  * Weekday-Hourmap shopping 100% price
  * Weekday-Hourmap shopping price std
6. User & Shop
  * User last shopping shop_id ### before sample time
  * User most likely shopping shop_id ### before sample time
7. User & Category
  * User last shopping category ### before sample time
  * User most shopping category ### before sample time
8. User & Price
  * User shopping mean price ### before sample time
  * User shopping price std ### before sample time

In [25]:
# NOTE(review): absolute, machine-specific path. The working directory is
# changed here, so every relative path below is resolved against it.
path = '/home/zhouzr/data/user-location-forcast/'
os.chdir(path)
# Local train / test CSV files (presumably produced by an earlier split step — TODO confirm).
train = pd.read_csv('./data/train_local.csv')
test = pd.read_csv('./data/test_local.csv')
# Shop info table read from the `original_data` directory.
shop = pd.read_csv('./original_data/训练数据-ccf_first_round_shop_info.csv')

### outlier filter

In [115]:
def location_outlier_filter(df, filter_rate=0.005):
    """Drop rows whose latitude or longitude lies in the extreme tails.

    For each of the two coordinate columns, rows below the ``filter_rate``
    quantile or above the ``1 - filter_rate`` quantile are treated as
    outliers. A row is removed if it is an outlier in either column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns.
    filter_rate : float, default 0.005
        Fraction trimmed from EACH tail of each coordinate (0.005 = 0.5%).

    Returns
    -------
    pandas.DataFrame
        A copy without the outlier rows, re-indexed from 0.
    """
    def outlier_filter(df, col, min_filter, max_filter):
        # Index labels of the rows outside the [min_filter, max_filter] quantile band.
        min_outlier = np.percentile(df[col], min_filter * 100)
        max_outlier = np.percentile(df[col], max_filter * 100)
        return df[(df[col] > max_outlier) | (df[col] < min_outlier)].index.tolist()

    if df.empty:
        # np.percentile cannot be evaluated on an empty column.
        return df.reset_index(drop=True)

    # The upper bound must be the complementary quantile. Passing `filter_rate`
    # for both bounds made the two cut-offs identical, which flagged nearly
    # every row as an outlier.
    idx1 = outlier_filter(df, 'latitude', filter_rate, 1 - filter_rate)
    idx2 = outlier_filter(df, 'longitude', filter_rate, 1 - filter_rate)
    drop_idx = list(set(idx1 + idx2))
    return df.drop(drop_idx, axis=0).reset_index(drop=True)

def user_behavior_outlier_filter(df, filter_num=40):
    """Drop every row belonging to a user with more than ``filter_num`` samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id`` and ``sample_id`` columns.
    filter_num : int, default 40
        Maximum number of (non-null) ``sample_id`` values a user may have
        before all of that user's rows are removed.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, re-indexed from 0.
    """
    user_behavior = df.groupby('user_id')['sample_id'].count()
    # Honour the `filter_num` argument (the threshold was hard-coded to 40).
    heavy_users = user_behavior[user_behavior > filter_num].index
    # `heavy_users` holds user_id values, not row labels, so filter with a
    # membership mask instead of passing them to `df.drop(axis=0)`.
    return df[~df['user_id'].isin(heavy_users)].reset_index(drop=True)

### wifi feat

In [None]:
class WifiFeat(object):
    """Build wifi feature matrices from the ``wifi_infos`` column.

    Each ``wifi_infos`` value is a ';'-separated list of records, and each
    record is a '|'-separated triple: wifi id, signal power, connection flag.
    A record counts as "connected" when its flag is anything other than the
    string 'false'.

    ``fit`` selects the wifi ids that occur often enough. The ``extract_*``
    methods then produce one column per selected id.
    """

    @staticmethod
    def _split_records(raw):
        """Split one ``wifi_infos`` string into a list of field lists."""
        return [record.split('|') for record in raw.split(';')]

    def fit(self, df, wifi_power_drop_n, wifi_connect_drop_n):
        """Select the wifi ids kept for the connect and power features."""
        self.get_wifi_connect_list(df, wifi_connect_drop_n)
        self.get_wifi_power_list(df, wifi_power_drop_n)

    def get_wifi_connect_list(self, df, drop_num):
        """Store ids that appear as connected more than ``drop_num`` times.

        The result is kept in ``self.wifi_connect_list`` as ``{wifi_id: 0}``,
        which doubles as the all-zero template row for ``extract_wifi_connect``.
        """
        self.wifi_connect_list = {}
        hits = {}
        for raw in df['wifi_infos']:
            for fields in self._split_records(raw):
                if fields[2] == 'false':
                    continue
                hits[fields[0]] = hits.get(fields[0], 0) + 1
        self.wifi_connect_list.update(
            {wifi_id: 0 for wifi_id, n in hits.items() if n > drop_num}
        )

    def get_wifi_power_list(self, df, drop_num):
        """Store ids that are observed more than ``drop_num`` times.

        The result is kept in ``self.wifi_power_list`` as ``{wifi_id: 0}``,
        which doubles as the all-zero template row for ``extract_wifi_power``.
        """
        self.wifi_power_list = {}
        hits = {}
        for raw in df['wifi_infos']:
            for fields in self._split_records(raw):
                hits[fields[0]] = hits.get(fields[0], 0) + 1
        self.wifi_power_list.update(
            {wifi_id: 0 for wifi_id, n in hits.items() if n > drop_num}
        )

    def extract_wifi_power(self, df):
        """Return a DataFrame with one signal-power column per selected id.

        A cell holds ``power + 120`` when the id is present in the row and 0
        otherwise. Requires ``get_wifi_power_list`` (or ``fit``) to have run.
        """
        rows = []
        for raw in df['wifi_infos']:
            row = dict(self.wifi_power_list)
            for fields in self._split_records(raw):
                wifi_id = fields[0]
                if wifi_id in self.wifi_power_list:
                    row[wifi_id] = float(fields[1]) + 120
            rows.append(row)
        return pd.DataFrame(rows)

    def extract_wifi_connect(self, df):
        """Return a DataFrame with one 0/1 connection column per selected id.

        A cell is 1 when the row holds a connected record for that id.
        Requires ``get_wifi_connect_list`` (or ``fit``) to have run.
        """
        rows = []
        for raw in df['wifi_infos']:
            row = dict(self.wifi_connect_list)
            for fields in self._split_records(raw):
                if fields[2] == 'false':
                    continue
                if fields[0] in self.wifi_connect_list:
                    row[fields[0]] = 1
            rows.append(row)
        return pd.DataFrame(rows)

In [117]:
# NOTE(review): `train_i` is not defined in any cell visible above this one — confirm where it comes from.
train_i.head()

Unnamed: 0,sample_id,category_id,latitude,longitude,mall_id,price,row_id,shop_id,time_stamp,user_id,wifi_infos
0,0,c_38,32.08804,122.308291,m_1409,42.0,,s_2871718,2017-08-06 21:20:00,u_376,b_6396480|-67|false;b_41124514|-86|false;b_287...
1,1,c_38,32.08797,122.308162,m_1409,42.0,,s_2871718,2017-08-06 21:20:00,u_376,b_6396480|-67|false;b_56328155|-73|false;b_411...
57,68,c_49,32.088638,122.309295,m_1409,60.0,,s_52283,2017-08-14 20:40:00,u_13782,b_11592648|-74|false;b_56326068|-61|false;b_29...
68,80,c_34,32.087681,122.310365,m_1409,38.0,,s_288430,2017-08-11 15:30:00,u_18171,b_41870824|-84|false;b_51825273|-76|false;b_41...
124,156,c_34,32.087676,122.310139,m_1409,38.0,,s_288430,2017-08-14 13:30:00,u_29343,b_55113661|-76|false;b_52642831|-73|false;b_38...


In [116]:
# Displays `train_i` with every row moved down one position (first row becomes missing).
# NOTE(review): `train_i` is not defined in any cell visible above this one — confirm where it comes from.
train_i.shift()

Unnamed: 0,sample_id,category_id,latitude,longitude,mall_id,price,row_id,shop_id,time_stamp,user_id,wifi_infos
0,,,,,,,,,NaT,,
1,0.0,c_38,32.088040,122.308291,m_1409,42.0,,s_2871718,2017-08-06 21:20:00,u_376,b_6396480|-67|false;b_41124514|-86|false;b_287...
57,1.0,c_38,32.087970,122.308162,m_1409,42.0,,s_2871718,2017-08-06 21:20:00,u_376,b_6396480|-67|false;b_56328155|-73|false;b_411...
68,68.0,c_49,32.088638,122.309295,m_1409,60.0,,s_52283,2017-08-14 20:40:00,u_13782,b_11592648|-74|false;b_56326068|-61|false;b_29...
124,80.0,c_34,32.087681,122.310365,m_1409,38.0,,s_288430,2017-08-11 15:30:00,u_18171,b_41870824|-84|false;b_51825273|-76|false;b_41...
311,156.0,c_34,32.087676,122.310139,m_1409,38.0,,s_288430,2017-08-14 13:30:00,u_29343,b_55113661|-76|false;b_52642831|-73|false;b_38...
345,399.0,c_34,32.087674,122.310001,m_1409,38.0,,s_288430,2017-08-19 16:10:00,u_67271,b_55113661|-58|false;b_38288190|-57|false;b_41...
411,442.0,c_51,32.087918,122.307925,m_1409,64.0,,s_1877554,2017-08-05 13:00:00,u_74124,b_5577439|-70|false;b_5577434|-81|false;b_3110...
412,524.0,c_30,32.087500,122.307093,m_1409,47.0,,s_298098,2017-08-17 13:30:00,u_90101,b_10687160|-85|false;b_26685632|-88|false;b_24...
413,525.0,c_30,32.087486,122.307349,m_1409,47.0,,s_298098,2017-08-07 13:50:00,u_90101,b_26685632|-89|false;b_24303621|-35|false;b_56...


In [None]:
def wifi_connect(df_train, df_test, drop_num):
    """Collect the wifi ids that appear as connected often enough in training data.

    Each ``wifi_infos`` value is parsed as ';'-separated records of
    '|'-separated fields (wifi id, signal power, connection flag). A record
    counts as connected when its flag is not the string 'false'.

    NOTE(review): a later cell defines another ``wifi_connect(df, drop_num)``
    that shadows this one when the notebook is run top to bottom.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain a ``wifi_infos`` column.
    df_test : pandas.DataFrame
        Currently unused. Kept so the signature stays stable.
    drop_num : int
        Ids connected ``drop_num`` times or fewer are discarded.

    Returns
    -------
    dict
        ``{wifi_id: 0}`` for every retained id.
    """
    wifi_list = {}
    counter = {}
    for wifi_infos_i in df_train['wifi_infos']:
        for wifi in (record.split('|') for record in wifi_infos_i.split(';')):
            if wifi[2] != 'false':
                # Membership was tested against an undefined `wifi_count`.
                # The counts live in `counter`.
                counter[wifi[0]] = counter.get(wifi[0], 0) + 1
    for k, v in counter.items():
        if v > drop_num:
            wifi_list[k] = 0
    return wifi_list
    

In [None]:
def wifi_connect(df, drop_num):
    """Unfinished stub: selects the rows with a missing ``row_id`` and returns None.

    NOTE(review): this redefinition shadows the earlier
    ``wifi_connect(df_train, df_test, drop_num)``; ``drop_num`` is not used yet.
    """
    # Rows without a row_id (presumably the training rows — TODO confirm).
    df_train = df[df.row_id.isnull()]
    
    

def wifi_power(df, drop_num):
    """Placeholder: not implemented yet (does nothing and returns None)."""
    pass

In [78]:
def hourmap(t):
    """Map a time of day to one of seven coarse slots (0-6).

    Slot 0 is the overnight span that wraps past midnight (22:00 up to, but
    not including, 08:00). Slots 1-6 are consecutive half-open daytime
    intervals ``[start, end)``.

    Parameters
    ----------
    t : datetime.time
        Time of day to classify.

    Returns
    -------
    int
        Slot number in ``0..6``.
    """
    clock = datetime.time
    # The overnight slot wraps around midnight, so it needs an OR of two tests.
    if t >= clock(22, 0) or t < clock(8, 0):
        return 0
    daytime_slots = (
        (1, clock(8, 0), clock(9, 30)),
        (2, clock(9, 30), clock(11, 30)),
        (3, clock(11, 30), clock(14, 0)),
        (4, clock(14, 0), clock(17, 0)),
        (5, clock(17, 0), clock(20, 0)),
        (6, clock(20, 0), clock(22, 0)),
    )
    for slot, start, end in daytime_slots:
        if t >= start and t < end:
            return slot





def location(df):
    """Placeholder: not implemented yet (does nothing and returns None).

    Presumably meant to produce the "Location" features from the notebook's
    feature outline — TODO confirm.
    """
    pass

def week_hour(df):
    """Placeholder: not implemented yet (does nothing and returns None).

    Presumably meant to produce the "Weekday & Hourmap" features from the
    notebook's feature outline — TODO confirm.
    """
    pass

def week_category(df, most_n):
    """Placeholder: not implemented yet (does nothing and returns None).

    Presumably meant to produce the "Weekday-Category" features from the
    notebook's feature outline; the role of ``most_n`` is not shown — TODO confirm.
    """
    pass

def hour_category(df, most_n):
    """Placeholder: not implemented yet (does nothing and returns None).

    Presumably meant to produce the "hourmap-Category" features from the
    notebook's feature outline; the role of ``most_n`` is not shown — TODO confirm.
    """
    pass

def week_price(df):
    """Placeholder: not implemented yet (does nothing and returns None).

    The definition had no parameter list or body, which is a syntax error that
    stops the whole cell from running. It is completed here as a stub with the
    same shape as the sibling placeholders.
    TODO: implement the "Time & Price" features from the notebook's outline.
    """
    pass

In [None]:
class OutlierFilter(object):
    """Remove spatial and per-user outlier rows from a combined frame.

    Rows whose ``row_id`` is not null are never removed, even when a detector
    flags them (presumably these are the test rows — TODO confirm).

    Parameters
    ----------
    filter_rate : float, default 0.005
        Fraction trimmed from each tail of ``latitude`` and ``longitude``.
    filter_num : int, default 40
        A user with more than this many rows is treated as an outlier.
    """

    def __init__(self, filter_rate=0.005, filter_num=40):
        self.filter_rate = filter_rate
        self.filter_num = filter_num

    def run(self, df):
        """Return ``df`` without the detected outlier rows, re-indexed from 0."""
        outlier_idx1 = self.space_outlier_detect(df)
        outlier_idx2 = self.user_outlier_detect(df)
        # Rows carrying a row_id are protected from removal.
        test_idx = df[df.row_id.notnull()].index.tolist()
        outlier_idx = list(set(outlier_idx1 + outlier_idx2) - set(test_idx))
        result = df.drop(outlier_idx, axis=0).reset_index(drop=True)
        return result

    def space_outlier_detect(self, df):
        """Return index labels of rows in the extreme tails of either coordinate.

        A row is flagged when its ``latitude`` or ``longitude`` is below the
        ``filter_rate`` quantile or above the ``1 - filter_rate`` quantile of
        that column.
        """
        if len(df) == 0:
            # np.percentile cannot be evaluated on an empty column.
            return []
        flagged = np.zeros(len(df), dtype=bool)
        for col in ('latitude', 'longitude'):
            values = df[col].values
            lower = np.percentile(values, self.filter_rate * 100)
            upper = np.percentile(values, (1 - self.filter_rate) * 100)
            flagged |= (values < lower) | (values > upper)
        return df.index[flagged].tolist()

    def user_outlier_detect(self, df):
        """Return index labels of rows whose user has more than ``filter_num`` rows."""
        rows_per_user = df.groupby('user_id')['user_id'].transform('count')
        return df.index[(rows_per_user > self.filter_num).values].tolist()

class UserFeatExt(object):
    """User-level feature extraction."""

    def extract_user_id(self, df_):
        """Label-encode ``user_id`` into an integer column vector.

        User ids outside the retained set are replaced by the string 'Null'
        before encoding. The input frame is copied, so the caller's data is
        not modified.

        Returns an array of shape ``(len(df_), 1)``.
        """
        frame = df_.copy()
        rows_per_user = frame.groupby('user_id')['user_id'].count()

        users_without_shop = set(frame.loc[frame.shop_id.isnull(), 'user_id'].values)
        users_with_shop = set(frame.loc[frame.shop_id.notnull(), 'user_id'].values)
        frequent_users = set(rows_per_user[rows_per_user > 5].index.tolist())
        # NOTE(review): every row has shop_id either null or not null, so this
        # union already contains every user_id and the 'Null' replacement below
        # looks like a no-op. An intersection of the first two sets may have
        # been intended — confirm before changing.
        user_set = users_without_shop | users_with_shop | frequent_users

        retained = frame.user_id.isin(user_set)
        frame.loc[~retained, 'user_id'] = 'Null'

        encoder = LabelEncoder()
        codes = encoder.fit_transform(frame.user_id.values)
        return codes.reshape(-1, 1)

    
class SpaceFeat(object):
    """Features derived from the ``latitude`` / ``longitude`` columns."""

    _COORD_COLS = ['latitude', 'longitude']

    def fit(self, df):
        """Fit a min-max scaler on the two coordinate columns of ``df``."""
        self.scaler = MinMaxScaler()
        coords = df[self._COORD_COLS]
        self.scaler.fit(coords)

    def extract_space(self, df):
        """Return the coordinates of ``df`` transformed by the fitted scaler."""
        coords = df[self._COORD_COLS]
        return self.scaler.transform(coords)

    def extract_space_dist(self, df, shop):
        """Return the distance from every row of ``df`` to every row of ``shop``.

        Distances are plain Euclidean distances on the raw latitude/longitude
        values. The result has one row per row of ``df`` and one column per
        row of ``shop``.
        """
        def dist_to_all_shops(point, shop_points):
            squared = np.power(shop_points - point, 2)
            return np.power(np.sum(squared, 1), 0.5)

        sample_points = df[self._COORD_COLS].values
        shop_points = shop[self._COORD_COLS].values
        # Evaluate the helper once per sample row (axis 1 = along coordinates).
        return np.apply_along_axis(dist_to_all_shops, 1, sample_points, shop_points)


class TimeFeat(object):
    """Time-of-week features derived from the datetime column ``time_stamp``.

    Every extractor returns an ``(n_rows, 1)`` array of raw codes when
    ``onehot`` is false. When it is true, it returns the one-hot expansion of
    those codes, with one column per distinct code present in ``df``.
    """

    @staticmethod
    def _finalize(codes, onehot):
        """Return ``codes`` unchanged, or its one-hot expansion when requested."""
        if not onehot:
            return codes
        flat = pd.Series(codes.reshape(-1))
        return pd.get_dummies(flat).values

    def extract_weekday(self, df, onehot=True):
        """Day of week (``time_stamp.dt.weekday``) for each row."""
        n_rows = df.shape[0]
        weekday = df.time_stamp.dt.weekday.values.reshape(n_rows, 1)
        return self._finalize(weekday, onehot)

    def extract_hour(self, df, onehot=True):
        """Hour of day (``time_stamp.dt.hour``) for each row."""
        n_rows = df.shape[0]
        hour = df.time_stamp.dt.hour.values.reshape(n_rows, 1)
        return self._finalize(hour, onehot)

    def extract_hour_map(self, df, onehot=True):
        """Time-of-day slot for each row, obtained by applying ``time_map``."""
        # NOTE(review): `time_map` is not defined in the visible cells. The
        # `hourmap` function from an earlier cell looks like the intended
        # mapper — confirm.
        n_rows = df.shape[0]
        slots = df.time_stamp.dt.time.apply(time_map).values.reshape(n_rows, 1)
        return self._finalize(slots, onehot)

    def extract_weekday_hour_map(self, df, onehot=True):
        """Combined code ``weekday * 100 + hour_map`` for each row."""
        weekday = self.extract_weekday(df, onehot=False)
        hour_map = self.extract_hour_map(df, onehot=False)
        combined = weekday * 100 + hour_map
        return self._finalize(combined, onehot)