### 173 original month_lag で期間を区切り集計を行う
相対的に時系列のズレはあるが、初期3ヶ月のトランザクションを共通集計し、targetの期間と一致すると考えられるグループで予測モデルを作り、他のグループはその予測モデルで初回のロイヤリティを予測する

In [3]:
%load_ext autoreload
%autoreload 2
import gc
import re
import pandas as pd
from __future__ import print_function, division
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
pd.set_option('max_columns', 200)
pd.set_option('max_rows', 200)
import os
import sys
HOME = os.path.expanduser("~")
sys.path.append(f'{HOME}/kaggle/data_analysis/library')
import utils
from utils import get_categorical_features, get_numeric_features, reduce_mem_usage
from preprocessing import get_dummies
import datetime

from tqdm import tqdm
import time
import sys

os.listdir('../input/')
key = 'card_id'
target = 'target'
ignore_list = [key, target, 'merchant_id', 'first_avtive_month']

### Data Load

In [5]:
df_train = utils.read_df_pkl('../input/train0*')
df_test = utils.read_df_pkl('../input/test0*')
df_train.set_index(key, inplace=True)
df_test.set_index(key, inplace=True)
train_test = pd.concat([df_train, df_test], axis=0)

df_hist = utils.read_df_pkl('../input/hist_clean_rdm0*')
df_hist = reduce_mem_usage(df_hist)

df_org_lag = utils.read_pkl_gzip('../input/train_test_original_month_lag.gz')
df_org_lag = reduce_mem_usage(df_org_lag)
df_org_lag.head()

df_org_lag.rename(columns={'purchase_date':'yyyymm'}, inplace=True)
print(df_hist.shape)
df_hist = df_hist.merge(df_org_lag.reset_index()[[key, 'yyyymm', 'org_month_lag']], how='inner', on=[key, 'yyyymm'])
print(df_hist.shape)

100%|██████████| 3/3 [00:00<00:00, 95.88it/s]
100%|██████████| 3/3 [00:00<00:00, 185.42it/s]
100%|██████████| 3/3 [00:11<00:00,  3.76s/it]


Mem. usage decreased to 2554.26 Mb (0.0% reduction)
Mem. usage decreased to 72.58 Mb (21.9% reduction)
(29112361, 21)
(29112361, 22)


In [6]:
def feat_agg(df, fname):
    
    global train_test
    
    df = pd.get_dummies(df, columns=['category_2', 'category_3'])
    
#     aggs = {}
    if fname.count('cat1'):
        aggs = {
        'category_3_A': ['mean'],
        'category_3_B': ['mean'],
        'category_3_C': ['mean'],
        }
    else:
        aggs = {
        'category_2_1.0': ['mean'],
        'category_2_2.0': ['mean'],
        'category_2_3.0': ['mean'],
        'category_2_4.0': ['mean'],
        'category_2_5.0': ['mean'],
        'category_3_A': ['mean'],
        'category_3_B': ['mean'],
        'category_3_C': ['mean'],
        }
    
    if not(fname.count('cat')):
        aggs['category_1'] = ['sum', 'mean']
            
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    df['first_active_month'] =  pd.to_datetime(df['first_active_month'])
    df['month_diff'] = (pd.to_datetime('2018-05-01') - df['purchase_date']).dt.days
    
    aggs['month_lag'] = ['mean', 'std']
    aggs['yyyy_week'] = ['nunique']
    aggs['purchase_date'] = ['max','min']
    aggs['month_diff'] = ['mean', 'std']
    
    aggs['purchase_amount'] = ['sum','max', 'min','mean']
    aggs['installments'] = ['sum', 'max', 'min','mean', 'std']
    
    aggs['merchant_id'] = ['nunique']
    aggs['merchant_category_id'] = ['nunique']
    aggs['card_id'] = ['size']
    aggs['city_id'] = ['nunique']
    aggs['state_id'] = ['nunique']
    aggs['subsector_id'] = ['nunique']
    
    new_columns = get_new_columns(fname, aggs)
    df_agg = df.groupby(key).agg(aggs)
    df_agg.columns = new_columns
    df_agg.reset_index(drop=False,inplace=True)
     
    df_agg[f'{fname}_this_term'] =  (pd.to_datetime(df_agg[f'{fname}_purchase_date_max']) - pd.to_datetime(df_agg[f'{fname}_purchase_date_min'])).dt.days
    df_agg[f'{fname}_freq_per_this_term'] = df_agg[f'{fname}_card_id_size']              / (df_agg[f'{fname}_this_term'] + 1.0)
    df_agg[f'{fname}_amount_per_this_term'] = df_agg[f'{fname}_purchase_amount_sum']     / (df_agg[f'{fname}_this_term'] + 1.0)
    df_agg[f'{fname}_instthisments_per_this_term'] = df_agg[f'{fname}_installments_sum'] / (df_agg[f'{fname}_this_term'] + 1.0)
    
    df_agg[f'{fname}_amount_per_installments_sum'] = df_agg[f'{fname}_purchase_amount_sum'] / (df_agg[f'{fname}_installments_sum'] + 1.0)
    df_agg[f'{fname}_amount_per_installments_mean'] = df_agg[f'{fname}_purchase_amount_mean'] / (df_agg[f'{fname}_installments_mean'] + 1.0)
    
#     df_agg.drop(f'{fname}_all_term_max', axis=1, inplace=True)
    
    # Monthly Agg
#     del aggs['all_term']
    new_columns = get_new_columns(fname + '_monthly_avg', aggs)
    month_agg = df.groupby([key, 'yyyymm']).agg(aggs)
        
    month_agg.columns = new_columns
    month_agg = month_agg.reset_index().drop('yyyymm', axis=1).groupby([key]).mean()
    month_agg.reset_index(drop=False,inplace=True)
    
    train_test = train_test.join(df_agg.set_index(key)).join(month_agg.set_index(key))
    del df_agg, month_agg
    gc.collect()
    
    
def get_new_columns(name,aggs):
    return [name + '_' + k + '_' + agg for k in aggs.keys() for agg in aggs[k]]


def save_feature(col):
    ignore_features = ['unix_first_active_month', 'first_active_month', 'card_id', target]
    if col in ignore_features: return
    if (col.count('feature_')):return
    if (col.count('purchase_date')):return
    feature = train_test[col].fillna(-1).astype('float32').values
    utils.to_pkl_gzip(path = f'../features/1_first_valid/{fname}_{col}@', obj=feature)


def impute_agg(df):
    for col in df.columns:
        if col.count('isnull'):
            df[col].fillna(-1, inplace=True)
        if col.count('null_cnt'):
            df[col].fillna(-1, inplace=True)
        if col.count('month_lag'):
            df[col].fillna(df[col].min()-15, inplace=True)
        if col.count('month_diff'):
            df[col].fillna(df[col].max()+100, inplace=True)
        if col.count('nunique'):
            df[col].fillna(-1, inplace=True)
        if col.count('amount'):
            df[col].fillna(-1, inplace=True)
        if col.count('installments'):
            df[col].fillna(-2, inplace=True)
        if col.count('unix_date'):
            df[col].fillna(df[col].min()-100000, inplace=True)
        if col.count('size'):
            df[col].fillna(-1, inplace=True)
        if col.count('term'):
            df[col].fillna(-1, inplace=True)
        if col.count('per'):
            df[col].fillna(-1, inplace=True)
        if col.count('dummie'):
            df[col].fillna(-1, inplace=True)
            
    return df

In [9]:
# term
term_num = 1
base_list = np.zeros(term_num).astype('int8')
last_list = np.arange(1, term_num+1, 1)+2
col_month = 'org_month_lag'
cat1_0 = False
# cat1_0 = True
feat_list = []

for base_month, last_month in zip(base_list, last_list):
    
    term_type = f'org_lag{base_month}_{last_month}'
    
    df = df_hist[df_hist[col_month]<=last_month]
    df = df[df[col_month]>=base_month]
    
    auth1 = df[df.authorized_flag==1]
    auth0 = df[df.authorized_flag==0]

    if cat1_0:
        df_list = [
            auth1[auth1.category_1==1]
            ,auth1[auth1.category_1==0]
            ,auth0[auth0.category_1==1]
            ,auth0[auth0.category_1==0]
        ]
        fname_list = [f'{term_type}_auth1_cat1', f'{term_type}_auth1_cat0', f'{term_type}_auth0_cat1', f'{term_type}_auth0_cat0']
    else:
        df_list = [auth1, auth0]
        fname_list = [f'{term_type}_auth1', f'{term_type}_auth0']
        
        
    for df, fname in zip(df_list, fname_list):
        
        df_train = utils.read_df_pkl('../input/train0*')
        df_test = utils.read_df_pkl('../input/test0*')
        df_train.set_index(key, inplace=True)
        df_test.set_index(key, inplace=True)
        train_test = pd.concat([df_train, df_test], axis=0)
        
        feat_agg(df, fname)

    
        train_test['first_active_month'] =  pd.to_datetime(train_test['first_active_month'])
        for col in train_test.columns:
            if col.count('purchase_date'):
                train_test[col] = pd.to_datetime( train_test[col])
        
        # 最終までの期間
        train_test[f'{fname}_term_from_first_month'] = (train_test[f'{fname}_purchase_date_max'] - train_test[f'first_active_month']).dt.days
        
        # 20180501までの期間を各データセットパターンで
        train_test[f'{fname}_term_from_now'] = (pd.to_datetime('2018-05-01') - train_test[f'{fname}_purchase_date_max']).dt.days
        
        
        # auth1, auth0の間の期間
        # とりあえずいらない
#         if cat1_0:
#             train_test[f'{term_type}_auth1_0_cat0_term'] = train_test[f'{term_type}_auth1_cat0_term_from_now'] - train_test[f'{term_type}_auth0_cat0_term_from_now']
#             train_test[f'{term_type}_auth1_0_cat1_term'] = train_test[f'{term_type}_auth1_cat1_term_from_now'] - train_test[f'{term_type}_auth0_cat1_term_from_now']
#         else:
#             train_test[f'{term_type}_auth1_0_term'] = train_test[f'{term_type}_auth1_term_from_now'] - train_test[f'{term_type}_auth0_term_from_now']
#             train_test[f'{term_type}_auth1_0_term'] = train_test[f'{term_type}_auth1_term_from_now'] - train_test[f'{term_type}_auth0_term_from_now']
#             train_test[f'{term_type}_auth1_0_purchase_date_max_diff'] = (train_test[f'{term_type}_auth1_purchase_date_max'] - train_test[f'{term_type}_auth0_purchase_date_max']).dt.days
#             train_test[f'{term_type}_auth1_0_purchase_date_min_diff'] = (train_test[f'{term_type}_auth1_purchase_date_min'] - train_test[f'{term_type}_auth0_purchase_date_min']).dt.days
        
        date_min_cols = [col for col in train_test.columns if col.count('purchase_date_min') and not(col.count('diff'))]
        for col in date_min_cols:
            fname = ''
            if col.count('auth1'):
                fname += 'auth1_'
            elif col.count('auth0'):
                fname += 'auth0_'
            if col.count('new'):
                fname += 'new_'
            if col.count('cat1'):
                fname += 'cat1_'
            elif col.count('cat0'):
                fname += 'cat0_'
            train_test[f'{term_type}_{fname}first_buy'] = (train_test[col] - train_test[f'first_active_month']).dt.days

        
        train_test = impute_agg(train_test)
            
        print(f"Saving features... {fname} {train_test.shape}")
        
        ignore_features = ['first_active_month', 'card_id', target]

        train_test.columns = [col.replace('.', '_') for col in train_test.columns]
        
        feat_list.append(train_test.copy())
        
        del train_test
        gc.collect()

100%|██████████| 3/3 [00:00<00:00, 110.75it/s]
100%|██████████| 3/3 [00:00<00:00, 186.15it/s]


Saving features... auth1_ (325540, 76)


100%|██████████| 3/3 [00:00<00:00, 113.99it/s]
100%|██████████| 3/3 [00:00<00:00, 183.22it/s]


Saving features... auth0_ (325540, 76)


In [10]:
feat_list[0].head()

Unnamed: 0_level_0,feature_1,feature_2,feature_3,first_active_month,target,org_lag0_3_auth1_category_2_1_0_mean,org_lag0_3_auth1_category_2_2_0_mean,org_lag0_3_auth1_category_2_3_0_mean,org_lag0_3_auth1_category_2_4_0_mean,org_lag0_3_auth1_category_2_5_0_mean,org_lag0_3_auth1_category_3_A_mean,org_lag0_3_auth1_category_3_B_mean,org_lag0_3_auth1_category_3_C_mean,org_lag0_3_auth1_category_1_sum,org_lag0_3_auth1_category_1_mean,org_lag0_3_auth1_month_lag_mean,org_lag0_3_auth1_month_lag_std,org_lag0_3_auth1_yyyy_week_nunique,org_lag0_3_auth1_purchase_date_max,org_lag0_3_auth1_purchase_date_min,org_lag0_3_auth1_month_diff_mean,org_lag0_3_auth1_month_diff_std,org_lag0_3_auth1_purchase_amount_sum,org_lag0_3_auth1_purchase_amount_max,org_lag0_3_auth1_purchase_amount_min,org_lag0_3_auth1_purchase_amount_mean,org_lag0_3_auth1_installments_sum,org_lag0_3_auth1_installments_max,org_lag0_3_auth1_installments_min,org_lag0_3_auth1_installments_mean,org_lag0_3_auth1_installments_std,org_lag0_3_auth1_merchant_id_nunique,org_lag0_3_auth1_merchant_category_id_nunique,org_lag0_3_auth1_card_id_size,org_lag0_3_auth1_city_id_nunique,org_lag0_3_auth1_state_id_nunique,org_lag0_3_auth1_subsector_id_nunique,org_lag0_3_auth1_this_term,org_lag0_3_auth1_freq_per_this_term,org_lag0_3_auth1_amount_per_this_term,org_lag0_3_auth1_instthisments_per_this_term,org_lag0_3_auth1_amount_per_installments_sum,org_lag0_3_auth1_amount_per_installments_mean,org_lag0_3_auth1_monthly_avg_category_2_1_0_mean,org_lag0_3_auth1_monthly_avg_category_2_2_0_mean,org_lag0_3_auth1_monthly_avg_category_2_3_0_mean,org_lag0_3_auth1_monthly_avg_category_2_4_0_mean,org_lag0_3_auth1_monthly_avg_category_2_5_0_mean,org_lag0_3_auth1_monthly_avg_category_3_A_mean,org_lag0_3_auth1_monthly_avg_category_3_B_mean,org_lag0_3_auth1_monthly_avg_category_3_C_mean,org_lag0_3_auth1_monthly_avg_category_1_sum,org_lag0_3_auth1_monthly_avg_category_1_mean,org_lag0_3_auth1_monthly_avg_month_lag_mean,org_lag0_3_auth1_monthly_avg_month_lag_std,org_lag0_3_auth1_monthly_avg_yyyy_week_nunique,org_lag0_3_auth1_monthly_avg_month_diff_mean,org_lag0_3_auth1_monthly_avg_month_diff_std,org_lag0_3_auth1_monthly_avg_purchase_amount_sum,org_lag0_3_auth1_monthly_avg_purchase_amount_max,org_lag0_3_auth1_monthly_avg_purchase_amount_min,org_lag0_3_auth1_monthly_avg_purchase_amount_mean,org_lag0_3_auth1_monthly_avg_installments_sum,org_lag0_3_auth1_monthly_avg_installments_max,org_lag0_3_auth1_monthly_avg_installments_min,org_lag0_3_auth1_monthly_avg_installments_mean,org_lag0_3_auth1_monthly_avg_installments_std,org_lag0_3_auth1_monthly_avg_merchant_id_nunique,org_lag0_3_auth1_monthly_avg_merchant_category_id_nunique,org_lag0_3_auth1_monthly_avg_card_id_size,org_lag0_3_auth1_monthly_avg_city_id_nunique,org_lag0_3_auth1_monthly_avg_state_id_nunique,org_lag0_3_auth1_monthly_avg_subsector_id_nunique,org_lag0_3_auth1_term_from_first_month,org_lag0_3_auth1_term_from_now,org_lag0_3_auth1_first_buy
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1
C_ID_92a2005557,5,2,1,2017-06-01,-0.820283,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-6.318182,0.789051,14.0,2017-09-30 20:06:30,2017-06-27 14:18:08,268.090909,24.8018,7.519531,0.439697,0.007523,0.068359,0.0,0.0,0.0,0.0,0.0,41.0,16.0,110.0,3.0,1.0,10.0,95.0,1.145833,0.078328,0.0,7.519531,0.068359,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-6.5,0.0,4.25,270.671985,6.685468,1.879883,0.234375,0.016998,0.071289,0.0,0.0,0.0,0.0,0.0,14.75,7.75,27.5,1.5,1.0,4.75,121.0,212.0,26.0
C_ID_3d0044924f,4,1,0,2017-01-01,0.392913,0.739726,0.0,0.0,0.0,0.0,0.0,0.657534,0.342466,19.0,0.260274,-10.630137,1.148754,16.0,2017-04-28 12:51:22,2017-01-06 16:29:42,427.452055,33.540208,14.992188,2.0,0.00602,0.205444,145.0,10.0,1.0,1.986328,1.896484,48.0,31.0,73.0,6.0,2.0,19.0,111.0,0.651786,0.133859,1.294643,0.102661,0.068787,0.770292,0.0,0.0,0.0,0.0,0.0,0.671627,0.328373,4.75,0.229708,-10.5,0.0,4.5,423.482684,7.52555,3.75,1.329102,0.009743,0.209717,36.25,8.0,1.0,2.023438,1.946289,15.5,12.25,18.25,3.25,1.75,9.25,117.0,367.0,5.0
C_ID_d639edf6cd,2,2,0,2016-08-01,0.688056,,,,,,,,,,,-28.0,-15.0,-1.0,NaT,NaT,584.0,185.559921,-1.0,-1.0,-1.0,-1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,,,,,,,,,,,-28.0,-15.0,-1.0,584.0,121.213203,-1.0,-1.0,-1.0,-1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
C_ID_186d6a6901,4,3,0,2017-09-01,0.142495,0.181818,0.0,0.0,0.690909,0.0,0.0,0.890909,0.090909,7.0,0.127273,-3.854545,0.869614,13.0,2017-12-29 15:44:09,2017-09-26 16:22:21,189.4,25.119418,5.953125,2.0,0.007507,0.108215,62.0,3.0,0.0,1.126953,0.51123,39.0,22.0,55.0,7.0,5.0,13.0,93.0,0.585106,0.063331,0.659574,0.094482,0.050873,0.139296,0.0,0.0,0.654099,0.0,0.0,0.841014,0.123272,1.75,0.206605,-3.5,0.0,3.25,177.196498,6.467581,1.488281,0.713867,0.009956,0.158936,15.5,2.0,0.75,1.139648,0.368652,11.0,7.75,13.75,3.25,2.75,5.25,119.0,122.0,25.0
C_ID_cdbd2c0db2,1,3,0,2017-11-01,-0.159749,0.078125,0.0,0.0,0.820312,0.007812,0.0,0.96875,0.03125,12.0,0.09375,-1.320312,1.02668,17.0,2018-02-28 20:40:41,2017-11-12 00:00:00,114.039062,31.794716,19.890625,2.0,0.000762,0.155396,144.0,12.0,1.0,1.125,1.003906,65.0,26.0,128.0,6.0,6.0,17.0,108.0,1.174312,0.182483,1.321101,0.137207,0.07312,0.07675,0.0,0.0,0.809777,0.011905,0.0,0.960313,0.039687,3.0,0.101568,-1.5,0.0,4.75,119.060438,8.433847,4.972656,1.337891,0.014877,0.165405,36.0,4.5,1.0,1.141602,0.681152,20.0,10.0,32.0,3.5,3.5,7.5,119.0,61.0,11.0


### Save Feature

In [12]:
ignore_features = ['first_active_month', 'card_id', target]
fname = '173_l03'

for df_feat in feat_list:
    for col in df_feat.columns:
        if col in ignore_features: continue
        if (col.count('feature_')):continue
        if (col.count('unix_date')):continue
    #     if (col.count('purchase_date')) and not(col.count('diff')):continue
        if (col.count('purchase_date')):continue
        feature = df_feat[col].fillna(-1).astype('float32').values
            
        utils.to_pkl_gzip(path = f'../features/1_first_valid/{fname}_{col}@', obj=feature)
#         utils.to_pkl_gzip(path = f'../features/2_second_valid/{fname}_{col}@', obj=feature)

In [72]:
# なぜかうまくできなかったので個別に

fname = '173_l02'

aggs = {}
aggs['purchase_date'] = ['max', 'min']
max_min_1 = auth1[[key, 'purchase_date']].groupby(key).agg(aggs)
max_min_0 = auth0[[key, 'purchase_date']].groupby(key).agg(aggs)

max_min_1.columns = ['auth1_purchase_date_max', 'auth1_purchase_date_min']
max_min_0.columns = ['auth0_purchase_date_max', 'auth0_purchase_date_min']

max_min = max_min_1.join(max_min_0)
max_min.head()

max_min[f'{term_type}_auth1_0_purchase_date_max_diff'] = (max_min[f'auth1_purchase_date_max'] - max_min[f'auth0_purchase_date_max']).dt.days
max_min[f'{term_type}_auth1_0_purchase_date_min_diff'] = (max_min[f'auth1_purchase_date_min'] - max_min[f'auth0_purchase_date_min']).dt.days

col_list = [ f'{term_type}_auth1_0_purchase_date_max_diff' ,f'{term_type}_auth1_0_purchase_date_min_diff' ]
max_min = train_test.join(max_min)

for col in col_list:
    tmp = max_min[col].fillna(-1).values.astype('float32')
    print(tmp.shape)
    utils.to_pkl_gzip(path = f'../features/2_second_valid/{fname}_{col}@', obj=tmp)

(325540,)
(325540,)


In [81]:
#========================================================================
# City_id
#========================================================================
df = auth1
df['cnt'] = 1
df_city = df.groupby([key, 'city_id'])['cnt'].sum().reset_index()
df_rec = df.groupby([key])['cnt'].sum().reset_index()

main = df_city.merge(df_rec, how='inner', on=key)
del df_city, df_rec
gc.collect()

df['authorized_flag'] = df['authorized_flag'].map(lambda x: 0 if x=='N' else 1)
auth_mean = df.groupby('city_id')['authorized_flag'].mean()

main['main_city_ratio'] = main['cnt_x'] / (main['cnt_y'])
main_city_idx = main.groupby(key)['main_city_ratio'].idxmax()

main_city = main.iloc[main_city_idx][[key, 'city_id', 'main_city_ratio']]
main_city['main_city_auth_mean'] = main_city['city_id'].map(auth_mean)
print(main_city.shape)
main_city.head()

Unnamed: 0,card_id,city_id,cnt_x,cnt_y
0,C_ID_00007093c1,-1,2,20
1,C_ID_00007093c1,244,18,20
2,C_ID_0001238066,-1,4,36
3,C_ID_0001238066,69,5,36
4,C_ID_0001238066,291,1,36
5,C_ID_0001238066,314,17,36
6,C_ID_0001238066,333,9,36
7,C_ID_0001793786,69,1,19
8,C_ID_0001793786,96,1,19
9,C_ID_0001793786,179,5,19


(234208, 4)


Unnamed: 0,card_id,city_id,main_city_ratio,main_city_auth_mean
1,C_ID_00007093c1,244,0.9,1
5,C_ID_0001238066,314,0.472222,1
10,C_ID_0001793786,204,0.631579,1
13,C_ID_000183fdda,161,0.872727,1
17,C_ID_00027503e2,146,0.857143,1


In [82]:
feat_no = '175_l02'

df_train = utils.read_df_pkl('../input/train0*')
df_test = utils.read_df_pkl('../input/test0*')
df_train.set_index(key, inplace=True)
df_test.set_index(key, inplace=True)
train_test = pd.concat([df_train, df_test], axis=0)
main_city.set_index(key, inplace=True)

df_feat = train_test.merge(main_city, how='left', on=key)

for col in df_feat.columns:
    if col in ignore_list:continue
    if not(col.count('main_city_ratio')):continue
        
    df_feat[col].fillna(-1, inplace=True)
    feature = df_feat[col].astype('float32').values
    
#     utils.to_pkl_gzip(path = f'../features/1_first_valid/{feat_no}_all_{col}@', obj=feature)
    utils.to_pkl_gzip(path = f'../features/2_second_valid/{feat_no}_{col}@', obj=feature)

100%|██████████| 3/3 [00:00<00:00, 41.97it/s]
100%|██████████| 3/3 [00:00<00:00, 81.18it/s]
