In [13]:
import pandas as pd 
import numpy as np
from tqdm import tqdm 
import datetime, time, gc 
# from utils import distance, haversine, standard, pad_seq 
from scipy.stats import skew, kurtosis
from zipfile import ZipFile
from collections import Counter
from sklearn.metrics import roc_auc_score as auc
# import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [14]:
# !pip install pandas==1.1.0

In [15]:
all_data = pd.read_pickle("data/train_test.pkl")

In [16]:
all_data['communication_hours'] = all_data['communication_onlinerate'].apply(lambda x: len(x.split('^')))

In [6]:
%%time

def group_fea(df,key,target):
    tmp = df.groupby(key, as_index=False)[target].agg({
        key+target + '_nunique': 'nunique',
    }).reset_index()
    del tmp['index']
    # print("**************************{}**************************".format(target))
    return tmp

feature_key = ['uid','age', 'career','city']
feature_target = ['task_id','adv_id','dev_id','slot_id', 'adv_prim_id', 'spread_app_id','indu_name']

for key in tqdm(feature_key):
    for target in feature_target:
        tmp = group_fea(all_data, key, target)
        all_data = all_data.merge(tmp, on=key, how='left')


# 计数特征
print("======计数特征======")
for col in tqdm(['adv_id', 'adv_prim_id', 'task_id', 'uid', 'city']):
#     print(col)
    col_name = '{}_count_fea'.format(col)
    all_data[col_name] = all_data[col].map(all_data[col].value_counts())
    all_data[col_name] = (all_data[col_name]-all_data[col_name].min()) / (all_data[col_name].max()-all_data[col_name].min())
    all_data[col_name] = all_data[col_name].astype(np.float32)
    
# 交叉特征计数特征
usr_cols = ['uid', 'age', 'gender', 'city', 'career'] 
adv_cols = ['task_id', 'adv_id', 'spread_app_id', 'adv_prim_id', 'dev_id', 'app_second_class']
all_data['cnt'] = 1


# 组合特征点击次数
for i in tqdm(range(len(usr_cols))):
    for j in range(len(adv_cols)):
        col_name = "cnt_click_of_"+usr_cols[i]+"_and_"+adv_cols[j]
        s = time.time()
        se = all_data.groupby([usr_cols[i], adv_cols[j]])['cnt'].sum()
        dt = all_data[[usr_cols[i], adv_cols[j]]]
        se = (pd.merge(dt, se.reset_index(), how='left',
                        on=[usr_cols[i], adv_cols[j]]).sort_index()['cnt'].fillna(value=0)).astype(np.int32)
        semax = se.max()
        semin = se.min()
        all_data[col_name] = ((se-se.min())/(se.max()-se.min())*100).fillna(value=0).astype(np.int32).values

100%|██████████| 4/4 [03:17<00:00, 49.42s/it]
  0%|          | 0/5 [00:00<?, ?it/s]



100%|██████████| 5/5 [00:02<00:00,  2.26it/s]
100%|██████████| 5/5 [02:03<00:00, 24.72s/it]

CPU times: user 3min 59s, sys: 1min 23s, total: 5min 23s
Wall time: 5min 23s





In [18]:
# 统计特征

def group_fea(df,key,target):
    tmp = df.groupby(key, as_index=False)[target].agg({
        key+target + '_count': 'count',
    }).reset_index()
    del tmp['index']
    return tmp

feature_key = ['uid','age', 'career', 'city']
feature_target = ['task_id','adv_id','dev_id','slot_id', 'adv_prim_id', 'spread_app_id','indu_name']

for key in tqdm(feature_key):
    for target in feature_target:
        tmp = group_fea(all_data, key, target)
        all_data = all_data.merge(tmp, on=key, how='left')

100%|██████████| 4/4 [01:58<00:00, 29.59s/it]


In [19]:
##统计各类别在此次出现前的count数
def count_cat_prep(df, column, newcolumn):
    count_dict = {}
    df[newcolumn] = 0
    data = df[[column,newcolumn]].values
    for cat_list in data:
        if cat_list[0] not in count_dict:
            count_dict[cat_list[0]] = 0
            cat_list[1] = 0
        else:
            count_dict[cat_list[0]] += 1
            cat_list[1] = count_dict[cat_list[0]]
    df[[column,newcolumn]] = data

all_data['user_task_id'] = all_data['uid'].astype(str)+"_"+all_data['task_id'].astype(str)
all_data['user_adv_id'] = all_data['uid'].astype(str)+"_"+all_data['adv_id'].astype(str)
all_data['user_dev_id'] = all_data['uid'].astype(str)+"_"+all_data['dev_id'].astype(str)
all_data['user_advprim_id'] = all_data['uid'].astype(str)+"_"+all_data['adv_prim_id'].astype(str)
all_data['user_slot_id'] = all_data['uid'].astype(str)+"_"+all_data['slot_id'].astype(str)
all_data['user_indu_name'] = all_data['uid'].astype(str)+"_"+all_data['indu_name'].astype(str)

##统计各类别在总样本中的count数
for column in tqdm(['uid', 'task_id', 'adv_id', 'dev_id', 'adv_prim_id', 'slot_id', 'indu_name',
               'user_task_id','user_adv_id','user_dev_id','user_advprim_id','user_slot_id','user_indu_name']):
    count_cat_prep(all_data, column, column+'_click_count_prep')

100%|██████████| 13/13 [03:24<00:00, 15.71s/it]


In [23]:
for feat in tqdm(['user_task_id','user_adv_id','user_dev_id','user_advprim_id','user_slot_id','user_indu_name']):
    lbe = LabelEncoder()
    all_data[feat] = lbe.fit_transform(all_data[feat])

100%|██████████| 6/6 [01:50<00:00, 18.46s/it]


In [25]:
all_data = reduce_mem_usage(all_data)

100%|██████████| 84/84 [00:56<00:00,  1.48it/s]

Mem. usage decreased to 1647.95 Mb (56.2% reduction)





In [27]:
count_cols = ['uidtask_id_count', 'uidadv_id_count',
       'uiddev_id_count', 'uidslot_id_count', 'uidadv_prim_id_count',
       'uidspread_app_id_count', 'uidindu_name_count', 'agetask_id_count',
       'ageadv_id_count', 'agedev_id_count', 'ageslot_id_count',
       'ageadv_prim_id_count', 'agespread_app_id_count', 'ageindu_name_count',
       'careertask_id_count', 'careeradv_id_count', 'careerdev_id_count',
       'careerslot_id_count', 'careeradv_prim_id_count',
       'careerspread_app_id_count', 'careerindu_name_count',
       'citytask_id_count', 'cityadv_id_count', 'citydev_id_count',
       'cityslot_id_count', 'cityadv_prim_id_count', 'cityspread_app_id_count',
       'cityindu_name_count', 'user_task_id', 'user_adv_id', 'user_dev_id',
       'user_advprim_id', 'user_slot_id', 'user_indu_name',
       'uid_click_count_prep', 'task_id_click_count_prep',
       'adv_id_click_count_prep', 'dev_id_click_count_prep',
       'adv_prim_id_click_count_prep', 'slot_id_click_count_prep',
       'indu_name_click_count_prep', 'user_task_id_click_count_prep',
       'user_adv_id_click_count_prep', 'user_dev_id_click_count_prep',
       'user_advprim_id_click_count_prep', 'user_slot_id_click_count_prep',
       'user_indu_name_click_count_prep']
all_data[count_cols].to_pickle("data/train_test_countfea.pkl")

In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score
import gc, time, os, sys, argparse
import warnings
warnings.filterwarnings('ignore')
# train=pd.read_csv('invite_info.txt',sep='\s+',names=['qid','uid','time','target'])
# test=pd.read_csv('invite_info_evaluate_1.txt',sep='\s+',names=['qid','uid','time'])
import scipy.sparse
from scipy import linalg
from scipy.special import iv
import scipy.sparse as sp
from sklearn import preprocessing
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm
import networkx as nx

class ProNE():
    def __init__(self, G, emb_size=100, step=10, theta=0.5, mu=0.2, n_iter=5, random_state=2019):
        self.G = G
        self.emb_size = emb_size
        self.G = self.G.to_undirected()
        self.node_number = self.G.number_of_nodes()
        self.random_state = random_state
        self.step = step
        self.theta = theta
        self.mu = mu
        self.n_iter = n_iter
        
        mat = scipy.sparse.lil_matrix((self.node_number, self.node_number))

        for e in tqdm(self.G.edges()):
            if e[0] != e[1]:
                mat[int(e[0]), int(e[1])] = 1
                mat[int(e[1]), int(e[0])] = 1
        self.mat = scipy.sparse.csr_matrix(mat)
        print(mat.shape)

    def get_embedding_rand(self, matrix):
        # Sparse randomized tSVD for fast embedding
        t1 = time.time()
        l = matrix.shape[0]
        smat = scipy.sparse.csc_matrix(matrix)  # convert to sparse CSC format
        print('svd sparse', smat.data.shape[0] * 1.0 / l ** 2)
        U, Sigma, VT = randomized_svd(smat, n_components=self.emb_size, 
                                      n_iter=self.n_iter, 
                                      random_state=self.random_state)
        U = U * np.sqrt(Sigma)
        U = preprocessing.normalize(U, "l2")
        print('sparsesvd time', time.time() - t1)
        return U

    def get_embedding_dense(self, matrix, emb_size):
        # get dense embedding via SVD
        t1 = time.time()
        U, s, Vh = linalg.svd(matrix, full_matrices=False, 
                              check_finite=False, overwrite_a=True)
        U = np.array(U)
        U = U[:, :emb_size]
        s = s[:emb_size]
        s = np.sqrt(s)
        U = U * s
        U = preprocessing.normalize(U, "l2")
        print('densesvd time', time.time() - t1)
        return U

    def fit(self, tran, mask):
        # Network Embedding as Sparse Matrix Factorization
        t1 = time.time()
        l1 = 0.75
        C1 = preprocessing.normalize(tran, "l1")
        neg = np.array(C1.sum(axis=0))[0] ** l1

        neg = neg / neg.sum()

        neg = scipy.sparse.diags(neg, format="csr")
        neg = mask.dot(neg)
        print("neg", time.time() - t1)

        C1.data[C1.data <= 0] = 1
        neg.data[neg.data <= 0] = 1

        C1.data = np.log(C1.data)
        neg.data = np.log(neg.data)

        C1 -= neg
        F = C1
        features_matrix = self.get_embedding_rand(F)
        return features_matrix

    def chebyshev_gaussian(self, A, a, order=10, mu=0.5, s=0.5):
        # NE Enhancement via Spectral Propagation
        print('Chebyshev Series -----------------')
        t1 = time.time()

        if order == 1:
            return a

        A = sp.eye(self.node_number) + A
        DA = preprocessing.normalize(A, norm='l1')
        L = sp.eye(self.node_number) - DA

        M = L - mu * sp.eye(self.node_number)

        Lx0 = a
        Lx1 = M.dot(a)
        Lx1 = 0.5 * M.dot(Lx1) - a

        conv = iv(0, s) * Lx0
        conv -= 2 * iv(1, s) * Lx1
        for i in range(2, order):
            Lx2 = M.dot(Lx1)
            Lx2 = (M.dot(Lx2) - 2 * Lx1) - Lx0
            #         Lx2 = 2*L.dot(Lx1) - Lx0
            if i % 2 == 0:
                conv += 2 * iv(i, s) * Lx2
            else:
                conv -= 2 * iv(i, s) * Lx2
            Lx0 = Lx1
            Lx1 = Lx2
            del Lx2
            print('Bessell time', i, time.time() - t1)
        mm = A.dot(a - conv)
        self.embeddings = self.get_embedding_dense(mm, self.emb_size)
        return self.embeddings
    
    def transform(self):
        if self.embeddings is None:
            print("Embedding is not train")
            return {}
        self.embeddings = pd.DataFrame(self.embeddings)
        self.embeddings.columns = ['ProNE_Emb_{}'.format(i) for i in range(len(self.embeddings.columns))]
        self.embeddings = self.embeddings.reset_index().rename(columns={'index' : 'nodes'}).sort_values(by=['nodes'],ascending=True).reset_index(drop=True)

        return self.embeddings

In [8]:
!pip install networkx

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Collecting networkx
  Downloading http://mirrors.tencentyun.com/pypi/packages/9b/cd/dc52755d30ba41c60243235460961fc28022e5b6731f16c268667625baea/networkx-2.5-py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 11.3 MB/s eta 0:00:01
Installing collected packages: networkx
Successfully installed networkx-2.5


In [10]:
uid_lbl,qid_lbl = LabelEncoder(), LabelEncoder()
all_data['new_uid'] = uid_lbl.fit_transform(all_data['uid'])
all_data['new_qid'] = qid_lbl.fit_transform(all_data['task_id'])
all_data['new_qid'] += all_data['new_uid'].max() + 1
G = nx.Graph()
G.add_edges_from(all_data[['new_uid','new_qid']].values)

model = ProNE(G,emb_size=48, n_iter=6,step=12) 

features_matrix = model.fit(model.mat, model.mat)
model.chebyshev_gaussian(model.mat, features_matrix,
                         model.step, model.mu, model.theta)
emb = model.transform() 

emb = emb[emb['nodes'].isin(all_data['new_uid'])]
emb['nodes'] = uid_lbl.inverse_transform(emb['nodes'])
emb.rename(columns={'nodes' : 'uid'},inplace=True)

for col in emb.columns[1:]:
    emb[col] = emb[col].astype(np.float32)

100%|██████████| 6627212/6627212 [01:55<00:00, 57563.03it/s]


(966194, 966194)
neg 1.409947395324707
svd sparse 1.419816395136892e-05
sparsesvd time 68.86140561103821
Chebyshev Series -----------------
Bessell time 2 6.790277719497681
Bessell time 3 9.962549448013306
Bessell time 4 13.141770124435425
Bessell time 5 16.316610097885132
Bessell time 6 19.49117684364319
Bessell time 7 22.638962507247925
Bessell time 8 25.79183268547058
Bessell time 9 28.953147888183594
Bessell time 10 32.121281147003174
Bessell time 11 35.28276038169861
densesvd time 3.8150386810302734


In [11]:
emb

Unnamed: 0,uid,ProNE_Emb_0,ProNE_Emb_1,ProNE_Emb_2,ProNE_Emb_3,ProNE_Emb_4,ProNE_Emb_5,ProNE_Emb_6,ProNE_Emb_7,ProNE_Emb_8,ProNE_Emb_9,ProNE_Emb_10,ProNE_Emb_11,ProNE_Emb_12,ProNE_Emb_13,ProNE_Emb_14,ProNE_Emb_15,ProNE_Emb_16,ProNE_Emb_17,ProNE_Emb_18,ProNE_Emb_19,ProNE_Emb_20,ProNE_Emb_21,ProNE_Emb_22,ProNE_Emb_23,ProNE_Emb_24,ProNE_Emb_25,ProNE_Emb_26,ProNE_Emb_27,ProNE_Emb_28,ProNE_Emb_29,ProNE_Emb_30,ProNE_Emb_31,ProNE_Emb_32,ProNE_Emb_33,ProNE_Emb_34,ProNE_Emb_35,ProNE_Emb_36,ProNE_Emb_37,ProNE_Emb_38,ProNE_Emb_39,ProNE_Emb_40,ProNE_Emb_41,ProNE_Emb_42,ProNE_Emb_43,ProNE_Emb_44,ProNE_Emb_45,ProNE_Emb_46,ProNE_Emb_47
0,1000001,-0.368049,-0.205605,0.245584,-0.465182,0.022273,-0.136212,-0.239742,0.069814,0.112702,-0.250295,-0.300914,-0.071377,-0.018565,0.050335,0.098738,-0.015571,0.076139,-0.003024,0.052570,0.084980,0.016284,-0.011652,-0.039404,0.006671,0.080930,0.005801,0.097062,0.042537,0.098667,-0.014544,-0.030883,-0.035041,-0.036763,0.051876,-0.047577,0.009678,0.073017,0.018323,-0.060582,-0.037882,-0.077012,0.026160,-0.040626,-0.001764,-0.021964,0.081913,-0.223155,0.387472
1,1000002,-0.320304,-0.040334,0.187420,-0.176862,0.248591,0.038425,0.268446,0.190359,-0.033522,0.078030,-0.046019,0.157644,0.181244,0.110088,0.035933,-0.014851,0.019784,-0.066940,-0.036788,0.077256,0.048873,0.167618,0.012839,0.003644,-0.022316,0.010298,0.078227,0.033889,0.056659,-0.062186,0.027300,0.061975,-0.148747,-0.180922,-0.005468,0.064894,-0.085493,0.073160,-0.053648,-0.058345,-0.175747,0.109790,0.118606,0.043752,-0.109746,0.117716,-0.296929,0.513527
2,1000003,-0.298371,-0.054082,0.493450,-0.144025,0.062780,0.010314,-0.025696,-0.017017,-0.039877,0.117618,0.107614,0.155447,-0.048769,-0.032764,0.129267,-0.046288,0.052606,0.005766,-0.037177,-0.108276,0.278352,-0.192409,0.004610,0.061396,-0.225328,-0.029793,-0.147588,0.207141,-0.040204,0.098991,0.139241,-0.152397,-0.006657,0.042175,-0.242173,0.108371,0.022710,0.113316,-0.179504,0.127407,-0.254658,0.021499,-0.124882,0.078241,-0.010420,0.194289,0.022752,0.036560
3,1000004,-0.402797,-0.296493,0.208601,-0.111620,-0.396681,-0.157061,0.023676,-0.052419,0.083414,0.139461,0.015087,0.090849,-0.110101,0.369088,0.084461,0.092004,0.086161,0.072749,-0.078114,-0.150922,0.118787,-0.224015,-0.075395,0.109571,-0.075717,-0.036530,-0.035385,0.009050,0.086931,-0.096299,-0.077729,0.005752,-0.012282,-0.040884,-0.186858,0.210111,-0.139948,-0.000708,0.156819,-0.026949,-0.027212,0.031611,0.022583,-0.069044,-0.011701,0.055856,0.085636,0.106879
4,1000005,-0.239758,-0.312144,0.465431,-0.004154,0.166084,0.066340,0.180704,-0.138323,-0.237283,-0.320525,0.235794,-0.191335,-0.324694,0.126129,-0.001832,0.048436,0.014434,-0.023839,0.070107,0.004211,0.023559,-0.069327,-0.005246,-0.004620,-0.023468,-0.019958,-0.024958,-0.014543,-0.021688,-0.008737,0.028561,0.099043,-0.065216,-0.099198,0.016055,0.045879,-0.039683,0.005903,-0.057896,-0.047956,-0.013798,0.058844,0.103298,0.019857,-0.028578,0.061230,-0.250734,0.199480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
961853,2237669,-0.360691,-0.290667,0.286432,-0.231596,0.292787,0.048617,0.153766,-0.160987,0.164669,0.208109,-0.180801,0.006893,-0.124384,-0.045740,0.030217,0.087448,-0.083865,-0.050694,-0.012636,-0.151001,0.124751,0.081806,-0.050258,0.049887,-0.018626,-0.237372,0.016120,0.052189,0.007112,0.069830,-0.024279,0.010642,-0.012216,0.032945,-0.034766,0.121792,-0.012832,-0.061032,-0.035562,-0.100957,-0.123301,0.059382,0.011579,-0.031452,-0.036076,-0.026065,-0.089359,0.441836
961854,2237670,-0.293357,-0.171973,0.286546,-0.118879,-0.012549,0.167391,0.141795,-0.061074,0.241667,0.221097,0.159995,0.018302,0.133836,0.083154,0.190089,0.080844,0.158206,-0.028737,-0.065107,-0.155830,0.001873,0.025652,-0.055451,-0.062904,0.022282,-0.046261,-0.041884,0.174258,0.142560,-0.053767,-0.037909,0.269769,0.225457,-0.106987,-0.050639,0.037713,0.066683,0.142394,-0.072244,0.079365,0.093814,0.082489,0.032557,-0.180274,0.076169,0.143289,-0.070229,0.397756
961855,2237671,-0.347759,-0.113205,0.519505,0.246031,0.159094,0.223885,-0.169376,-0.193576,-0.032818,-0.091356,0.174773,-0.037654,0.050514,0.039181,-0.087470,-0.135635,-0.019892,-0.069740,-0.004200,0.018125,0.119081,0.120512,0.114366,-0.198444,0.159414,-0.001681,-0.009741,0.037282,0.068456,-0.072779,0.013928,-0.188399,0.043992,-0.002197,0.075300,0.000277,-0.024043,0.067721,0.114064,-0.004569,-0.043420,0.010244,0.054030,-0.014785,-0.047842,-0.069791,-0.008469,0.353850
961856,2237672,-0.328002,-0.065719,0.513095,-0.098086,-0.031276,0.240971,-0.244969,-0.288519,0.094212,-0.061888,-0.019878,0.216363,-0.011086,0.010759,-0.095290,-0.036844,0.035807,-0.028724,0.090347,0.001076,0.098204,0.096631,-0.040077,-0.047343,-0.014788,0.200298,0.203474,-0.183259,-0.139485,0.068450,-0.029133,-0.217487,-0.055314,0.033208,-0.157315,0.035803,-0.164370,0.094416,0.183035,-0.033465,0.085499,0.013952,-0.005476,0.075964,-0.049219,-0.013327,-0.020690,-0.067269


In [15]:
del all_data['new_uid'], all_data['new_qid']

In [24]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in tqdm(df.columns):
        col_type = df[col].dtypes
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem,
                                                                          100 * (start_mem - end_mem) / start_mem))
    return df

In [17]:
all_data = reduce_mem_usage(all_data)

100%|██████████| 101/101 [00:32<00:00,  3.12it/s]

Mem. usage decreased to 1045.23 Mb (69.3% reduction)





In [18]:
all_data = all_data.merge(emb, on='uid', how='left')

In [21]:
del all_data['cnt']
all_data = reduce_mem_usage(all_data)

In [23]:
all_data.to_pickle("data/train_test_fea.pkl")

# target encoding

In [3]:
train_df = all_data.iloc[:-1000000]
test_df = all_data.iloc[-1000000:].reset_index(drop=True)

In [4]:

##########################target_enc feature#######################
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
enc_list = ['net_type', 'task_id', 'adv_id', 'adv_prim_id', 'age', 'app_first_class',
            'app_second_class', 'career', 'city', 'consume_purchase', 
            'uid', 'dev_id', 'tags', 'slot_id']

for f in tqdm(enc_list):
    train_df[f + '_target_enc'] = 0
    test_df[f + '_target_enc'] = 0
    for i, (trn_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
        trn_x = train_df[[f, 'label']].iloc[trn_idx].reset_index(drop=True)
        val_x = train_df[[f]].iloc[val_idx].reset_index(drop=True)
        enc_df = trn_x.groupby(f, as_index=False)['label'].agg({f + '_target_enc': 'mean'})
        val_x = val_x.merge(enc_df, on=f, how='left')
        test_x = test_df[[f]].merge(enc_df, on=f, how='left')
        val_x[f + '_target_enc'] = val_x[f + '_target_enc'].fillna(train_df['label'].mean())
        test_x[f + '_target_enc'] = test_x[f + '_target_enc'].fillna(train_df['label'].mean())
        train_df.loc[val_idx, f + '_target_enc'] = val_x[f + '_target_enc'].values
        test_df[f + '_target_enc'] += test_x[f + '_target_enc'].values / skf.n_splits

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.

In [5]:
test_df.head()

Unnamed: 0,label,uid,task_id,adv_id,creat_type_cd,adv_prim_id,dev_id,inter_type_cd,slot_id,spread_app_id,tags,app_first_class,app_second_class,age,city,city_rank,device_name,device_size,career,gender,net_type,residence,his_app_size,his_on_shelf_time,app_score,emui_dev,list_time,device_price,up_life_duration,up_membership_grade,membership_life_duration,consume_purchase,communication_onlinerate,communication_avgonline_30d,indu_name,pt_d,net_type_target_enc,task_id_target_enc,adv_id_target_enc,adv_prim_id_target_enc,age_target_enc,app_first_class_target_enc,app_second_class_target_enc,career_target_enc,city_target_enc,consume_purchase_target_enc,uid_target_enc,dev_id_target_enc,tags_target_enc,slot_id_target_enc
0,-1,1804331,3577,5884,7,207,17,5,13,13,37,4,21,6,262,5,29,141,9,4,2,46,4,3,2,20,9,3,-1,-1,-1,2,3^4^5^6^7^8^9^10^11^12^13^14^15^16^17^18^19^20...,12,17,8,0.035893,0.043609,0.043609,0.050807,0.032035,0.038655,0.043264,0.042234,0.031386,0.034956,0.026657,0.048696,0.043978,0.024208
1,-1,1692064,2926,4168,3,142,36,5,17,80,41,4,26,4,275,5,56,141,3,2,3,23,5,3,2,20,8,4,18,-1,-1,2,7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23,11,42,8,0.028816,0.032494,0.032494,0.027398,0.028809,0.038655,0.025651,0.025853,0.033199,0.034956,0.034484,0.028163,0.025666,0.03036
2,-1,2080926,3869,6751,7,130,30,5,19,28,39,4,17,5,398,4,46,141,3,4,2,34,6,2,2,14,10,3,20,-1,-1,2,5^6^7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^...,11,36,8,0.035893,0.056831,0.056831,0.057671,0.026046,0.038655,0.040023,0.025853,0.033213,0.034956,0.034484,0.058795,0.041818,0.016701
3,-1,1273981,2763,3547,7,156,56,5,17,58,37,4,21,6,108,4,56,141,9,2,2,29,14,3,2,20,8,4,-1,-1,-1,2,6^7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23,10,17,8,0.035893,0.017765,0.017765,0.034552,0.032035,0.038655,0.043264,0.042234,0.033243,0.034956,0.0,0.033804,0.043978,0.03036
4,-1,1792927,1193,3734,7,178,17,5,12,70,39,4,17,7,247,3,99,141,4,2,2,40,2,3,2,20,12,2,18,-1,-1,2,7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23,11,36,8,0.035893,0.034026,0.034026,0.045426,0.056194,0.038655,0.040023,0.028527,0.042397,0.034956,0.0,0.048696,0.041818,0.065718


In [9]:
target_enc = ['net_type_target_enc', 'task_id_target_enc',
       'adv_id_target_enc', 'adv_prim_id_target_enc', 'age_target_enc',
       'app_first_class_target_enc', 'app_second_class_target_enc',
       'career_target_enc', 'city_target_enc', 'consume_purchase_target_enc',
       'uid_target_enc', 'dev_id_target_enc', 'tags_target_enc',
       'slot_id_target_enc']

train_test_targetenc = pd.concat([train_df[target_enc], test_df[target_enc]])
train_test_targetenc = train_test_targetenc.reset_index(drop=True)

train_test_targetenc = reduce_mem_usage(train_test_targetenc)

100%|██████████| 14/14 [00:06<00:00,  2.09it/s]

Mem. usage decreased to 213.62 Mb (75.0% reduction)





In [12]:
train_test_targetenc.to_pickle("data/train_test_targetenc.pkl")