In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy import stats
import matplotlib
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import gc
import warnings
from collections import Counter
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
pd.set_option('expand_frame_repr', False) #数据超过总宽度后，是否折叠显示
pd.set_option('display.width', 100) #数据显示总宽度
pd.set_option('max_rows', 200) #显示最多行数，超出该数以省略号表示
pd.set_option('max_columns', 100) #显示最多列数，超出该数以省略号表示
pd.set_option('max_colwidth', 16) #设置单列的宽度，用字符个数表示，单个数据长度超出该数时以省略号表示
pd.set_option('large_repr', 'truncate') #数据超过设置显示最大行列数时，带省略号显示/若是info则是统计信息显示
pd.set_option('show_dimensions', True) #当数据带省略号显示时，是否在最后显示数据的维度

sns.set_style("whitegrid")
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family']='sans-serif' 
matplotlib.rcParams['axes.unicode_minus'] = False
matplotlib.fontsize='20'

In [2]:
dir_path = '/home/zuoyuhui/datasets/天猫/data_format1/'
file_train = 'train_format1.csv'
file_test = 'test_format1.csv'
file_user_info = 'user_info_format1.csv'
file_user_log = 'user_log_format1.csv'

In [3]:
train_data = pd.read_csv(dir_path+file_train)
test_data = pd.read_csv(dir_path+file_test)
user_info = pd.read_csv(dir_path+file_user_info)
user_log = pd.read_csv(dir_path+file_user_log)

In [4]:
#定义内存压缩方法
def reduce_men_usage(df,verbose=True):
    start_mem = df.memory_usage().sum()/ 1024**2
    numerics = ['int16','int32','int64','float16','float32','float64']
    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                #NumPy分别提供numpy.iinfo 并numpy.finfo 验证NumPy整数和浮点值的最小值或最大值：
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float64).min and c_max < np.finfo(np.float64).max:
                    df[col] = df[col].astype(np.float64)
                    
            end_men = df.memory_usage().sum()/1024**2
    print('Memory usage after optimization is :{:.2f} MB'.format(end_men))
    print('Decreased by {:1f}%'.format(100*(start_mem - end_men)/start_mem))
    return df

In [5]:
num_rows=None
train_data = reduce_men_usage(pd.read_csv(dir_path+file_train,num_rows))
test_data = reduce_men_usage(pd.read_csv(dir_path+file_test,num_rows))
user_info = reduce_men_usage(pd.read_csv(dir_path+file_user_info,num_rows))
user_log = reduce_men_usage(pd.read_csv(dir_path+file_user_log,num_rows))

Memory usage after optimization is :1.74 MB
Decreased by 70.831885%
Memory usage after optimization is :3.49 MB
Decreased by 41.665817%
Memory usage after optimization is :3.24 MB
Decreased by 66.665828%
Memory usage after optimization is :890.48 MB
Decreased by 69.642854%


In [6]:
# 合并用户信息
all_data = train_data.append(test_data)
all_data = all_data.merge(user_info,on=['user_id'],how='left')
del train_data,test_data,user_info

In [7]:
def clear_mem():
    %reset -f out
    %reset -f in
    gc.collect()
clear_mem()

Flushing output cache (0 entries)
Flushing input history


In [8]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 522341 entries, 0 to 522340
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      522341 non-null  int32  
 1   merchant_id  522341 non-null  int16  
 2   label        260864 non-null  float64
 3   prob         0 non-null       float64
 4   age_range    519763 non-null  float16
 5   gender       514796 non-null  float16
dtypes: float16(2), float64(2), int16(1), int32(1)
memory usage: 16.9 MB


In [9]:
# 按时间排序
user_log = user_log.sort_values(['user_id','time_stamp'])

In [10]:
#合并数据
# 对每个用户逐个合并所有字段，合并字段为
#item_id /cat_id/brand_id/time_stamp/action_type
list_join_function = lambda x:" ".join([str(i) for i in x])

agg_dict = {
    'item_id':list_join_function,
    'cat_id':list_join_function,
    'seller_id':list_join_function,
    'brand_id':list_join_function,
    'time_stamp':list_join_function,
    'action_type':list_join_function
}

rename_dict = {
    'item_id':'item_path',
    'cat_id':'cat_path',
    'seller_id':'seller_path',
    'brand_id':'brand_path',
    'time_stamp':'time_stamp_path',
    'action_type':'action_type_path'
}

def merge_list(df_ID,join_columns,df_data,agg_dict,rename_dict):
    df_data = df_data.groupby(join_columns).agg(agg_dict).reset_index().rename(columns=rename_dict)
    df_ID = df_ID.merge(df_data,on=join_columns,how='left')
    return df_ID

all_data=merge_list(all_data,'user_id',user_log,agg_dict,rename_dict)

In [11]:
all_data

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path
0,34176,3906,0.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...
1,34176,121,0.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...
2,34176,4356,1.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...
3,34176,2217,0.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...
4,230784,4818,0.0,,0.0,0.0,191923 19192...,1023 1023 10...,3545 3545 35...,5860.0 5860....,601 601 601 ...,0 2 0 0 0 0 ...
...,...,...,...,...,...,...,...,...,...,...,...,...
522336,228479,3111,,,6.0,0.0,802791 97730...,602 602 602 ...,2823 2823 26...,1128.0 1128....,511 511 512 ...,3 3 2 2 2 3 ...
522337,97919,2341,,,8.0,1.0,484765 12876...,737 464 464 ...,4408 235 235...,6968.0 2020....,626 707 707 ...,2 0 0 0 0 0 ...
522338,97919,3971,,,8.0,1.0,484765 12876...,737 464 464 ...,4408 235 235...,6968.0 2020....,626 707 707 ...,2 0 0 0 0 0 ...
522339,32639,3536,,,0.0,0.0,394570 39457...,1413 1413 14...,1065 1065 10...,6376.0 4468....,523 523 523 ...,0 2 0 0 0 0 ...


In [12]:
del user_log
clear_mem()

Flushing output cache (1 entries)
Flushing input history


In [13]:
# 定义特征统计函数

# 定义统计数据总数的函数
def cnt_(x):
    try:
        return len(x.split(' '))
    except:
        return -1
    
# 定义统计数据唯一值总数的函数
def nunique_(x):
    try:
        return len(set(x.split(' ')))
    except:
        return -1
    
# 定义统计数据最大值的函数
def max_(x):
    try:
        return np.max([float(i) for i in x.split(' ')])
    except:
        return -1
    
# 定义统计数据最小值的函数
def min_(x):
    try:
        return np.min([float(i) for i in x.split(' ')])
    except:
        return -1
    
# 定义统计数据标准差的函数
def std_(x):
    try:
        return np.std([float(i) for i in x.split(' ')])
    except:
        return -1

# 定义统计数据中topN数据的函数
def most_n(x,n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][0]
    except:
        return -1
    
# 定义统计数据中topN数据总数的函数
def most_n_cnt(x,n):
    try:
        return Counter(x.split(' ').most_common(n))[n-1][1]
    except:
        return -1

In [14]:
Counter(all_data.brand_path[1].split(' ')).most_common(2)[1][0]

'6268.0'

In [15]:
# 调用数据集的特征统计函数
def user_cnt(df_data,single_col,name):
    df_data[name] = df_data[single_col].apply(cnt_)
    return df_data

def user_nunique(df_data,single_col,name):
    df_data[name] = df_data[single_col].apply(nunique_)
    return df_data

def user_min(df_data,single_col,name):
    df_data[name] = df_data[single_col].apply(min_)
    return df_data

def user_max(df_data,single_col,name):
    df_data[name] = df_data[single_col].apply(max_)
    return df_data

def user_std(df_data,single_col,name):
    df_data[name] = df_data[single_col].apply(std_)
    return df_data

def user_most_n(df_data,single_col,name,n=1):
    func = lambda x:most_n(x,n)
    df_data[name] = df_data[single_col].apply(func)
    return df_data

def user_most_n_cnt(df_data,single_col,name,n=1):
    func = lambda x:most_n_cnt(x,n)
    df_data[name] = df_data[single_col].apply(func)
    return df_data

# 店铺统计特征统计： 统计与店铺特点有关的特征，如店铺、商品、品牌等。

In [16]:

"""
    提取基本统计特征
"""
all_data_test = all_data.head(2000)

# 统计用户点击、浏览、加购、购买行为

# 总次数
all_data_test = user_cnt(all_data_test,'seller_path','user_cnt')
# 不同店铺个数
all_data_test = user_nunique(all_data_test,'seller_path','seller_nunique')
# 不同品类个数
all_data_test = user_nunique(all_data_test,'brand_path','brand_nunique')
# 不同品牌个数
all_data_test = user_nunique(all_data_test,'item_path','item_nunique')
# 不同商品个数
all_data_test = user_nunique(all_data_test,'item_path','item_nunique')
# 活跃天数
all_data_test = user_nunique(all_data_test,'time_stamp_path','time_stamp_nunique')
# 不同用户行为种数
all_data_test = user_nunique(all_data_test,'action_type_path','action_type_nunique')
# 最晚时间
all_data_test = user_max(all_data_test,'action_type_path','time_stamp_max')
# 最早时间
all_data_test = user_min(all_data_test,'action_type_path','time_stamp_min')
# 活跃天数方差
all_data_test = user_std(all_data_test,'action_type_path','time_stamp_std')

In [17]:
all_data_test.head()

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique,time_stamp_max,time_stamp_min,time_stamp_std
0,34176,3906,0.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...,451,109,106,256,47,3,3.0,0.0,0.63427
1,34176,121,0.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...,451,109,106,256,47,3,3.0,0.0,0.63427
2,34176,4356,1.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...,451,109,106,256,47,3,3.0,0.0,0.63427
3,34176,2217,0.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...,451,109,106,256,47,3,3.0,0.0,0.63427
4,230784,4818,0.0,,0.0,0.0,191923 19192...,1023 1023 10...,3545 3545 35...,5860.0 5860....,601 601 601 ...,0 2 0 0 0 0 ...,54,20,19,31,16,2,2.0,0.0,0.671791


In [18]:
# 最早和最晚相差天数
all_data_test['time_stamp_range'] = all_data_test['time_stamp_max'] - all_data_test['time_stamp_min']
# 用户最喜欢的店铺
all_data_test = user_most_n(all_data_test,'seller_path','seller_most_1',n=1)
# 最喜欢的类目
all_data_test = user_most_n(all_data_test,'cat_path','cat_most_1',n=1)
# 最喜欢的品牌
all_data_test = user_most_n(all_data_test,'brand_path','brand_most_1',n=1)
# 最常见的行为动作
all_data_test = user_most_n(all_data_test,'action_type_path','action_type_1',n=1)

# ....
# 最喜欢的店铺 行为次数
all_data_test = user_most_n_cnt(all_data_test,'seller_path','seller_most_1_cnt',n=1)
# 最喜欢的类目 行为次数
all_data_test = user_most_n_cnt(all_data_test,'cat_path','cat_most_1_cnt',n=1)
# 最喜欢的品牌 行为次数
all_data_test = user_most_n_cnt(all_data_test,'brand_path','brand_most_1_cnt',n=1)
# 最常见的行为动作 行为次数
all_data_test = user_most_n_cnt(all_data_test,'action_type_path','action_path_1_cnt',n=1)

In [19]:
all_data_test

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique,time_stamp_max,time_stamp_min,time_stamp_std,time_stamp_range,seller_most_1,cat_most_1,brand_most_1,action_type_1,seller_most_1_cnt,cat_most_1_cnt,brand_most_1_cnt,action_path_1_cnt
0,34176,3906,0.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...,451,109,106,256,47,3,3.0,0.0,0.634270,3.0,331,662,4094.0,0,-1,-1,-1,-1
1,34176,121,0.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...,451,109,106,256,47,3,3.0,0.0,0.634270,3.0,331,662,4094.0,0,-1,-1,-1,-1
2,34176,4356,1.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...,451,109,106,256,47,3,3.0,0.0,0.634270,3.0,331,662,4094.0,0,-1,-1,-1,-1
3,34176,2217,0.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...,451,109,106,256,47,3,3.0,0.0,0.634270,3.0,331,662,4094.0,0,-1,-1,-1,-1
4,230784,4818,0.0,,0.0,0.0,191923 19192...,1023 1023 10...,3545 3545 35...,5860.0 5860....,601 601 601 ...,0 2 0 0 0 0 ...,54,20,19,31,16,2,2.0,0.0,0.671791,2.0,3556,407,1236.0,0,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,220293,2883,0.0,,4.0,1.0,551995 49773...,50 50 683 68...,638 2782 288...,1444.0 84.0 ...,608 608 614 ...,0 0 0 0 0 0 ...,45,9,9,20,7,3,3.0,0.0,0.726908,3.0,2677,656,1579.0,0,-1,-1,-1,-1
1996,155013,3727,0.0,,2.0,0.0,881928 83327...,737 737 737 ...,4390 3718 49...,5224.0 3168....,804 804 804 ...,0 0 0 0 0 0 ...,106,29,29,49,10,3,3.0,0.0,0.347011,3.0,3727,737,6816.0,0,-1,-1,-1,-1
1997,24453,1487,0.0,,5.0,1.0,1049364 1049...,1075 1075 12...,3365 3365 12...,7524.0 7524....,524 524 623 ...,2 0 0 0 0 0 ...,81,13,13,52,7,2,2.0,0.0,0.710494,2.0,4346,407,2920.0,0,-1,-1,-1,-1
1998,155781,1861,0.0,,0.0,2.0,437216 10406...,407 407 1438...,3163 118 421...,3880.0 6020....,514 528 729 ...,2 0 0 0 0 0 ...,59,43,42,47,12,2,2.0,0.0,0.502793,2.0,1199,1389,5408.0,0,-1,-1,-1,-1


# 用户特征统计：对用户的点击、加购、购买、收藏等特征进行统计

In [20]:
# 对点击、加购、购买、收藏分开统计
import copy
def col_cnt_(df_data,columns_list,action_type):
    try:
        data_dict={}
        col_list = copy.deepcopy(columns_list)
        if action_type != None:
            col_list += ['action_type_path']
        
        for col in col_list:
            data_dict[col] = df_data[col].split(' ')
            
        path_len = len(data_dict[col])
        
        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_'+data_dict[col_][i_]
            data_out.append(data_txt)
            
        return len(data_out)
    except:
        return -1

    
def col_nuique_(df_data,columns_list,action_type):
    try:
        data_dict = {}
        col_list = copy.deepcopy(columns_list)
        if action_type!=None:
            col_list += ['action_type_path']
        
        for col in col_list:
            data_dict[col] = df_data[col].split(' ')
        
        path_len = len(data_dict[col].split(' '))
        
        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_]== action_type:
                    data_txt += '_'+data_dict[col_][i_]
            data_out.append(data_txt)
        return len(set(data_out))
    except:
        return -1
    
def user_col_cnt(df_data,columns_list,action_type,name):
    df_data[name] = df_data.apply(lambda x:col_cnt_(x,columns_list,action_type),axis=1)
    return df_data

def user_col_nunique(df_data,columns_list,action_type,name):
    df_data[name] = df_data.apply(lambda x:col_nuique_(x,columns_list,action_type),axis=1)
    return df_data

In [21]:
# 点击次数
all_data_test = user_col_cnt(all_data_test,['seller_path'],'0','user_cnt_0')
# 加购次数
all_data_test = user_col_cnt(all_data_test,['seller_path'],'1','user_cnt_1')
# ....

In [22]:
# 点击次数
all_data_test = user_col_cnt(all_data_test,['seller_path','item_path'],'0','seller_nunique_0')
# 不同店铺数
all_data_test = user_col_nunique(all_data_test,['seller_path','item_path'],'0','seller_nunique_0')
# ...

In [23]:
all_data_test.columns

Index(['user_id', 'merchant_id', 'label', 'prob', 'age_range', 'gender', 'item_path', 'cat_path',
       'seller_path', 'brand_path', 'time_stamp_path', 'action_type_path', 'user_cnt',
       'seller_nunique', 'brand_nunique', 'item_nunique', 'time_stamp_nunique',
       'action_type_nunique', 'time_stamp_max', 'time_stamp_min', 'time_stamp_std',
       'time_stamp_range', 'seller_most_1', 'cat_most_1', 'brand_most_1', 'action_type_1',
       'seller_most_1_cnt', 'cat_most_1_cnt', 'brand_most_1_cnt', 'action_path_1_cnt',
       'user_cnt_0', 'user_cnt_1', 'seller_nunique_0'],
      dtype='object')

In [24]:
"""
利用Countvector 和TF-IDF提取特征
"""
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from scipy import sparse

tfidfVec = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,ngram_range=(1,1),max_features=100)

columns_list =['seller_path']
for i,col in enumerate(columns_list):
    tfidfVec.fit(all_data_test[col])
    data_ = tfidfVec.transform(all_data_test[col])
    if i==0:
        data_cat = data_
    else:
        data_cat=sparse.hstack((data_cat,data_))

In [25]:
df_tiidf = pd.DataFrame(data_cat.toarray())
df_tiidf.columns = ['tfidf_'+str(i) for i in df_tiidf.columns]
all_data_test = pd.concat([all_data_test,df_tiidf],axis=1)

In [26]:
all_data_test.head()

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique,time_stamp_max,time_stamp_min,time_stamp_std,time_stamp_range,seller_most_1,cat_most_1,brand_most_1,action_type_1,seller_most_1_cnt,cat_most_1_cnt,brand_most_1_cnt,action_path_1_cnt,user_cnt_0,user_cnt_1,seller_nunique_0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,tfidf_10,tfidf_11,tfidf_12,tfidf_13,tfidf_14,tfidf_15,tfidf_16,...,tfidf_50,tfidf_51,tfidf_52,tfidf_53,tfidf_54,tfidf_55,tfidf_56,tfidf_57,tfidf_58,tfidf_59,tfidf_60,tfidf_61,tfidf_62,tfidf_63,tfidf_64,tfidf_65,tfidf_66,tfidf_67,tfidf_68,tfidf_69,tfidf_70,tfidf_71,tfidf_72,tfidf_73,tfidf_74,tfidf_75,tfidf_76,tfidf_77,tfidf_78,tfidf_79,tfidf_80,tfidf_81,tfidf_82,tfidf_83,tfidf_84,tfidf_85,tfidf_86,tfidf_87,tfidf_88,tfidf_89,tfidf_90,tfidf_91,tfidf_92,tfidf_93,tfidf_94,tfidf_95,tfidf_96,tfidf_97,tfidf_98,tfidf_99
0,34176,3906,0.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...,451,109,106,256,47,3,3.0,0.0,0.63427,3.0,331,662,4094.0,0,-1,-1,-1,-1,451,451,-1,0.0,0.0,0.009531,0.186124,0.0,0.173224,0.036434,0.0,0.0,0.012205,0.0,0.0,0.0,0.0,0.0,0.0,0.012738,...,0.865332,0.0,0.0,0.0,0.010166,0.0,0.0,0.0,0.011752,0.0,0.0,0.0,0.199454,0.057178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.234063,0.026766,0.011404,0.0,0.031593,0.0,0.0,0.059432,0.0,0.0,0.010496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012666,0.011178,0.0,0.0,0.0,0.0,0.012925,0.0
1,34176,121,0.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...,451,109,106,256,47,3,3.0,0.0,0.63427,3.0,331,662,4094.0,0,-1,-1,-1,-1,451,451,-1,0.0,0.0,0.009531,0.186124,0.0,0.173224,0.036434,0.0,0.0,0.012205,0.0,0.0,0.0,0.0,0.0,0.0,0.012738,...,0.865332,0.0,0.0,0.0,0.010166,0.0,0.0,0.0,0.011752,0.0,0.0,0.0,0.199454,0.057178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.234063,0.026766,0.011404,0.0,0.031593,0.0,0.0,0.059432,0.0,0.0,0.010496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012666,0.011178,0.0,0.0,0.0,0.0,0.012925,0.0
2,34176,4356,1.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...,451,109,106,256,47,3,3.0,0.0,0.63427,3.0,331,662,4094.0,0,-1,-1,-1,-1,451,451,-1,0.0,0.0,0.009531,0.186124,0.0,0.173224,0.036434,0.0,0.0,0.012205,0.0,0.0,0.0,0.0,0.0,0.0,0.012738,...,0.865332,0.0,0.0,0.0,0.010166,0.0,0.0,0.0,0.011752,0.0,0.0,0.0,0.199454,0.057178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.234063,0.026766,0.011404,0.0,0.031593,0.0,0.0,0.059432,0.0,0.0,0.010496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012666,0.011178,0.0,0.0,0.0,0.0,0.012925,0.0
3,34176,2217,0.0,,6.0,0.0,581818 87900...,1505 662 150...,416 3606 416...,4014.0 33.0 ...,521 521 521 ...,0 0 0 0 0 0 ...,451,109,106,256,47,3,3.0,0.0,0.63427,3.0,331,662,4094.0,0,-1,-1,-1,-1,451,451,-1,0.0,0.0,0.009531,0.186124,0.0,0.173224,0.036434,0.0,0.0,0.012205,0.0,0.0,0.0,0.0,0.0,0.0,0.012738,...,0.865332,0.0,0.0,0.0,0.010166,0.0,0.0,0.0,0.011752,0.0,0.0,0.0,0.199454,0.057178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.234063,0.026766,0.011404,0.0,0.031593,0.0,0.0,0.059432,0.0,0.0,0.010496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012666,0.011178,0.0,0.0,0.0,0.0,0.012925,0.0
4,230784,4818,0.0,,0.0,0.0,191923 19192...,1023 1023 10...,3545 3545 35...,5860.0 5860....,601 601 601 ...,0 2 0 0 0 0 ...,54,20,19,31,16,2,2.0,0.0,0.671791,2.0,3556,407,1236.0,0,-1,-1,-1,-1,54,54,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.860915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# 嵌入特征
import gensim

# Train word2Vec model
model = gensim.models.Word2Vec(
    all_data_test['seller_path'].apply(lambda x:x.split(' ')),
    size=100,
    window=5,
    min_count=5,
    workers=4
)
model.save("product2Vec.model")
model = gensim.models.Word2Vec.load("product2Vec.model")

In [28]:
def mean_w2v_(x,model,size=100):
    try:
        i=0
        for word in x.split(' '):
            if word in model.wv.vocab:
                i+=1
                if i == 1:
                    vec = np.zeros(size)
                vec += model.wv(word)
        return vec/i
    except:
        return np.zeros(size)
    
def get_mean_w2v(df_data,columns,model,size):
    data_array = []
    for index,row in df_data.iterrows():
        w2v = mean_w2v_(row[columns],model,size)
        data_array.append(w2v)
    return pd.DataFrame(data_array)

df_embedding = get_mean_w2v(all_data_test,'seller_path',model,100)
df_embedding.columns = ['embedding_'+str(i) for i in df_embedding.columns]

In [29]:
df_embedding

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,embedding_10,embedding_11,embedding_12,embedding_13,embedding_14,embedding_15,embedding_16,embedding_17,embedding_18,embedding_19,embedding_20,embedding_21,embedding_22,embedding_23,embedding_24,embedding_25,embedding_26,embedding_27,embedding_28,embedding_29,embedding_30,embedding_31,embedding_32,embedding_33,embedding_34,embedding_35,embedding_36,embedding_37,embedding_38,embedding_39,embedding_40,embedding_41,embedding_42,embedding_43,embedding_44,embedding_45,embedding_46,embedding_47,embedding_48,embedding_49,embedding_50,embedding_51,embedding_52,embedding_53,embedding_54,embedding_55,embedding_56,embedding_57,embedding_58,embedding_59,embedding_60,embedding_61,embedding_62,embedding_63,embedding_64,embedding_65,embedding_66,embedding_67,embedding_68,embedding_69,embedding_70,embedding_71,embedding_72,embedding_73,embedding_74,embedding_75,embedding_76,embedding_77,embedding_78,embedding_79,embedding_80,embedding_81,embedding_82,embedding_83,embedding_84,embedding_85,embedding_86,embedding_87,embedding_88,embedding_89,embedding_90,embedding_91,embedding_92,embedding_93,embedding_94,embedding_95,embedding_96,embedding_97,embedding_98,embedding_99
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
df_embedding_value = [col for col in df_embedding.columns if df_embedding[col].nunique() <= 1]  #全是0？？？

In [31]:
# all_data_test = pd.concat([all_data_test,df_embedding],axis=1)

In [32]:
import lightgbm
import xgboost
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import MultinomialNB,GaussianNB

In [33]:
# 读取训练数据和验证数据
features_columns = [
    c for c in all_data_test.columns if c not in[
        'label','prob','seller_path','cat_path','brand_path','action_type_path',
        'item_path','time_stamp_path','seller_most_1_cnt','cat_most_1_cnt','brand_most_1_cnt','action_path_1_cnt',
         'seller_nunique_0'
    ]
]

In [34]:
data_value = [col for col in features_columns if all_data_test[col].nunique() <= 1] 

In [35]:
data_value

[]

In [36]:
X_train = all_data_test[~all_data_test['label'].isna()][features_columns]
y_train = all_data_test[~all_data_test['label'].isna()]['label']
x_valid = all_data_test[~all_data_test['label'].isna()][features_columns]

In [38]:
from sklearn.metrics import log_loss

In [39]:
# 处理函数值inf及nan 为特征工程准备
def get_matrix(data):
    where_are_nan = np.isnan(data)
    where_are_inf = np.isnan(data)
    data[where_are_inf]=0
    data[where_are_nan]=0
    return data

X_train = np.float_(get_matrix(np.float_(X_train)))
y_train = np.int_(y_train)
x_valid = X_train

In [40]:
from sklearn.model_selection import StratifiedKFold,KFold
folds =5
seed = 1
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [52]:
def lgb_model_zuo(train_x,train_y,test_x,kf,label_split=None):
    train = np.zeros((train_x.shape[0],1))
    test = np.zeros((test_x.shape[0],1))
    test_pre = np.empty((folds,test_x.shape[0],1))
    cv_scores = []
    feature_importance_values = []
    best_iterations = []
    
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metrics': 'auc',
        'scale_pos_weight': 13.5,
        'learning_rate': 0.05,
        'random_state': 927
        }
    
    for i,(train_index,test_index) in enumerate(kf.split(train_x,label_split)):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
        
        train_matrix = lgb.Dataset(tr_x,label=tr_y)
        test_matrix = lgb.Dataset(te_x,label=te_y)
        
        model = lgb.train(params=params,
                        train_set=train_matrix,
                        valid_sets=test_matrix,
                        valid_names=['valid'],
                        num_boost_round=10000,
                        early_stopping_rounds=100,
                        verbose_eval=50)
        
        pre = model.predict(te_x, num_iteration=model.best_iteration)
        print(pre)
        train[test_index] = pre.reshape(-1,1)
        test_pre[i,:] = model.predict(te_x,num_iteration=model.best_iteration)
        cv_scores.append(log_loss(te_y, pre[:,0].reshape(-1,1)))
        
        feature_importance_values.append(model.feature_importance())
        best_iteration = model.best_iteration
            
        gc.enable()
        del model, tr_x, va_x
        gc.collect()
        print('now score is:',cv_scores)
        print('best_score is:',model.best_score)
        
    test[:] = test_pre.mean(axis=0)
    print('score_list:',cv_scores)
    print('score_mean:',np.mean(cv_scores))
    
    valid_scores_softmax = np.exp(np.array(cv_scores)) / np.sum(np.exp(np.array(cv_scores)))
    feature_importance_values = np.array(feature_importance_values).T
    
    submission = pd.DataFrame({'id': test_id, 'label': test_predictions})
    feature_importances = pd.DataFrame({'feature': feature_names, 
                                        'importance': feature_importance_values.dot(valid_scores_softmax)})


    fold_names = list(range(kf.get_n_splits()))
    fold_names.append('overall')
    

    metrics = pd.DataFrame({'fold': fold_names,
                            'valid': valid_scores})
    
    return train.reshape(-1,1),test.reshape(-1,1), feature_importances, metrics, best_iterations
        
        
        
        

In [53]:
train_data,test_data, feature_importances, metrics, best_iterations = lgb_model_zuo(X_train, y_train, x_valid, kf)

[LightGBM] [Info] Number of positive: 103, number of negative: 1497
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5251
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 117
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064375 -> initscore=-2.676489
[LightGBM] [Info] Start training from score -2.676489
Training until validation scores don't improve for 100 rounds
[50]	valid's auc: 0.544529
[100]	valid's auc: 0.541135
[150]	valid's auc: 0.562217
[200]	valid's auc: 0.561394
[250]	valid's auc: 0.558309
Early stopping, best iteration is:
[173]	valid's auc: 0.567873
[1.26759741e-01 1.38644452e-02 1.83836980e-02 5.11988788e-03
 1.63928685e-03 8.99589795e-03 2.49862512e-02 1.60273479e-02
 3.24128178e-02 6.72567959e-03 2.57697314e-03 5.47727424e-03
 6.83306967e-03 3.31808374e-03 6.70525263e-04 1.08574886e-02
 2.16885256e-03 3.71269560e-04 8.32808454

ValueError: could not broadcast input array from shape (400) into shape (2000,1)

In [47]:
from sklearn.metrics import log_loss

In [56]:
def lgb_model(X, y, test, cat_cols):
    print(f'features count:',{len(test.columns)})
#     print(f'features:',{list(test.columns)})
    
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metrics': 'auc',
        'scale_pos_weight': 13.5,
        'learning_rate': 0.05,
        'random_state': 927
    }
    
    feature_names = list(X.columns)
    out_of_fold = np.zeros(X.shape[0])
    test_predictions = np.zeros(test.shape[0])
    feature_importance_values = []
    valid_scores = []
    best_iterations = []
    
    kf = StratifiedKFold(n_split=5,random_state=927)
    for i,(tr,va) in enumerate(kf.split(X,y)):
        print(f'cv1')
        tr_x, tr_y = X.iloc[tr],y.iloc[tr]
        va_x, va_y = X.iloc[va],y.iloc[va]
        lgb_tr = lgb.Dataset(tr_x,tr_y)
        lgb_va = lgb.Dataset(va_x,va_y,reference=lgb_tr)
        
        model = lgb.train(params=params,
                         train_set=lgb_tr,
                         valid_sets=[lgb_va],
                         valid_names=['valid'],
                         num_boost_round=10000,
#                          categorical_feature=cat_cols,
                         early_stopping_rounds=100,
                         verbose_eval=50)
        
        feature_importance_values.append(model.feature_importance())
        best_iteration = model.best_iteration
        test_predictions += model.predict(test,num_iteration=best_iteration)/kf.get_n_splits()
        out_of_fold[va] = model.predict(X.iloc[va],num_iteration=best_iteration)
        print(model.best_score)
        valid_scores = log_loss(va_y,out_of_fold[va])
        best_iterations.append(best_iteration)
        
        gc.enable()
        del model,tr_x,va_x
        gc.collect()
        
    valid_scores_softmax = np.exp(np.array(valid_scores)/np.sum(np.exp(np.array(valid_scores))))
    feature_importance_values = np.array(feature_importance_values).T
    
    valid_scores_softmax = np.exp(np.array(valid_scores)) / np.sum(np.exp(np.array(valid_scores)))
    feature_importance_values = np.array(feature_importance_values).T
    
    submission = pd.DataFrame({'id': test_id,'label': test_predictions})
    feature_importances = pd.DataFrame({'feature': feature_names, 
                                        'importance': feature_importance_values.dot(valid_scores_softmax)})


    fold_names = list(range(kf.get_n_splits()))
    fold_names.append('overall')
    
    valid_loss = tpr_weight_function(y, out_of_fold)
    valid_scores.append(valid_loss)

    metrics = pd.DataFrame({'fold': fold_names,
                            'valid': valid_scores})
    
    return submission, feature_importances, metrics, best_iterations

In [57]:
lgb_model(X_train, y_train, x_valid, None)

features count: {120}


TypeError: __init__() got an unexpected keyword argument 'n_split'