In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

pd.set_option("display.max_column",100)
pd.set_option("expand_frame_repr",False)
%matplotlib inline

In [2]:
#1--读取数据
test_data=pd.read_csv(r'D:\文件\学习\数据集\天猫用户复购\data_format1\test_format1.csv')
train_data=pd.read_csv(r'D:\文件\学习\数据集\天猫用户复购\data_format1\train_format1.csv')
user_info=pd.read_csv(r'D:\文件\学习\数据集\天猫用户复购\data_format1\user_info_format1.csv')
user_log=pd.read_csv(r'D:\文件\学习\数据集\天猫用户复购\data_format1\user_log_format1.csv')

In [3]:
# reduce memory
def reduce_mem_usage(df, verbose=True):
    #定义起始内存显示
    start_mem = df.memory_usage().sum() / 1024**2
    #定义一个数据类型的列表
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    for col in df.columns:
        #定义col_type接受数据集各个字段的数据类型
        col_type = df[col].dtypes
        if col_type in numerics:
            #显示字段中单个数据占用的最小字节
            c_min = df[col].min()
            #显示字段中单个数据占用的最大字节
            c_max = df[col].max()
            #截取数据类型的前三位数
            if str(col_type)[:3] == 'int':
                #当前字段占用的最小字节数量和最大字节数量在int8类型的范围内，则将当前字段的数据类型转换成int8
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                #当前字段占用的最小字节数量和最大字节数量在float16类型的范围内，则将当前字段的数据类型转换成float16
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    #定义结束时的内存显示               
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
#合并用户数据
all_data=train_data.append(test_data)
all_data_path=all_data.merge(user_info,on=["user_id"],how="left")

In [5]:
user_log=user_log.sort_values(["user_id","time_stamp"])
user_log.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
23288890,1,181459,276,2245,4750.0,1009,0
23288891,1,779078,276,2245,4750.0,1009,0
23288892,1,779078,276,2245,4750.0,1009,0
23288893,1,452837,276,2245,4750.0,1009,0
23288894,1,543397,276,2245,4750.0,1009,0


In [6]:
#合并user_id各个字段
def join(x):
    data_list=[]
    for i in x:
        data_list.append(str(i))
    data_join_list=",".join(data_list)
    return data_join_list

In [7]:
agg_dict={
    "item_id":join,
    "cat_id":join,
    "seller_id":join,
    "brand_id":join,
    "time_stamp":join,
    "action_type":join
}

In [8]:
rename_dict={
    "item_id":"item_path",
    "cat_id":"cat_path",
    "seller_id":"seller_path",
    "brand_id":"brand_path",
    "time_stamp":"time_stamp_path",
    "action_type":"action_type_path"
}

In [9]:
user_log_path=user_log.groupby("user_id").agg(agg_dict).reset_index().rename(columns=rename_dict)

In [10]:
user_log_path.head()

Unnamed: 0,user_id,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path
0,1,"181459,779078,779078,452837,543397,504149,5041...","276,276,276,276,276,1023,1023,1023,1023,1252,1...","2245,2245,2245,2245,2245,925,925,925,925,4026,...","4750.0,4750.0,4750.0,4750.0,4750.0,7402.0,7402...","1009,1009,1009,1009,1009,1011,1011,1011,1011,1...","0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,2,2,..."
1,2,"348983,749563,239288,751744,239288,714176,1972...","177,177,602,602,602,1213,602,602,602,1213,1213...","2223,2223,420,420,420,420,420,420,420,420,420,...","3273.0,3273.0,4953.0,4953.0,4953.0,4058.0,4953...","527,527,626,626,626,626,626,626,626,626,626,62...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,..."
2,3,"895754,895754,182882,182882,985337,175397,9463...","1505,1505,1271,1271,1271,662,1134,1134,1134,45...","795,795,2123,2123,4925,1102,4461,4461,4461,474...","3608.0,3608.0,4796.0,4796.0,8005.0,1214.0,905....","516,516,627,627,627,727,819,819,820,906,906,90...","2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
3,4,"836727,243874,185489,95474,30073,790055,790055...","612,1577,1505,1505,1505,1505,1505,1505,1505,15...","1221,1221,1221,1221,1221,1221,1221,1221,1221,1...","7734.0,7734.0,7734.0,7734.0,7734.0,7734.0,7734...","527,527,527,527,527,527,527,527,527,527,527,52...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
4,5,"511956,728354,568450,754044,1011255,741215,362...","302,351,812,1213,142,464,1028,35,1213,351,142,...","3098,3215,641,3736,1483,176,4848,641,3736,4547...","5545.0,5482.0,4265.0,3125.0,4640.0,6662.0,2803...","519,520,520,520,520,520,520,520,520,520,520,52...","3,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,..."


In [11]:
#前2000行作为测试样本(学习时使用，避免全量数据耗时过多)
all_data_test=user_log_path.head(2000)

In [12]:
all_data_test.to_csv(r'D:\文件\学习\数据集\天猫用户复购\data_format1\random_2000.csv')

In [13]:
#定义工具类函数
#统计数据的总数
def cnt_(x):
    try:
        return len(x.split(","))
    except:
        return -1
#统计去重后数据总数
def nunique_(x):
    try:
        return len(set(x.split(",")))
    except:
        return -1
#统计最大值
def max_(x):
    try:
        col_list=[]
        for i in x.split(","):
            col_list.append(int(i))
            data_max=np.max(col_list)
        return data_max
    except:
        return -1
#统计最小值
def min_(x):
    try:
        col_list=[]
        for i in x.split(","):
            col_list.append(int(i))
        data_min=np.min(col_list)
        return data_min
    except:
        return -1
#统计标注差
def std_(x):
    try:
        col_list=[]
        for i in x.split(","):
            col_list.append(int(i))
        data_std=np.std(col_list)
        return data_std
    except:
        return -1
#统计数据中top_n的值
def most_n(x,n):
    try:
        return Counter(x.split(" ")).most_common(n)[n-1][0]
    except:
        return -1
#统计数据中top_n的数据具体有多少
def most_n_cnt(x,n):
    try:
        return Counter(x.split(" ")).most_common(n)[n-1][1]
    except:
        return -1

In [14]:
#在工具类的基础上构造业务使用方向的函数
#用户操作总次数
def user_cnt(df_data,single_col,name):
    df_data[name]=df_data[single_col].apply(cnt_)
    return df_data
#用户下单数量
def user_nunique(df_data,single_col,name):
    df_data[name]=df_data[single_col].apply(nunique_)
    return df_data
#用户下单最多的商品
def user_max(df_data,single_col,name):
    df_data[name]=df_data[single_col].apply(max_)
    return df_data
#用户下单最少的商品
def user_min(df_data,single_col,name):
    df_data[name]=df_data[single_col].apply(min_)
    return df_data
#用户下单的方差
def user_std(df_data,single_col,name):
    df_data[name]=df_data[single_col].apply(std_)
    return df_data
#用户下单的前n个商品
def user_most_n(df_data,single_col,name,n=1):
    func=lambda x:most_n(x,n)
    df_data[name]=df_data[single_col].apply(func)
    return df_data
#用户下单的前n个商品的数量
def user_most_n_cnt(df_data,single_col,name,n=1):
    func=lambda x:most_n_cnt(x,n)
    df_data[name]=df_data[single_col].apply(func)
    return df_data

In [15]:
#调用业务函数，创造特征
#用户操作总次数(字段选择seller_path，因为每操作一次就有一个商户信息)
all_data_test=user_cnt(all_data_test,"seller_path","user_cnt")
all_data_test.head()

Unnamed: 0,user_id,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt
0,1,"181459,779078,779078,452837,543397,504149,5041...","276,276,276,276,276,1023,1023,1023,1023,1252,1...","2245,2245,2245,2245,2245,925,925,925,925,4026,...","4750.0,4750.0,4750.0,4750.0,4750.0,7402.0,7402...","1009,1009,1009,1009,1009,1011,1011,1011,1011,1...","0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,2,2,...",33
1,2,"348983,749563,239288,751744,239288,714176,1972...","177,177,602,602,602,1213,602,602,602,1213,1213...","2223,2223,420,420,420,420,420,420,420,420,420,...","3273.0,3273.0,4953.0,4953.0,4953.0,4058.0,4953...","527,527,626,626,626,626,626,626,626,626,626,62...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,...",63
2,3,"895754,895754,182882,182882,985337,175397,9463...","1505,1505,1271,1271,1271,662,1134,1134,1134,45...","795,795,2123,2123,4925,1102,4461,4461,4461,474...","3608.0,3608.0,4796.0,4796.0,8005.0,1214.0,905....","516,516,627,627,627,727,819,819,820,906,906,90...","2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",68
3,4,"836727,243874,185489,95474,30073,790055,790055...","612,1577,1505,1505,1505,1505,1505,1505,1505,15...","1221,1221,1221,1221,1221,1221,1221,1221,1221,1...","7734.0,7734.0,7734.0,7734.0,7734.0,7734.0,7734...","527,527,527,527,527,527,527,527,527,527,527,52...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",50
4,5,"511956,728354,568450,754044,1011255,741215,362...","302,351,812,1213,142,464,1028,35,1213,351,142,...","3098,3215,641,3736,1483,176,4848,641,3736,4547...","5545.0,5482.0,4265.0,3125.0,4640.0,6662.0,2803...","519,520,520,520,520,520,520,520,520,520,520,52...","3,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,...",173


In [16]:
#不同店铺的个数
all_data_test=user_nunique(all_data_test,"seller_path","seller_nunique")
all_data_test.head()

Unnamed: 0,user_id,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique
0,1,"181459,779078,779078,452837,543397,504149,5041...","276,276,276,276,276,1023,1023,1023,1023,1252,1...","2245,2245,2245,2245,2245,925,925,925,925,4026,...","4750.0,4750.0,4750.0,4750.0,4750.0,7402.0,7402...","1009,1009,1009,1009,1009,1011,1011,1011,1011,1...","0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,2,2,...",33,9
1,2,"348983,749563,239288,751744,239288,714176,1972...","177,177,602,602,602,1213,602,602,602,1213,1213...","2223,2223,420,420,420,420,420,420,420,420,420,...","3273.0,3273.0,4953.0,4953.0,4953.0,4058.0,4953...","527,527,626,626,626,626,626,626,626,626,626,62...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,...",63,14
2,3,"895754,895754,182882,182882,985337,175397,9463...","1505,1505,1271,1271,1271,662,1134,1134,1134,45...","795,795,2123,2123,4925,1102,4461,4461,4461,474...","3608.0,3608.0,4796.0,4796.0,8005.0,1214.0,905....","516,516,627,627,627,727,819,819,820,906,906,90...","2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",68,23
3,4,"836727,243874,185489,95474,30073,790055,790055...","612,1577,1505,1505,1505,1505,1505,1505,1505,15...","1221,1221,1221,1221,1221,1221,1221,1221,1221,1...","7734.0,7734.0,7734.0,7734.0,7734.0,7734.0,7734...","527,527,527,527,527,527,527,527,527,527,527,52...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",50,12
4,5,"511956,728354,568450,754044,1011255,741215,362...","302,351,812,1213,142,464,1028,35,1213,351,142,...","3098,3215,641,3736,1483,176,4848,641,3736,4547...","5545.0,5482.0,4265.0,3125.0,4640.0,6662.0,2803...","519,520,520,520,520,520,520,520,520,520,520,52...","3,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,...",173,56


In [17]:
#商品品类的数量
all_data_test=user_nunique(all_data_test,"cat_path","cat_nunique")

In [18]:
#不同品牌的个数
all_data_test=user_nunique(all_data_test,"brand_path","brand_nunique")
all_data_test.head()

Unnamed: 0,user_id,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique
0,1,"181459,779078,779078,452837,543397,504149,5041...","276,276,276,276,276,1023,1023,1023,1023,1252,1...","2245,2245,2245,2245,2245,925,925,925,925,4026,...","4750.0,4750.0,4750.0,4750.0,4750.0,7402.0,7402...","1009,1009,1009,1009,1009,1011,1011,1011,1011,1...","0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,2,2,...",33,9,6,9
1,2,"348983,749563,239288,751744,239288,714176,1972...","177,177,602,602,602,1213,602,602,602,1213,1213...","2223,2223,420,420,420,420,420,420,420,420,420,...","3273.0,3273.0,4953.0,4953.0,4953.0,4058.0,4953...","527,527,626,626,626,626,626,626,626,626,626,62...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,...",63,14,14,15
2,3,"895754,895754,182882,182882,985337,175397,9463...","1505,1505,1271,1271,1271,662,1134,1134,1134,45...","795,795,2123,2123,4925,1102,4461,4461,4461,474...","3608.0,3608.0,4796.0,4796.0,8005.0,1214.0,905....","516,516,627,627,627,727,819,819,820,906,906,90...","2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",68,23,19,22
3,4,"836727,243874,185489,95474,30073,790055,790055...","612,1577,1505,1505,1505,1505,1505,1505,1505,15...","1221,1221,1221,1221,1221,1221,1221,1221,1221,1...","7734.0,7734.0,7734.0,7734.0,7734.0,7734.0,7734...","527,527,527,527,527,527,527,527,527,527,527,52...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",50,12,13,12
4,5,"511956,728354,568450,754044,1011255,741215,362...","302,351,812,1213,142,464,1028,35,1213,351,142,...","3098,3215,641,3736,1483,176,4848,641,3736,4547...","5545.0,5482.0,4265.0,3125.0,4640.0,6662.0,2803...","519,520,520,520,520,520,520,520,520,520,520,52...","3,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,...",173,56,40,60


In [19]:
#不同商品的个数
all_data_test=user_nunique(all_data_test,"item_path","item_nunique")
all_data_test.head()

Unnamed: 0,user_id,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique
0,1,"181459,779078,779078,452837,543397,504149,5041...","276,276,276,276,276,1023,1023,1023,1023,1252,1...","2245,2245,2245,2245,2245,925,925,925,925,4026,...","4750.0,4750.0,4750.0,4750.0,4750.0,7402.0,7402...","1009,1009,1009,1009,1009,1011,1011,1011,1011,1...","0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,2,2,...",33,9,6,9,12
1,2,"348983,749563,239288,751744,239288,714176,1972...","177,177,602,602,602,1213,602,602,602,1213,1213...","2223,2223,420,420,420,420,420,420,420,420,420,...","3273.0,3273.0,4953.0,4953.0,4953.0,4058.0,4953...","527,527,626,626,626,626,626,626,626,626,626,62...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,...",63,14,14,15,43
2,3,"895754,895754,182882,182882,985337,175397,9463...","1505,1505,1271,1271,1271,662,1134,1134,1134,45...","795,795,2123,2123,4925,1102,4461,4461,4461,474...","3608.0,3608.0,4796.0,4796.0,8005.0,1214.0,905....","516,516,627,627,627,727,819,819,820,906,906,90...","2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",68,23,19,22,45
3,4,"836727,243874,185489,95474,30073,790055,790055...","612,1577,1505,1505,1505,1505,1505,1505,1505,15...","1221,1221,1221,1221,1221,1221,1221,1221,1221,1...","7734.0,7734.0,7734.0,7734.0,7734.0,7734.0,7734...","527,527,527,527,527,527,527,527,527,527,527,52...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",50,12,13,12,28
4,5,"511956,728354,568450,754044,1011255,741215,362...","302,351,812,1213,142,464,1028,35,1213,351,142,...","3098,3215,641,3736,1483,176,4848,641,3736,4547...","5545.0,5482.0,4265.0,3125.0,4640.0,6662.0,2803...","519,520,520,520,520,520,520,520,520,520,520,52...","3,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,...",173,56,40,60,87


In [20]:
#活跃天数
all_data_test=user_nunique(all_data_test,"time_stamp_path","time_stamp_nunique")
all_data_test.head()

Unnamed: 0,user_id,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique
0,1,"181459,779078,779078,452837,543397,504149,5041...","276,276,276,276,276,1023,1023,1023,1023,1252,1...","2245,2245,2245,2245,2245,925,925,925,925,4026,...","4750.0,4750.0,4750.0,4750.0,4750.0,7402.0,7402...","1009,1009,1009,1009,1009,1011,1011,1011,1011,1...","0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,2,2,...",33,9,6,9,12,5
1,2,"348983,749563,239288,751744,239288,714176,1972...","177,177,602,602,602,1213,602,602,602,1213,1213...","2223,2223,420,420,420,420,420,420,420,420,420,...","3273.0,3273.0,4953.0,4953.0,4953.0,4058.0,4953...","527,527,626,626,626,626,626,626,626,626,626,62...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,...",63,14,14,15,43,9
2,3,"895754,895754,182882,182882,985337,175397,9463...","1505,1505,1271,1271,1271,662,1134,1134,1134,45...","795,795,2123,2123,4925,1102,4461,4461,4461,474...","3608.0,3608.0,4796.0,4796.0,8005.0,1214.0,905....","516,516,627,627,627,727,819,819,820,906,906,90...","2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",68,23,19,22,45,13
3,4,"836727,243874,185489,95474,30073,790055,790055...","612,1577,1505,1505,1505,1505,1505,1505,1505,15...","1221,1221,1221,1221,1221,1221,1221,1221,1221,1...","7734.0,7734.0,7734.0,7734.0,7734.0,7734.0,7734...","527,527,527,527,527,527,527,527,527,527,527,52...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",50,12,13,12,28,10
4,5,"511956,728354,568450,754044,1011255,741215,362...","302,351,812,1213,142,464,1028,35,1213,351,142,...","3098,3215,641,3736,1483,176,4848,641,3736,4547...","5545.0,5482.0,4265.0,3125.0,4640.0,6662.0,2803...","519,520,520,520,520,520,520,520,520,520,520,52...","3,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,...",173,56,40,60,87,30


In [21]:
#同一用户产生的操作类型的数量
all_data_test=user_nunique(all_data_test,"action_type_path","action_type_nunique")
all_data_test.head()

Unnamed: 0,user_id,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique
0,1,"181459,779078,779078,452837,543397,504149,5041...","276,276,276,276,276,1023,1023,1023,1023,1252,1...","2245,2245,2245,2245,2245,925,925,925,925,4026,...","4750.0,4750.0,4750.0,4750.0,4750.0,7402.0,7402...","1009,1009,1009,1009,1009,1011,1011,1011,1011,1...","0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,2,2,...",33,9,6,9,12,5,2
1,2,"348983,749563,239288,751744,239288,714176,1972...","177,177,602,602,602,1213,602,602,602,1213,1213...","2223,2223,420,420,420,420,420,420,420,420,420,...","3273.0,3273.0,4953.0,4953.0,4953.0,4058.0,4953...","527,527,626,626,626,626,626,626,626,626,626,62...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,...",63,14,14,15,43,9,3
2,3,"895754,895754,182882,182882,985337,175397,9463...","1505,1505,1271,1271,1271,662,1134,1134,1134,45...","795,795,2123,2123,4925,1102,4461,4461,4461,474...","3608.0,3608.0,4796.0,4796.0,8005.0,1214.0,905....","516,516,627,627,627,727,819,819,820,906,906,90...","2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",68,23,19,22,45,13,3
3,4,"836727,243874,185489,95474,30073,790055,790055...","612,1577,1505,1505,1505,1505,1505,1505,1505,15...","1221,1221,1221,1221,1221,1221,1221,1221,1221,1...","7734.0,7734.0,7734.0,7734.0,7734.0,7734.0,7734...","527,527,527,527,527,527,527,527,527,527,527,52...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",50,12,13,12,28,10,2
4,5,"511956,728354,568450,754044,1011255,741215,362...","302,351,812,1213,142,464,1028,35,1213,351,142,...","3098,3215,641,3736,1483,176,4848,641,3736,4547...","5545.0,5482.0,4265.0,3125.0,4640.0,6662.0,2803...","519,520,520,520,520,520,520,520,520,520,520,52...","3,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,...",173,56,40,60,87,30,3


In [22]:
#用户最喜欢的品牌
all_data_test=user_most_n(all_data_test,"brand_path","brand_most_1",n=1)
all_data_test.head()

Unnamed: 0,user_id,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique,brand_most_1
0,1,"181459,779078,779078,452837,543397,504149,5041...","276,276,276,276,276,1023,1023,1023,1023,1252,1...","2245,2245,2245,2245,2245,925,925,925,925,4026,...","4750.0,4750.0,4750.0,4750.0,4750.0,7402.0,7402...","1009,1009,1009,1009,1009,1011,1011,1011,1011,1...","0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,2,2,...",33,9,6,9,12,5,2,-1
1,2,"348983,749563,239288,751744,239288,714176,1972...","177,177,602,602,602,1213,602,602,602,1213,1213...","2223,2223,420,420,420,420,420,420,420,420,420,...","3273.0,3273.0,4953.0,4953.0,4953.0,4058.0,4953...","527,527,626,626,626,626,626,626,626,626,626,62...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,...",63,14,14,15,43,9,3,-1
2,3,"895754,895754,182882,182882,985337,175397,9463...","1505,1505,1271,1271,1271,662,1134,1134,1134,45...","795,795,2123,2123,4925,1102,4461,4461,4461,474...","3608.0,3608.0,4796.0,4796.0,8005.0,1214.0,905....","516,516,627,627,627,727,819,819,820,906,906,90...","2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",68,23,19,22,45,13,3,-1
3,4,"836727,243874,185489,95474,30073,790055,790055...","612,1577,1505,1505,1505,1505,1505,1505,1505,15...","1221,1221,1221,1221,1221,1221,1221,1221,1221,1...","7734.0,7734.0,7734.0,7734.0,7734.0,7734.0,7734...","527,527,527,527,527,527,527,527,527,527,527,52...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",50,12,13,12,28,10,2,-1
4,5,"511956,728354,568450,754044,1011255,741215,362...","302,351,812,1213,142,464,1028,35,1213,351,142,...","3098,3215,641,3736,1483,176,4848,641,3736,4547...","5545.0,5482.0,4265.0,3125.0,4640.0,6662.0,2803...","519,520,520,520,520,520,520,520,520,520,520,52...","3,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,...",173,56,40,60,87,30,3,-1


In [23]:
#用户最喜欢的店铺行为次数
all_data_test=user_most_n_cnt(all_data_test,"seller_path","seller_most_1_cnt")
all_data_test.head()

Unnamed: 0,user_id,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique,brand_most_1,seller_most_1_cnt
0,1,"181459,779078,779078,452837,543397,504149,5041...","276,276,276,276,276,1023,1023,1023,1023,1252,1...","2245,2245,2245,2245,2245,925,925,925,925,4026,...","4750.0,4750.0,4750.0,4750.0,4750.0,7402.0,7402...","1009,1009,1009,1009,1009,1011,1011,1011,1011,1...","0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,2,2,...",33,9,6,9,12,5,2,-1,-1
1,2,"348983,749563,239288,751744,239288,714176,1972...","177,177,602,602,602,1213,602,602,602,1213,1213...","2223,2223,420,420,420,420,420,420,420,420,420,...","3273.0,3273.0,4953.0,4953.0,4953.0,4058.0,4953...","527,527,626,626,626,626,626,626,626,626,626,62...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,...",63,14,14,15,43,9,3,-1,-1
2,3,"895754,895754,182882,182882,985337,175397,9463...","1505,1505,1271,1271,1271,662,1134,1134,1134,45...","795,795,2123,2123,4925,1102,4461,4461,4461,474...","3608.0,3608.0,4796.0,4796.0,8005.0,1214.0,905....","516,516,627,627,627,727,819,819,820,906,906,90...","2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",68,23,19,22,45,13,3,-1,-1
3,4,"836727,243874,185489,95474,30073,790055,790055...","612,1577,1505,1505,1505,1505,1505,1505,1505,15...","1221,1221,1221,1221,1221,1221,1221,1221,1221,1...","7734.0,7734.0,7734.0,7734.0,7734.0,7734.0,7734...","527,527,527,527,527,527,527,527,527,527,527,52...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",50,12,13,12,28,10,2,-1,-1
4,5,"511956,728354,568450,754044,1011255,741215,362...","302,351,812,1213,142,464,1028,35,1213,351,142,...","3098,3215,641,3736,1483,176,4848,641,3736,4547...","5545.0,5482.0,4265.0,3125.0,4640.0,6662.0,2803...","519,520,520,520,520,520,520,520,520,520,520,52...","3,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,...",173,56,40,60,87,30,3,-1,-1
