In [5]:
# List the files in the datalab data directory
!ls datalab/231718

Antai_AE_round1_item_attr_20190626.zip	Antai_AE_round1_train_20190626.zip
Antai_AE_round1_test_20190626.csv


In [2]:
# List the files in your permanent storage space
!ls /home/tianchi/myspace/


sub.csv


In [None]:
# List the packages installed in the current kernel
!pip list --format=columns

In [4]:
# Import the required packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import gc

# Show floats in DataFrame output with two decimal places.
pd.set_option('display.float_format',lambda x : '%.2f' % x)
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use whichever name this installation provides.
plt.style.use('seaborn-dark' if 'seaborn-dark' in plt.style.available else 'seaborn-v0_8-dark')
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus sign.
plt.rcParams['axes.unicode_minus']=False
# Default figure size in inches (width, height).
plt.rcParams['figure.figsize'] = (10.0, 5.0)
# Prefer the SimHei font for sans-serif text (presumably so Chinese labels render).
plt.rcParams['font.sans-serif'] = ['SimHei']

In [5]:
# Directory holding the competition files.
path = './datalab/231718/'
item = pd.read_csv(path+'Antai_AE_round1_item_attr_20190626.zip')
train = pd.read_csv(path+'Antai_AE_round1_train_20190626.zip')
test = pd.read_csv(path+'Antai_AE_round1_test_20190626.csv')
# Stack train and test into one frame; is_train (1/0) records which file a row came from.
df = pd.concat([train.assign(is_train=1), test.assign(is_train=0)])
# Drop the two source frames and trigger a collection before building more columns.
del train,test; gc.collect()

# Parse the order timestamp and derive calendar date, day-of-month and hour columns.
df['create_order_time'] = pd.to_datetime(df['create_order_time'])
df['date'] = df['create_order_time'].dt.date
df['day'] = df['create_order_time'].dt.day
df['hour'] = df['create_order_time'].dt.hour
# Attach the item attributes; the left join keeps orders whose item_id has no row in `item`.
df = pd.merge(df, item, how='left', on='item_id')

# Memory footprint in MB before the dtype downcast.
memory = df.memory_usage().sum() / 1024**2 
print('Before memory usage of properties dataframe is :', memory, " MB")

# Narrower integer dtypes used to shrink the frame.
# NOTE(review): int16 tops out at 32767 - confirm item_price and cate_id never exceed it.
dtype_dict = {'buyer_admin_id' : 'int32', 
              'item_id' : 'int32', 
              'store_id' : 'int32',
              'irank' : 'int16',
              'item_price' : 'int16',
              'cate_id' : 'int16',
              'is_train' : 'int8',
              'day' : 'int8',
              'hour' : 'int8',
             }

# Missing values are replaced by 0 first because the integer casts cannot represent NaN.
df = df.fillna(0).astype(dtype_dict)
# Memory footprint in MB after the downcast.
memory = df.memory_usage().sum() / 1024**2 
print('After memory usage of properties dataframe is :', memory, " MB")

# Split back into train/test rows and keep the train rows whose buyer_country_id is 'yy'.
train = df[df['is_train']==1]
test = df[df['is_train']==0]
trainyy = train[train.buyer_country_id =='yy']

Before memory usage of properties dataframe is : 1292.8728714  MB
After memory usage of properties dataframe is : 658.867905617  MB


In [6]:
df.head()

Unnamed: 0,buyer_country_id,buyer_admin_id,item_id,create_order_time,irank,is_train,date,day,hour,cate_id,store_id,item_price
0,xx,8362078,1,2018-08-10 23:49:44,12,1,2018-08-10,10,23,2324,10013,4501
1,xx,9694304,2,2018-08-03 23:55:07,9,1,2018-08-03,3,23,3882,4485,2751
2,yy,101887,3,2018-08-27 08:31:26,3,1,2018-08-27,27,8,155,8341,656
3,xx,8131786,3,2018-08-31 06:00:19,9,1,2018-08-31,31,6,155,8341,656
4,xx,9778613,5,2018-08-21 06:01:56,14,1,2018-08-21,21,6,1191,1949,1689


In [16]:
def groupby_cnt_ratio(df, col):
    """Count rows per (is_train, buyer_country_id, *col) group.

    Parameters
    ----------
    df : DataFrame containing 'is_train', 'buyer_country_id' and the columns in `col`.
    col : a single column name, or a list of column names, to group by in
        addition to 'is_train' and 'buyer_country_id'.

    Returns
    -------
    DataFrame with one row per group, the group keys as columns plus a 'count'
    column, ordered by 'count' descending. Despite the function name, no ratio
    column is produced.
    """
    extra_keys = [col] if isinstance(col, str) else col
    group_keys = ['is_train', 'buyer_country_id'] + extra_keys
    sizes = df.groupby(group_keys).size()
    table = sizes.to_frame('count').reset_index()
    return table.sort_values(by=['count'], ascending=False)
#     ratio_stat = (cnt_stat / cnt_stat.groupby(['is_train', 'buyer_country_id']).transform(pd.Series.sum)).rename(columns={'count':'count_ratio'})
#     return pd.merge(cnt_stat.reset_index(), ratio_stat.reset_index(), on=key, how='outer').sort_values(by=['count'], ascending=False)

In [8]:
# NOTE(review): the original comments on this cell announced filtering out items a
# buyer purchased repeatedly and building association features, but the only code
# here restricts df to the rows whose buyer_country_id is 'yy'.
yy = df[df.buyer_country_id =='yy']


In [17]:
admin_cnt = groupby_cnt_ratio(yy, ['buyer_admin_id', 'item_id'])
admin_cnt.head()

Unnamed: 0,is_train,buyer_country_id,buyer_admin_id,item_id,count
1800549,1,yy,2381782,10389160,1271
109753,0,yy,2041038,5595070,963
1601201,1,yy,2045158,6428237,804
1800542,1,yy,2381782,5355362,559
1821037,1,yy,12355554,9389408,556


In [7]:
import numpy as np
import pandas as pd
import os 
from tqdm import tqdm_notebook
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

# Directory holding the competition files.
path = './datalab/231718/'
# Commented-out reads of plain .csv versions of the files (plus a submit file).
# item   = pd.read_csv(path+'Antai_AE_round1_item_attr_20190626.csv')
# submit = pd.read_csv(path+'Antai_AE_round1_submit_20190715.csv', header=None)
# test   = pd.read_csv(path+'Antai_AE_round1_test_20190626.csv')
# train  = pd.read_csv(path+'Antai_AE_round1_train_20190626.csv')


# Re-read the raw files; this rebinds the names `item`, `train` and `test`.
item = pd.read_csv(path+'Antai_AE_round1_item_attr_20190626.zip')
train = pd.read_csv(path+'Antai_AE_round1_train_20190626.zip')
test = pd.read_csv(path+'Antai_AE_round1_test_20190626.csv')

def get_preprocessing(df_):
    """Replace the 'create_order_time' string column with numeric calendar columns.

    The timestamp text is sliced by fixed character positions, so it must look
    like 'YYYY-MM-DD HH:MM:SS'.

    Parameters
    ----------
    df_ : DataFrame, normally containing a 'create_order_time' column. It is
        never modified.

    Returns
    -------
    A copy of `df_` with integer columns 'hour', 'day', 'month', 'year' and
    'date' added and 'create_order_time' removed. 'date' is a running day
    index that treats every month as 31 days and starts at July
    (July 1 -> 1, August 10 -> 41). If `df_` has no 'create_order_time'
    column it is taken to be already preprocessed and an unchanged copy is
    returned.
    """
    df = df_.copy()
    # This notebook calls the function on `train`/`test` in more than one cell;
    # a second call used to raise KeyError because the column was already removed.
    if 'create_order_time' not in df.columns:
        return df
    stamp = df['create_order_time'].astype(str)
    # Vectorised slicing of the timestamp text instead of a Python lambda per row.
    df['hour']  = stamp.str[11:13].astype('int64')
    df['day']   = stamp.str[8:10].astype('int64')
    df['month'] = stamp.str[5:7].astype('int64')
    df['year']  = stamp.str[0:4].astype('int64')
    df['date']  = (df['month'].values - 7) * 31 + df['day']
    del df['create_order_time']
    return df

# Replace the timestamp column of both frames with hour/day/month/year/date columns.
train = get_preprocessing(train)
test  = get_preprocessing(test)

# High-frequency item_id ranking, built from the train rows of country 'yy'.
temp = train.loc[train.buyer_country_id=='yy']
# Keep one row per (buyer, item) pair so repeat purchases by one buyer count once.
temp = temp.drop_duplicates(subset=['buyer_admin_id','item_id'], keep='first')
# cnts = number of rows left per item_id, i.e. number of distinct buyers of that item.
item_cnts = temp.groupby(['item_id']).size().reset_index()
item_cnts.columns = ['item_id','cnts']
item_cnts = item_cnts.sort_values('cnts', ascending=False)
# Plain Python list of item ids, most frequent first; read by item_fillna as a global.
items = item_cnts['item_id'].values.tolist()

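# Many buyers have fewer than 30 items in their history, so each list is padded up to 30.
# Padding uses item_ids ranked by frequency in the train data of country 'yy', taken in order.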
def item_fillna(tmp_):
    """Return a copy of `tmp_` truncated or padded to 30 item ids.

    Parameters
    ----------
    tmp_ : list of item ids. It is never modified.

    Returns
    -------
    A new list. Lists with 30 or more entries keep their first 30. Shorter
    lists are padded with entries from the module-level `items` list, taken in
    order and skipping ids already in the result. If `items` runs out before 30
    entries are reached, the shorter list is returned instead of raising
    IndexError.
    """
    tmp = tmp_.copy()
    if len(tmp) >= 30:
        return tmp[:30]
    # Walk the global ranking read-only: the previous version copied the whole
    # `items` list on every call and consumed it with O(n) pop(0) operations.
    seen = set(tmp)
    for it in items:
        if it in seen:
            continue
        tmp.append(it)
        seen.add(it)
        if len(tmp) == 30:
            break
    return tmp

# Get the top-30 items for each buyer
def get_item_list(df_):
    """Build a dict mapping each buyer_admin_id to its list of item ids.

    Parameters
    ----------
    df_ : DataFrame with 'buyer_admin_id' and 'item_id' columns. Rows are read
        in their existing order, so sort beforehand to control the order of
        each buyer's items. The frame is not modified.

    Returns
    -------
    dict keyed by buyer_admin_id. Each value is that buyer's item ids in
    first-seen order with duplicates removed, passed through `item_fillna`.

    The previous implementation finalised a buyer's list only when the next
    buyer appeared, so the last buyer in the frame was returned raw: not
    deduplicated and not padded.
    """
    dic = {}
    for admin_id, item_id in df_[['buyer_admin_id','item_id']].values:
        dic.setdefault(admin_id, []).append(item_id)

    for admin_id in list(dic):
        # Remove duplicates while keeping the first occurrence of each item.
        seen = set()
        unique = []
        for it in dic[admin_id]:
            if it not in seen:
                seen.add(it)
                unique.append(it)
        # Pad / truncate via the sibling helper.
        dic[admin_id] = item_fillna(unique)

    return dic

# Order rows by buyer and then by ascending irank, so each buyer's items reach
# get_item_list in irank order.
test = test.sort_values(['buyer_admin_id','irank'])
dic = get_item_list(test)

# Final submission: one row per buyer, the buyer id followed by 30 item ids.
# reset_index() turns the dict keys (buyer ids) into the first column.
temp = pd.DataFrame({'lst':dic}).reset_index()
# Spread list positions 0..29 into separate columns; every list needs at least 30 entries.
for i in range(30):
    temp[i] = temp['lst'].apply(lambda x:x[i])
del temp['lst']
# Written to the working directory without index or header row.
temp.to_csv('submission.csv',index=False,header=None)

In [9]:
# ! mv /home/tianchi/submission.csv /home/tianchi/myspace/

In [12]:
train = get_preprocessing(train)
test  = get_preprocessing(test)

In [14]:
train.head()

Unnamed: 0,buyer_country_id,buyer_admin_id,item_id,irank,hour,day,month,year,date
0,xx,8362078,1,12,23,10,8,2018,41
1,xx,9694304,2,9,23,3,8,2018,34
2,yy,101887,3,3,8,27,8,2018,58
3,xx,8131786,3,9,6,31,8,2018,62
4,xx,9778613,5,14,6,21,8,2018,52


In [16]:
# High-frequency item_id: start from the train rows of country 'yy'.
temp = train.loc[train.buyer_country_id=='yy']
# Drop duplicates so each (buyer, item) pair is kept once.
temp = temp.drop_duplicates(subset=['buyer_admin_id','item_id'], keep='first')

In [2]:

# Count rows per item_id in `temp` and sort by that count, largest first.
# `temp` must already exist in the kernel; this cell does not define it.
item_cnts = temp.groupby(['item_id']).size().reset_index()
item_cnts.columns = ['item_id','cnts']
item_cnts = item_cnts.sort_values('cnts', ascending=False)
# item_cnts


NameError: name 'temp' is not defined

In [21]:
# item_cnts['item_id'].values.tolist()
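# Many buyers have fewer than 30 items in their history, so each list is padded up to 30.
# Padding uses item_ids ranked by frequency in the train data of country 'yy', taken in order.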
def item_fillna(tmp_):
    """Return a copy of `tmp_` truncated or padded to 30 item ids.

    Parameters
    ----------
    tmp_ : list of item ids. It is never modified.

    Returns
    -------
    A new list. Lists with 30 or more entries keep their first 30. Shorter
    lists are padded with entries from the module-level `items` list, taken in
    order and skipping ids already in the result. If `items` runs out before 30
    entries are reached, the shorter list is returned instead of raising
    IndexError.
    """
    tmp = tmp_.copy()
    if len(tmp) >= 30:
        return tmp[:30]
    # Walk the global ranking read-only: the previous version copied the whole
    # `items` list on every call and consumed it with O(n) pop(0) operations.
    seen = set(tmp)
    for it in items:
        if it in seen:
            continue
        tmp.append(it)
        seen.add(it)
        if len(tmp) == 30:
            break
    return tmp

# Get the top-30 items for each buyer
def get_item_list(df_):
    """Build a dict mapping each buyer_admin_id to its list of item ids.

    Parameters
    ----------
    df_ : DataFrame with 'buyer_admin_id' and 'item_id' columns. Rows are read
        in their existing order, so sort beforehand to control the order of
        each buyer's items. The frame is not modified.

    Returns
    -------
    dict keyed by buyer_admin_id. Each value is that buyer's item ids in
    first-seen order with duplicates removed, passed through `item_fillna`.

    The previous implementation finalised a buyer's list only when the next
    buyer appeared, so the last buyer in the frame was returned raw: not
    deduplicated and not padded.
    """
    dic = {}
    for admin_id, item_id in df_[['buyer_admin_id','item_id']].values:
        dic.setdefault(admin_id, []).append(item_id)

    for admin_id in list(dic):
        # Remove duplicates while keeping the first occurrence of each item.
        seen = set()
        unique = []
        for it in dic[admin_id]:
            if it not in seen:
                seen.add(it)
                unique.append(it)
        # Pad / truncate via the sibling helper.
        dic[admin_id] = item_fillna(unique)

    return dic

test = test.sort_values(['buyer_admin_id','irank'])
# dic = get_item_list(test)

In [23]:
dic = get_item_list(test)

In [1]:
# 最终提交
# Final submission: one row per buyer, the buyer id followed by 30 item ids.
# reset_index() turns the dict keys (buyer ids) into the first column.
temp = pd.DataFrame({'lst':dic}).reset_index()
# Spread list positions 0..29 into separate columns; every list needs at least 30 entries.
for i in range(30):
    temp[i] = temp['lst'].apply(lambda x:x[i])
del temp['lst']
# Written to the working directory without index or header row.
temp.to_csv('submission.csv',index=False,header=None)

NameError: name 'pd' is not defined