In [2]:
import pickle
import os
import sys
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(BASE_DIR, '../../config'))
import numpy as np
import pandas as pd

class DIN_preprocess(object):
    """Re-index the raw id columns to contiguous integers and stage the tables.

    The constructor loads user_action.csv, feed_info.csv, feed_embeddings.csv
    and test_a.csv from the dataset directory. ``mapping_all_id`` rewrites the
    id columns of those frames in place, and ``save`` writes the mappings, a
    statistics pickle and the re-indexed tables under ``<FEATURE_PATH>/DIN``.
    """

    def __init__(self):
        # This cell never imports config at module level, so pull the two
        # settings in explicitly. Relies on the sys.path entry added at the
        # top of the cell.
        # NOTE(review): assumes config defines both names — confirm.
        from config import FEATURE_PATH, DATASET_PATH

        self.ROOT = FEATURE_PATH + '/DIN'
        self.RAW = self.ROOT + '/raw'
        self.PROCESSED = self.ROOT + '/processed'
        # Kept on the instance because mapping_all_id() merges these counts
        # into its result.
        # NOTE(review): presumably vocabulary sizes of the text-derived
        # features — confirm where the numbers come from.
        self.my_vocab_dict = {"count_ocr": 59775,
                              "count_asr": 59768,
                              "count_des": 41241,
                              "count_ocr_char": 20995,
                              "count_asr_char": 20870,
                              "count_des_char": 20988,
                              "count_tag": 350,
                              "count_key": 23262}

        # Not read anywhere in this cell.
        # NOTE(review): presumably per-feature sequence lengths — confirm.
        self.my_len_dict = {"ocr": 21,
                            "asr": 21,
                            "des": 20,
                            "ocr_char": 41,
                            "asr_char": 41,
                            "des_char": 41,
                            "tag": 11,
                            "key": 18}

        # makedirs creates the missing parent as well and tolerates reruns.
        os.makedirs(self.RAW, exist_ok=True)
        os.makedirs(self.PROCESSED, exist_ok=True)
        self.data_path = DATASET_PATH
        self.user_action, self.feed_info, self.feed_emb, self.test = self.load()

    def load(self):
        """Read the four raw CSV tables from ``self.data_path``.

        Returns:
            tuple: (user_action, feed_info, feed_emb, test) DataFrames.
        """
        user_action = pd.read_csv(self.data_path + '/user_action.csv')
        feed_info = pd.read_csv(self.data_path + '/feed_info.csv')
        feed_emb = pd.read_csv(self.data_path + '/feed_embeddings.csv')
        test = pd.read_csv(self.data_path + '/test_a.csv')
        return user_action, feed_info, feed_emb, test

    def get_id_mapping(self, mats, col, begin=1):
        """Build a raw-id -> contiguous-id lookup for one column.

        Args:
            mats: list of DataFrames that all contain ``col``.
            col: name of the id column to collect.
            begin: first value of the new contiguous id range.

        Returns:
            tuple: (count, mapping) where ``count`` is the number of distinct
            raw ids and ``mapping`` is a Series indexed by raw id whose int32
            values run from ``begin`` to ``begin + count - 1``.
        """
        columns = [mat[col] for mat in mats]
        ids = np.unique(np.concatenate(columns, axis=-1))
        count = ids.shape[0]
        map_ids = np.arange(begin, begin + count, dtype=np.int32)
        mapid_series = pd.Series(index=ids, data=map_ids)
        return count, mapid_series

    def map_id(self, mat, mapping, col):
        """Replace ``mat[col]`` in place with its mapped ids and return ``mat``.

        Each value is looked up by label in ``mapping``, so an id that is
        absent from the mapping raises instead of silently becoming NaN.
        """
        mat[col] = mat[col].map(lambda raw_id: mapping[raw_id])
        return mat

    def mapping_all_id(self):
        """Re-index every id column of the loaded frames, starting from 0.

        Returns:
            dict: the counts ``count_uid``, ``count_fid``, ``count_aid``,
            ``count_songid`` and ``count_singerid``, the five ``*_series``
            mappings, plus every entry of ``self.my_vocab_dict``.
        """
        count_uid, uid_series = self.get_id_mapping([self.user_action, self.test], 'userid', begin=0)
        count_fid, fid_series = self.get_id_mapping([self.feed_info], 'feedid', begin=0)
        count_aid, aid_series = self.get_id_mapping([self.feed_info], 'authorid', begin=0)
        _, songid_series = self.get_id_mapping([self.feed_info], 'bgm_song_id', begin=0)
        _, singerid_series = self.get_id_mapping([self.feed_info], 'bgm_singer_id', begin=0)
        # Keep one entry per index label so a lookup returns a scalar.
        # NOTE(review): presumably guards against repeated NaN labels coming
        # out of np.unique for missing bgm ids — confirm.
        songid_series = songid_series[~songid_series.index.duplicated(keep='first')]
        singerid_series = singerid_series[~singerid_series.index.duplicated(keep='first')]
        count_songid = songid_series.values.shape[0]
        count_singerid = singerid_series.values.shape[0]

        self.user_action = self.map_id(self.user_action, uid_series, 'userid')
        self.user_action = self.map_id(self.user_action, fid_series, 'feedid')
        self.feed_info = self.map_id(self.feed_info, fid_series, 'feedid')
        self.feed_info = self.map_id(self.feed_info, aid_series, 'authorid')
        self.feed_info = self.map_id(self.feed_info, songid_series, 'bgm_song_id')
        self.feed_info = self.map_id(self.feed_info, singerid_series, 'bgm_singer_id')
        self.feed_emb = self.map_id(self.feed_emb, fid_series, 'feedid')
        self.test = self.map_id(self.test, fid_series, 'feedid')
        self.test = self.map_id(self.test, uid_series, 'userid')

        result = {'count_uid': count_uid, 'count_fid': count_fid, 'count_aid': count_aid,
                  'count_singerid': count_singerid, 'count_songid': count_songid,
                  'uid_series': uid_series, 'fid_series': fid_series, 'aid_series': aid_series,
                  'songid_series': songid_series, 'singerid_series': singerid_series}
        result.update(self.my_vocab_dict)
        return result

    def format_mapping(self, df):
        """Turn a mapping Series into a two-column frame: ``src`` (raw id) and ``des`` (new id)."""
        index = pd.Series(df.index, name='src')
        value = pd.Series(df.values, name='des')
        return pd.concat([index, value], axis=1)

    def save(self, dict_data):
        """Write the statistics pickle, the id mappings and the re-indexed tables.

        Args:
            dict_data: the dict returned by ``mapping_all_id``.
        """
        # pickle needs a binary, writable handle.
        with open(self.PROCESSED + '/statistic.pkl', 'wb') as f:
            # Tag and similar information is not counted yet.
            pickle.dump([dict_data['count_uid'],
                         dict_data['count_fid'],
                         dict_data['count_aid'],
                         dict_data['count_songid'],
                         dict_data['count_singerid'],
                         self.user_action['device'].unique().shape[0]], f, pickle.HIGHEST_PROTOCOL)

        mapping_files = (('uid_series', '/uid_mapping.csv'),
                         ('fid_series', '/fid_mapping.csv'),
                         ('aid_series', '/aid_mapping.csv'),
                         ('songid_series', '/songid_mapping.csv'),
                         ('singerid_series', '/singerid_mapping.csv'))
        for key, filename in mapping_files:
            self.format_mapping(dict_data[key]).to_csv(self.RAW + filename, index=False)

        self.user_action.to_csv(self.RAW + '/user_action.csv', index=False)
        self.feed_info.to_csv(self.RAW + '/feed_info.csv', index=False)
        self.feed_emb.to_csv(self.RAW + '/feed_embeddings.csv', index=False)
        self.test.to_csv(self.RAW + '/test_a.csv', index=False)
if __name__ == '__main__':
    # Run the whole preprocessing step: load the raw CSVs, re-index every id
    # column, then write the mappings, statistics and re-indexed tables to disk.
    preprocess = DIN_preprocess()
    dict_data = preprocess.mapping_all_id()
    preprocess.save(dict_data)

In [None]:
import os
import sys
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(BASE_DIR, '../../config'))
from config import *
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Run this after Preprocess.py: it reads the CSVs written to FEATURE_PATH/DIN/raw.
class DIN_Process(object):
    """Build the training/test tables and the feed-embedding matrix for DIN.

    Reads the staged CSVs from ``FEATURE_PATH/DIN/raw`` and, when ``process``
    is called, writes data.csv, test.csv and the embedding matrix to
    ``FEATURE_PATH/DIN/processed``.

    Args:
        pca_dim: number of PCA components kept for the feed embeddings; pass
            ``None`` to keep the embeddings at their original width.
    """

    def __init__(self, pca_dim=64):
        self.pca_dim = pca_dim
        self.RAW = FEATURE_PATH + '/DIN/raw'
        self.PROCESSED = FEATURE_PATH + '/DIN/processed'
        self.user_action, self.feed_info, self.feed_emb, self.test = self.load()
        # Columns kept in both output tables, in this order.
        self.use_cols = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id',
                         'videoplayseconds', 'device', 'date_',
                         'read_comment', 'like', 'comment', 'click_avatar', 'forward', 'follow', 'favorite']

    def load(self):
        """Read the four staged CSV tables from ``self.RAW``.

        Returns:
            tuple: (user_action, feed_info, feed_emb, test) DataFrames.
        """
        user_action = pd.read_csv(self.RAW + '/user_action.csv')
        feed_info = pd.read_csv(self.RAW + '/feed_info.csv')
        feed_emb = pd.read_csv(self.RAW + '/feed_embeddings.csv')
        test = pd.read_csv(self.RAW + '/test_a.csv')
        return user_action, feed_info, feed_emb, test

    # Overall processing flow.
    def process(self):
        """Run feature engineering, join feed metadata, zero-fill and save.

        Meant to be called once per instance: ``process_feed_emb`` replaces
        ``self.feed_emb`` with an ndarray and ``self.test`` is overwritten
        with the joined table.
        """
        self.feature_eng(self.pca_dim)
        train_joined = self.user_action.merge(self.feed_info, on='feedid', how='left')
        self.train = train_joined[self.use_cols].fillna(0)
        test_joined = self.test.merge(self.feed_info, on='feedid', how='left')
        # The test CSV shown later in this notebook has only userid, feedid
        # and device, so a plain column selection raises KeyError. reindex
        # keeps the same schema as the training table and the missing columns
        # are then zero-filled like every other missing value.
        self.test = test_joined.reindex(columns=self.use_cols).fillna(0)
        self.save_all()

    # Feature engineering.
    def feature_eng(self, pca_dim):
        """Apply the embedding reduction and the play-length normalisation."""
        self.process_feed_emb(pca_dim)
        self.process_videoplayseconds()

    # Dimensionality reduction of the feed embeddings.
    def process_feed_emb(self, pca_dim=None):
        """Replace ``self.feed_emb`` with a dense matrix ordered by feedid.

        Each ``feed_embedding`` cell is a whitespace-separated list of numbers.
        Row 0 of the result is an all-zero vector, so row ``i + 1`` holds the
        i-th feed in ascending ``feedid`` order.

        NOTE(review): the preprocessing cell numbers feedids from 0, so
        indexing this matrix directly by feedid would be off by one unless the
        consumer adds 1 — confirm against the model code.

        Args:
            pca_dim: number of PCA components, or ``None`` to skip PCA.
        """
        ordered = self.feed_emb.sort_values(by=['feedid'])
        # float() parses the same numeric literals as the previous eval() call
        # without executing text read from a file.
        rows = [[float(token) for token in emb_list.split()]
                for emb_list in ordered['feed_embedding']]
        np_embeddings = np.array(rows, dtype=np.float32)

        if pca_dim is None:
            result = np_embeddings
        else:
            pca = PCA(n_components=pca_dim)
            pca.fit(np_embeddings)
            result = pca.transform(np_embeddings)
        zero_pad = np.zeros((1, result.shape[-1]), dtype=np.float32)
        self.feed_emb = np.concatenate([zero_pad, result], axis=0)

    # Normalisation.
    def process_videoplayseconds(self):
        """Min-max scale ``feed_info['videoplayseconds']`` into [0, 1)."""
        eps = 1e-9  # keeps the denominator non-zero when all values are equal
        val = self.feed_info['videoplayseconds']
        norm = (val - val.min()) / (val.max() - val.min() + eps)
        self.feed_info['videoplayseconds'] = norm

    def save_all(self):
        """Write data.csv, test.csv and the embedding matrix to ``self.PROCESSED``."""
        # Do not rely on the preprocessing step having created the directory.
        os.makedirs(self.PROCESSED, exist_ok=True)
        self.train.to_csv(self.PROCESSED + '/data.csv', index=False)
        self.test.to_csv(self.PROCESSED + '/test.csv', index=False)
        np.save(self.PROCESSED + '/feed_embeddings{0}.npy'.format(self.pca_dim), self.feed_emb)

if __name__ == '__main__':
    # Constructing the object only loads the CSVs; process() is what runs the
    # feature engineering and writes the output files.
    DIN_Process(pca_dim=64).process()

In [44]:
# Print the kernel's working directory; the relative '../../data' paths in the cells below resolve against it.
!pwd

/home/tione/notebook/wbdc2021-preliminary-48c2b28c233f4934b362696daef770e4/src/prepare


In [45]:
# Load the staged user_action and feed_info tables for ad-hoc inspection.
path = '../../data/wedata/feature/DIN/raw/'
u = pd.read_csv(path + 'user_action.csv')
f = pd.read_csv(path + 'feed_info.csv')
# NOTE(review): the cells below use `s`, which no visible cell defines. Its
# column listing (user_action columns followed by the feed_info columns)
# suggests this left join on feedid — confirm against the original session.
s = u.merge(f, on='feedid', how='left')

In [81]:
# Count of non-null feedid values per userid, returned as a NumPy array.
s.groupby('userid')['feedid'].count().values

array([260, 659, 249, ..., 324, 508, 191])

In [61]:
# Keep the per-user GroupBy object so it can be reused.
a = s.groupby('userid')

In [67]:
# Materialise the groups as a list of (userid, feedid Series) pairs.
b = list(a['feedid'])

In [76]:
# Count the missing values in each column of `s`.
s.isna().sum()

userid                        0
feedid                        0
date_                         0
device                        0
read_comment                  0
comment                       0
like                          0
play                          0
stay                          0
click_avatar                  0
forward                       0
follow                        0
favorite                      0
authorid                      0
videoplayseconds              0
description              219807
ocr                     1886890
asr                     1721179
bgm_song_id                   0
bgm_singer_id                 0
manual_keyword_list     2255381
machine_keyword_list     848552
manual_tag_list            6707
machine_tag_list          28759
description_char         217989
ocr_char                1905399
asr_char                1721179
dtype: int64

In [1]:
import pandas as pd
import datatable as dt

In [2]:
# Load the interaction log from the wechat_algo_data2 dataset directory.
d = pd.read_csv('../../data/wedata/wechat_algo_data2/user_action.csv')

In [3]:
# Display the frame; the notebook renders only the first and last rows.
d

Unnamed: 0,userid,feedid,date_,device,read_comment,comment,like,play,stay,click_avatar,forward,follow,favorite
0,0,99073,2,1,0,0,0,2750,3027,0,0,0,0
1,0,23598,2,1,0,0,0,1500,2044,0,0,0,0
2,0,12361,2,1,0,0,0,27250,27601,0,0,0,0
3,0,3867,2,1,0,0,0,1500,1873,0,0,0,0
4,0,97482,2,1,0,0,0,14250,14866,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
73175506,250248,102903,14,2,0,0,0,953,4768,0,0,0,0
73175507,250248,69675,14,2,0,0,0,5187,5502,0,0,0,0
73175508,250248,19208,14,2,0,0,0,4039,4105,0,0,0,0
73175509,250248,37682,14,2,0,0,0,9202,10035,0,0,0,0


In [5]:
# Number of distinct userid values in the interaction log.
d['userid'].nunique()

199999

In [6]:
# Load the test set from the same dataset directory.
t = pd.read_csv('../../data/wedata/wechat_algo_data2/test_a.csv')

In [7]:
# Display the test frame; it carries only userid, feedid and device.
t

Unnamed: 0,userid,feedid,device
0,175282,50458,2
1,80036,42329,2
2,145791,85242,2
3,28430,9425,1
4,44393,11866,2
...,...,...,...
4252092,153322,51633,2
4252093,39430,20147,2
4252094,2524,89043,2
4252095,69629,27238,2
