In [2]:
import os
import pickle
import sys

import numpy as np
import pandas as pd

# NOTE(review): __file__ is not defined when this runs as a notebook cell; it only works as a script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(BASE_DIR, '../../config'))
# Must come after the sys.path tweak above; provides the path constants used by DIN_preprocess.
from config import DATASET_PATH, FEATURE_PATH

class DIN_preprocess(object):
    """Replace the raw ids of the DIN source tables with dense integer ids.

    Construction reads the four source CSVs from ``DATASET_PATH`` and makes sure
    the output directories under ``FEATURE_PATH/DIN`` exist.
    ``mapping_all_id`` rewrites the id columns of the loaded frames in place and
    ``save`` writes the re-indexed tables, the id mappings and a statistics
    pickle to disk.
    """

    # Counts keyed by ``count_<field>`` (presumably vocabulary sizes of the text
    # fields -- TODO confirm). Merged verbatim into the result of mapping_all_id().
    my_vocab_dict = {"count_ocr": 59775,
                     "count_asr": 59768,
                     "count_des": 41241,
                     "count_ocr_char": 20995,
                     "count_asr_char": 20870,
                     "count_des_char": 20988,
                     "count_tag": 350,
                     "count_key": 23262}

    # Lengths keyed by field name (presumably sequence lengths -- TODO confirm).
    # Not read anywhere in this class.
    my_len_dict = {"ocr": 21,
                   "asr": 21,
                   "des": 20,
                   "ocr_char": 41,
                   "asr_char": 41,
                   "des_char": 41,
                   "tag": 11,
                   "key": 18}

    def __init__(self):
        """Set up output directories and load the source tables."""
        self.ROOT = FEATURE_PATH + '/DIN'
        self.RAW = self.ROOT + '/raw'
        self.PROCESSED = self.ROOT + '/processed'

        # makedirs also creates ROOT and is a no-op on reruns; the previous
        # os.mkdir calls failed when ROOT was missing and were skipped entirely
        # when ROOT existed without its sub-directories.
        os.makedirs(self.RAW, exist_ok=True)
        os.makedirs(self.PROCESSED, exist_ok=True)

        self.data_path = DATASET_PATH
        self.user_action, self.feed_info, self.feed_emb, self.test = self.load()

    def load(self):
        """Read the four source CSVs from ``self.data_path``.

        Returns:
            tuple: ``(user_action, feed_info, feed_emb, test)`` DataFrames.
        """
        user_action = pd.read_csv(self.data_path + '/user_action.csv')
        feed_info = pd.read_csv(self.data_path + '/feed_info.csv')
        feed_emb = pd.read_csv(self.data_path + '/feed_embeddings.csv')
        test = pd.read_csv(self.data_path + '/test_a.csv')
        return user_action, feed_info, feed_emb, test

    def get_id_mapping(self, mats, col, begin=1):
        """Build a raw-id -> dense-id lookup for one column across several frames.

        Args:
            mats: iterable of DataFrames that all contain ``col``.
            col: name of the id column.
            begin: first dense id to hand out.

        Returns:
            tuple: ``(count, mapping)`` where ``count`` is the number of distinct
            raw ids and ``mapping`` is an int32 Series indexed by the sorted raw
            ids with values ``begin .. begin + count - 1``.
        """
        columns = [mat[col] for mat in mats]
        ids = np.unique(np.concatenate(columns, axis=-1))
        count = ids.shape[0]
        map_ids = np.arange(begin, begin + count, dtype=np.int32)
        mapid_series = pd.Series(index=ids, data=map_ids)
        return count, mapid_series

    def map_id(self, mat, mapping, col):
        """Overwrite ``mat[col]`` with its dense ids and return ``mat``.

        The frame is modified in place. A raw id that is missing from
        ``mapping`` raises ``KeyError``.
        """
        mat[col] = mat[col].map(lambda raw_id: mapping[raw_id])
        return mat

    def mapping_all_id(self):
        """Re-index every id column of the loaded frames.

        The loaded frames are rewritten in place, so calling this twice on the
        same instance maps already-mapped ids.

        Returns:
            dict: the ``count_*`` totals, the ``*_series`` lookup tables and the
            entries of ``my_vocab_dict``.
        """
        count_uid, uid_series = self.get_id_mapping([self.user_action, self.test], 'userid', begin=0)
        count_fid, fid_series = self.get_id_mapping([self.feed_info], 'feedid', begin=0)
        count_aid, aid_series = self.get_id_mapping([self.feed_info], 'authorid', begin=0)
        _, songid_series = self.get_id_mapping([self.feed_info], 'bgm_song_id', begin=0)
        _, singerid_series = self.get_id_mapping([self.feed_info], 'bgm_singer_id', begin=0)

        # Keep one entry per raw id so the lookups are unambiguous (np.unique may
        # keep repeated NaN entries on some numpy versions).
        songid_series = songid_series[~songid_series.index.duplicated(keep='first')]
        singerid_series = singerid_series[~singerid_series.index.duplicated(keep='first')]
        count_songid = songid_series.values.shape[0]
        count_singerid = singerid_series.values.shape[0]

        self.user_action = self.map_id(self.user_action, uid_series, 'userid')
        self.user_action = self.map_id(self.user_action, fid_series, 'feedid')
        self.feed_info = self.map_id(self.feed_info, fid_series, 'feedid')
        self.feed_info = self.map_id(self.feed_info, aid_series, 'authorid')
        self.feed_info = self.map_id(self.feed_info, songid_series, 'bgm_song_id')
        self.feed_info = self.map_id(self.feed_info, singerid_series, 'bgm_singer_id')
        self.feed_emb = self.map_id(self.feed_emb, fid_series, 'feedid')
        self.test = self.map_id(self.test, fid_series, 'feedid')
        self.test = self.map_id(self.test, uid_series, 'userid')

        result = {'count_uid': count_uid, 'count_fid': count_fid, 'count_aid': count_aid,
                  'count_singerid': count_singerid, 'count_songid': count_songid,
                  'uid_series': uid_series, 'fid_series': fid_series, 'aid_series': aid_series,
                  'songid_series': songid_series, 'singerid_series': singerid_series}
        result.update(self.my_vocab_dict)
        return result

    def format_mapping(self, df):
        """Turn a lookup Series into a two-column frame: ``src`` (index) and ``des`` (values)."""
        index = pd.Series(df.index, name='src')
        value = pd.Series(df.values, name='des')
        return pd.concat([index, value], axis=1)

    def save(self, dict_data):
        """Persist the statistics, the id mappings and the re-indexed tables.

        Args:
            dict_data: the dict returned by ``mapping_all_id``.
        """
        # pickle writes bytes, so the file has to be opened in binary write mode.
        with open(self.PROCESSED + '/statistic.pkl', 'wb') as f:
            # Tag and similar information is not included yet.
            pickle.dump([dict_data['count_uid'],
                         dict_data['count_fid'],
                         dict_data['count_aid'],
                         dict_data['count_songid'],
                         dict_data['count_singerid'],
                         self.user_action['device'].unique().shape[0]], f, pickle.HIGHEST_PROTOCOL)

        self.format_mapping(dict_data['uid_series']).to_csv(self.RAW + '/uid_mapping.csv', index=False)
        self.format_mapping(dict_data['fid_series']).to_csv(self.RAW + '/fid_mapping.csv', index=False)
        self.format_mapping(dict_data['aid_series']).to_csv(self.RAW + '/aid_mapping.csv', index=False)
        self.format_mapping(dict_data['songid_series']).to_csv(self.RAW + '/songid_mapping.csv', index=False)
        self.format_mapping(dict_data['singerid_series']).to_csv(self.RAW + '/singerid_mapping.csv', index=False)

        self.user_action.to_csv(self.RAW + '/user_action.csv', index=False)
        self.feed_info.to_csv(self.RAW + '/feed_info.csv', index=False)
        self.feed_emb.to_csv(self.RAW + '/feed_embeddings.csv', index=False)
        self.test.to_csv(self.RAW + '/test_a.csv', index=False)
if __name__ == '__main__':
    # Script entry point: load the source CSVs, re-index every id column,
    # then write the mapped tables, id mappings and statistics to disk.
    preprocess = DIN_preprocess()
    dict_data = preprocess.mapping_all_id()
    preprocess.save(dict_data)

In [None]:
import os
import sys
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(BASE_DIR, '../../config'))
from config import *
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Run this after Preprocess.py: it reads the re-indexed CSVs that step writes to DIN/raw.
class DIN_Process(object):
    """Feature-engineer the re-indexed DIN tables and write the model inputs.

    Reads the CSVs under ``FEATURE_PATH/DIN/raw`` and, via ``process``, writes
    ``data.csv``, ``test.csv`` and ``feed_embeddings<pca_dim>.npy`` under
    ``FEATURE_PATH/DIN/processed``.

    Args:
        pca_dim: number of PCA components kept for the feed embeddings;
            ``None`` keeps the embeddings at their original width.
        random_state: seed passed to PCA so repeated runs give the same
            projection when scikit-learn selects a randomized solver.
    """

    def __init__(self, pca_dim=64, random_state=0):
        self.pca_dim = pca_dim
        self.random_state = random_state
        self.RAW = FEATURE_PATH + '/DIN/raw'
        self.PROCESSED = FEATURE_PATH + '/DIN/processed'
        self.user_action, self.feed_info, self.feed_emb, self.test = self.load()
        # Columns kept in the train/test frames after merging in feed_info.
        self.use_cols = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id',
                         'videoplayseconds', 'device', 'date_',
                         'read_comment', 'like', 'comment', 'click_avatar', 'forward', 'follow', 'favorite']

    def load(self):
        """Read the four CSVs from ``self.RAW``.

        Returns:
            tuple: ``(user_action, feed_info, feed_emb, test)`` DataFrames.
        """
        user_action = pd.read_csv(self.RAW + '/user_action.csv')
        feed_info = pd.read_csv(self.RAW + '/feed_info.csv')
        feed_emb = pd.read_csv(self.RAW + '/feed_embeddings.csv')
        test = pd.read_csv(self.RAW + '/test_a.csv')
        return user_action, feed_info, feed_emb, test

    # End-to-end pipeline
    def process(self):
        """Run feature engineering, build the train/test frames and save everything.

        ``self.test`` and ``self.feed_emb`` are overwritten, so call this once
        per instance.
        """
        self.feature_eng(self.pca_dim)
        # Left joins keep every interaction row; feeds missing from feed_info
        # produce NaN, which is replaced with 0.
        # NOTE(review): selecting use_cols raises KeyError if a frame lacks any
        # of them (e.g. label columns in the test file) -- confirm test_a.csv has them.
        self.train = self.user_action.merge(self.feed_info, on='feedid', how='left')[self.use_cols].fillna(0)
        self.test = self.test.merge(self.feed_info, on='feedid', how='left')[self.use_cols].fillna(0)
        self.save_all()

    # Feature engineering
    def feature_eng(self, pca_dim):
        """Apply every feature transformation (embedding reduction, duration scaling)."""
        self.process_feed_emb(pca_dim)
        self.process_videoplayseconds()

    # Reduce the dimensionality of the feed embeddings
    def process_feed_emb(self, pca_dim=None):
        """Replace ``self.feed_emb`` with a float matrix ordered by feedid.

        Each ``feed_embedding`` cell is parsed as whitespace-separated floats.
        When ``pca_dim`` is given the matrix is projected with PCA. An all-zero
        row is prepended as row 0.
        """
        self.feed_emb = self.feed_emb.sort_values(by=['feedid'])
        # float() replaces the former eval(): same values, no execution of file contents.
        np_embeddings = np.stack([
            np.array([float(tok) for tok in emb_list.split()], dtype=np.float32)
            for emb_list in self.feed_emb['feed_embedding']
        ], axis=0)

        if pca_dim is None:
            result = np_embeddings
        else:
            # getattr keeps instances created without __init__ working.
            pca = PCA(n_components=pca_dim, random_state=getattr(self, 'random_state', 0))
            result = pca.fit_transform(np_embeddings)

        # NOTE(review): row 0 is padding, so the i-th smallest feedid sits at row
        # i + 1 while DIN_preprocess numbers feedids from 0 -- confirm that
        # consumers offset their lookups accordingly.
        zero_pad = np.zeros((1, result.shape[-1]), dtype=np.float32)
        self.feed_emb = np.concatenate([zero_pad, result], axis=0)

    # Normalisation
    def process_videoplayseconds(self):
        """Min-max scale ``feed_info['videoplayseconds']`` into [0, 1)."""
        eps = 1e-9  # avoids division by zero when every value is identical
        val = self.feed_info['videoplayseconds']
        norm = (val - val.min()) / (val.max() - val.min() + eps)
        self.feed_info['videoplayseconds'] = norm

    def save_all(self):
        """Write the train/test frames and the embedding matrix to ``self.PROCESSED``."""
        # Do not rely on the preprocessing step having created the directory.
        os.makedirs(self.PROCESSED, exist_ok=True)
        self.train.to_csv(self.PROCESSED + '/data.csv', index=False)
        self.test.to_csv(self.PROCESSED + '/test.csv', index=False)
        np.save(self.PROCESSED + '/feed_embeddings{0}.npy'.format(self.pca_dim), self.feed_emb)

if __name__ == '__main__':
    # Constructing DIN_Process only loads the raw CSVs; process() is what runs
    # the feature engineering and writes the outputs, so it has to be called.
    DIN_Process(pca_dim=64).process()

In [44]:
!pwd

/home/tione/notebook/wbdc2021-preliminary-48c2b28c233f4934b362696daef770e4/src/prepare


In [45]:
# Scratch cell: reload the re-indexed interaction and feed tables for inspection.
path = '../../data/wedata/feature/DIN/raw/'
u, f = [pd.read_csv(path + csv_name) for csv_name in ('user_action.csv', 'feed_info.csv')]

In [60]:
# NOTE(review): `s` is not assigned in any cell shown here; the displayed columns
# suggest a merge of the interaction and feed tables -- TODO restore the defining cell.
s

Unnamed: 0,userid,feedid,date_,device,read_comment,comment,like,play,stay,click_avatar,...,asr,bgm_song_id,bgm_singer_id,manual_keyword_list,machine_keyword_list,manual_tag_list,machine_tag_list,description_char,ocr_char,asr_char
0,0,67410,1,1,0,0,1,500,5366,0,...,104002 104002 104002 104002 104002 104002 1040...,13746,3557,,7978;9680,219;6;124,266 0.68073416;41 0.68073416,24513 8512 27464 24513 30027 73 23103 6243 287...,11458 1254 28079 20753 23952 10212 8740 32214 ...,23112 23112 23112 23112 23112 23112 23112 2311...
1,0,69691,1,1,0,0,0,250,1533,0,...,117252 44399 22662 4438,25160,17501,,4388;24079;19399;19166;22371;5615,308;6,266 0.39245349;261 0.39245349;8 3.8e-07;306 0....,,,26018 10043 30420 969
2,0,47444,1,1,0,0,0,750,1302,0,...,104002 104002 104002 104002 104002 104002 1040...,25160,17501,,5077;17881;5132;25008;61;17812,9;6;222;222,16 0.33405718;332 0.33405718;8 7e-08;306 0.0;2...,21277 21277 1873 3110 32556 22468 20201 12785 ...,,23112 23112 23112 23112 23112 23112 23112 2311...
3,0,10744,1,1,0,0,1,3750,5191,0,...,142955 80924 45012 25794 16650 8097,13097,5013,24390;14546;6355,17918;131;22816,228;12;159;6,12 0.867993;228 0.867993;8 6.3e-07;306 0.0;207...,7357 22439 31380 12371 18881 26018 2223 10045 ...,6244 28921 12837 199 30086,2203 26439 17681 17681 10226 8481 3703 1872
4,0,25764,1,1,0,0,0,250,800,0,...,150623 17721 287 66591 4438,22216,7900,9868;27100;11411,23257;20650;23257;8693;26486;25995;7144;19699,322;13;159;6,266 0.78305119;41 0.78305119;8 0.0;306 0.0;207...,17297 28718 10155 5967 17297 14126 26193 20860...,13519 14749 12 19791 20857 14040 22468 89 8481...,5990 15640 30749 29499 66 4956 7578 969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7317877,19999,68661,12,2,0,0,0,6369,6566,0,...,114413 125236 8109 44398 93563 101258 44399 11...,25018,8666,12091,12091;8420,328;13;159;212;147;101;6,13 0.69649464;328 0.69649464;8 0.0;306 0.0;207...,26067 13505 12837 32505 13505 1348 3882 6243 2...,20857 12371 30420 11192 11192 11192 20839 1980...,27077 10050 10205 10198 1882 10042 20857 6332 ...
7317878,19999,49070,12,2,0,0,0,13212,13708,0,...,104002 104002 104002 104002 104002 104002 1040...,15451,11704,22225;27025;17277;208,20621;4345;15026,340;235;159;145;6,23 0.27723727;202 0.27723727;8 0.0;306 0.0;207...,26067 26084 25985 32505 26084 22439 3879 23963...,862 28161 12678 1882 10043 22495 20857 22495 2...,23112 23112 23112 23112 23112 23112 23112 2311...
7317879,19999,48150,12,2,0,0,0,0,32215,0,...,16547 54606 128427 14759 33190 25290 85868 110...,25160,17501,22225;21855;11411,23854;15073;3963,340;235;159;6,235 0.9883284;340 0.9883284;8 0.00018654;306 0...,5967 15640 26018 6243 8311 1882 5967 15640 260...,1553 20096 5327 17283 17442 9019 27077 5967 15...,3775 28896 12371 10517 1415 8484 32379 14748 1...
7317880,19999,44211,12,2,0,0,0,0,5618,0,...,,25160,17501,7377;1188;21020;2298;8452,25911;2298;11496;6969;3963,170;239;159;169;213,18 0.47717199;143 0.47717199,5513 5977 18657 2606 18006 1873 21483 12783 24...,,


In [61]:
# Group the rows of `s` by user id.
a = s.groupby('userid')

In [67]:
# Materialise the per-user feedid groups; iterating a grouped column yields (key, Series) pairs.
b = list(a['feedid'])

In [74]:
# NOTE(review): `c` is not assigned in any cell shown here (presumably derived from `b`) -- TODO confirm.
c[:5]

[[[67410,
   69691,
   47444,
   10744,
   25764,
   28537,
   1081,
   65777,
   21148,
   92336,
   59132,
   66325,
   79512,
   18918,
   38474,
   79044,
   45301,
   24129,
   99169,
   53530,
   71097,
   15333,
   61413,
   52029,
   102701,
   100280,
   19941,
   20244,
   54012,
   28382,
   54254,
   42074,
   70640,
   9395,
   98529,
   59222,
   85886,
   57835,
   96627,
   100197,
   6546,
   53891,
   41489,
   78932,
   80098,
   59694,
   30509,
   13203,
   100763,
   25159,
   21951,
   66862,
   8898,
   15766,
   72803,
   51796,
   54350,
   12332,
   3809,
   53406,
   25616,
   83280,
   75014,
   105930,
   88080,
   105062,
   22798,
   24970,
   12815,
   32423,
   41443,
   50020,
   27772,
   78573,
   74510,
   62857,
   16234,
   105631,
   34014,
   80546,
   75200,
   40278,
   36225,
   34422,
   28994,
   105928,
   7840,
   27904,
   100481,
   86626,
   48670,
   63397,
   58421,
   88393,
   91159,
   13859,
   73983,
   7551,
   51748,
   48276