In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [2]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Rules applied per column:

    * object / string columns are converted to ``category``;
    * NumPy signed-integer columns are cast to the narrowest of
      int8/int16/int32/int64 whose range contains the column's min and max
      (bounds are inclusive, so a column spanning -128..127 becomes int8);
    * NumPy float columns are cast to float16 or float32 when min and max
      fit, otherwise float64 (this also covers all-NaN and infinite columns,
      whose comparisons are False);
    * every other dtype (bool, unsigned, datetime, categorical, nullable
      extension dtypes, ...) is left untouched, which also makes the
      function safe to call again on an already reduced frame.

    Caution: the float16/float32 casts only check the value *range*; they
    reduce precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    signed_int_types = (np.int8, np.int16, np.int32, np.int64)
    small_float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Text columns become categoricals.
        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed ints and floats are downcast. Anything else
        # either has no meaningful min/max (unordered categoricals raise) or
        # would be silently turned into a float by the range checks below.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in signed_int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No break: empty column (NaN min/max) - keep the dtype as is.
        else:
            for candidate in small_float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    return df


In [3]:
# Read the 31 daily training tables (train_table_01 .. train_table_31),
# shrinking each one as it is loaded, then stack them into a single frame.
data = [
    reduce_mem_usage(pd.read_csv('../data/train_table_%02d.gz'%day))
    for day in tqdm(range(1, 32))
]
data = pd.concat(data)

# Append the test table, reduced the same way, below the training rows.
test_part = reduce_mem_usage(pd.read_csv('../data/test_table.gz'))
data = pd.concat([data, test_part])
del test_part

100%|██████████| 31/31 [07:23<00:00, 14.04s/it]


In [None]:
# Fit a label encoder on the `link` column and store the resulting integer
# codes next to it as `id`.
encoder = LabelEncoder()
link_codes = encoder.fit_transform(data['link'])
data['id'] = link_codes

In [None]:
# Persist the link -> id lookup. `data` holds one row per observation, so the
# pairs are de-duplicated first: map.csv then carries one row per distinct
# (link, id) pair instead of one per observation. A dict rebuilt from the
# file is identical either way.
data[['link','id']].drop_duplicates().to_csv('map.csv', index=False)

In [219]:
## First rebuild the per-day matrix (original note: 15583 * 720; the array is
## allocated as 15584 x 800 for each of 32 day slots).
## Every cell starts at -1. np.full fills the array directly, so no second
## full-size temporary is created as with `np.zeros(...) - 1`; the result is
## the same float64 array.
mat = np.full((32, 15584, 800), -1.0)

In [220]:
# Read map.csv back into a DataFrame.
idmap = pd.read_csv(filepath_or_buffer='map.csv')

In [221]:
# Replace the DataFrame with a plain dict keyed by `link` and valued by `id`;
# if a link occurs more than once, its last row wins.
idmap = {
    link_key: link_id
    for link_key, link_id in zip(idmap['link'], idmap['id'])
}

In [222]:
# Replay the 31 training files into `mat`, indexed as
# mat[day index, encoded link id, time index].
for date in tqdm(range(0, 31)):
    data = pd.read_csv('../data/train_table_%02d.gz'%(date+1))
    data['id'] = data['link'].apply(lambda key: idmap[key])
    for row_idx, rec in data.iterrows():
        link_id = int(rec['id'])
        target = rec['label']
        # A label of -1 is not written.
        if target != -1:
            mat[date, link_id, int(rec['predict_time'])] = target
        # The five "current" observations belong to this file's own day.
        for slot in range(1, 6):
            status_now = int(rec['current_feature_status_%d'%slot])
            mat[date, link_id, int(rec['current_feature_time_%d'%slot])] = status_now
        # history1..history4 are written 28, 21, 14 and 7 days back.
        for w in range(1, 5):
            past_day = date - (5 - w) * 7
            if past_day < 0:
                # That day lies before the first training day.
                continue
            for slot in range(1, 6):
                t_hist = int(rec['history%d_feature_time_%d'%(w, slot)])
                existing = mat[past_day, link_id, t_hist]
                observed = int(rec['history%d_feature_status_%d'%(w, slot)])
                if existing == -1 or existing == observed:
                    mat[past_day, link_id, t_hist] = observed
                else:
                    # Cell already holds a different value: report it and keep it.
                    print(existing, observed, w)









  0%|          | 0/31 [00:00<?, ?it/s][A[A[A[A[A[A[A[A







  3%|▎         | 1/31 [02:09<1:04:46, 129.54s/it][A[A[A[A[A[A[A[A







  6%|▋         | 2/31 [04:20<1:02:49, 129.99s/it][A[A[A[A[A[A[A[A







 10%|▉         | 3/31 [06:31<1:00:49, 130.33s/it][A[A[A[A[A[A[A[A







 13%|█▎        | 4/31 [08:44<59:01, 131.15s/it]  [A[A[A[A[A[A[A[A







 16%|█▌        | 5/31 [10:58<57:13, 132.07s/it][A[A[A[A[A[A[A[A







 19%|█▉        | 6/31 [13:06<54:24, 130.58s/it][A[A[A[A[A[A[A[A







 23%|██▎       | 7/31 [15:05<50:55, 127.33s/it][A[A[A[A[A[A[A[A







 26%|██▌       | 8/31 [19:13<1:02:39, 163.48s/it][A[A[A[A[A[A[A[A







 29%|██▉       | 9/31 [23:21<1:09:10, 188.67s/it][A[A[A[A[A[A[A[A







 32%|███▏      | 10/31 [27:27<1:12:08, 206.11s/it][A[A[A[A[A[A[A[A







 35%|███▌      | 11/31 [31:37<1:13:02, 219.10s/it][A[A[A[A[A[A[A[A







 39%|███▊      | 12/31 [35:47<1:12:

In [240]:
# Replay the test table into `mat` at day index 31, indexed as
# mat[day index, encoded link id, time index].
date = 31
data = pd.read_csv('../data/test_table.gz')
data['id'] = data['link'].apply(lambda key: idmap[key])
# Number of history cells that already held a different value (left unchanged).
count = 0
for row_idx, rec in data.iterrows():
    link_id = int(rec['id'])
    target = rec['label']
    # A label of -1 is not written.
    if target != -1:
        mat[date, link_id, int(rec['predict_time'])] = target
    # The five "current" observations belong to day index 31 itself.
    for slot in range(1, 6):
        status_now = int(rec['current_feature_status_%d'%slot])
        mat[date, link_id, int(rec['current_feature_time_%d'%slot])] = status_now
    # history1..history4 are written 28, 21, 14 and 7 days back.
    for w in range(1, 5):
        past_day = date - (5 - w) * 7
        if past_day < 0:
            continue
        for slot in range(1, 6):
            t_hist = int(rec['history%d_feature_time_%d'%(w, slot)])
            existing = mat[past_day, link_id, t_hist]
            observed = int(rec['history%d_feature_status_%d'%(w, slot)])
            if existing == -1 or existing == observed:
                mat[past_day, link_id, t_hist] = observed
            else:
                count += 1

In [256]:
np.save('mat.npy',mat)