# Data Reformat
reformat unstructure data into structure
- <a href='#1'>1. task1</a> 
- <a href='#2'>2. task2</a> 
- <a href='#3'>3. task3</a>

In [28]:
%load_ext autoreload
%autoreload 2
import sys
import os 

import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

sys.path.append('../')
import conf
from utils import (
                    correct_column_type_by_value_range
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
# global settings
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns',1000)
pd.set_option('display.width',100)
sns.set(rc={'figure.figsize':(11,4)})

In [30]:
# global variables
DEFAULT_MISSING_VALUE = 0
FONT = fm.FontProperties(fname = os.path.join(conf.LIB_DIR,'simsun.ttc'))

In [31]:
# functions
def __dummy():
    pass



In [32]:
! du -sh ../data/train_preliminary/*

4.0K	../data/train_preliminary/README
75M	../data/train_preliminary/ad.csv
553M	../data/train_preliminary/click_log.csv
9.4M	../data/train_preliminary/user.csv


In [33]:
! du -sh ../data/test/*

4.0K	../data/test/README
79M	../data/test/ad.csv
654M	../data/test/click_log.csv


In [7]:
# train - ad.csv
train_ad_df = pd.read_csv(os.path.join(conf.ROUND_ONE_TRAIN_DATA_DIR, 'ad.csv'))

In [8]:
train_ad_df.head(200)

Unnamed: 0,creative_id,ad_id,product_id,product_category,advertiser_id,industry
0,1,1,\N,5,381,78
1,4,4,\N,5,108,202
2,7,7,\N,5,148,297
3,8,8,\N,5,713,213
4,9,9,\N,5,695,213
...,...,...,...,...,...,...
195,355,556,1374,2,20,248
196,357,383,\N,9,5797,155
197,358,559,\N,18,5932,114
198,360,562,29002,3,238,322


In [9]:
train_ad_df.dtypes

creative_id          int64
ad_id                int64
product_id          object
product_category     int64
advertiser_id        int64
industry            object
dtype: object

In [10]:
train_ad_df.describe()

Unnamed: 0,creative_id,ad_id,product_category,advertiser_id
count,2481135.0,2481135.0,2481135.0,2481135.0
mean,2192008.0,1888107.0,7.805932,27416.24
std,1285497.0,1099764.0,6.770557,15346.31
min,1.0,1.0,1.0,2.0
25%,1076370.0,939192.5,2.0,14974.0
50%,2191948.0,1889017.0,5.0,25197.0
75%,3304420.0,2839458.0,18.0,38254.0
max,4445718.0,3812200.0,18.0,62965.0


In [11]:
train_ad_df.dtypes

creative_id          int64
ad_id                int64
product_id          object
product_category     int64
advertiser_id        int64
industry            object
dtype: object

In [12]:
len(train_ad_df['product_id'].unique())

33273

In [13]:
round(len(train_ad_df[train_ad_df.product_id=='\\N'])/len(train_ad_df),2)

0.37

In [14]:
len(train_ad_df)

2481135

In [15]:
mask = train_ad_df.product_id=='\\N'
train_ad_df[mask]

Unnamed: 0,creative_id,ad_id,product_id,product_category,advertiser_id,industry
0,1,1,\N,5,381,78
1,4,4,\N,5,108,202
2,7,7,\N,5,148,297
3,8,8,\N,5,713,213
4,9,9,\N,5,695,213
...,...,...,...,...,...,...
2481107,4445665,3812148,\N,18,19750,74
2481108,4445666,3812149,\N,18,19750,74
2481109,4445667,3812150,\N,18,19750,74
2481118,4445690,3812172,\N,5,16192,288


In [16]:
# map nan value into 0 and change data type into int
train_ad_df['product_id'] = train_ad_df['product_id'].apply(lambda x: np.nan if x =='\\N' else int(x))
train_ad_df['industry'] = train_ad_df['industry'].apply(lambda x: np.nan if x=='\\N' else int(x))

In [17]:
train_ad_df.dtypes

creative_id           int64
ad_id                 int64
product_id          float64
product_category      int64
advertiser_id         int64
industry            float64
dtype: object

In [18]:
### test - ad.csv 
test_ad_df = pd.read_csv(os.path.join(conf.TEST_DATA_DIR, 'ad.csv'))

In [19]:
len(test_ad_df)

2618159

In [20]:
test_ad_df['product_id'] = test_ad_df['product_id'].apply(lambda x: np.nan if x =='\\N' else int(x))
test_ad_df['industry'] = test_ad_df['industry'].apply(lambda x: np.nan if x=='\\N' else int(x))

In [21]:
# train - click_log.csv
train_click_log_df = pd.read_csv(os.path.join(conf.ROUND_ONE_TRAIN_DATA_DIR, 'click_log.csv'))

In [22]:
train_click_log_df.head()

Unnamed: 0,time,user_id,creative_id,click_times
0,9,30920,567330,1
1,65,30920,3072255,1
2,56,30920,2361327,1
3,6,309204,325532,1
4,59,309204,2746730,1


In [23]:
train_click_log_df.dtypes

time           int64
user_id        int64
creative_id    int64
click_times    int64
dtype: object

In [24]:
train_click_log_df.shape

(30082771, 4)

In [25]:
# test - click_log.csv
test_click_log_df = pd.read_csv(os.path.join(conf.TEST_DATA_DIR, 'click_log.csv'))

In [26]:
len(test_click_log_df)

33585512

In [34]:
# train - user.csv
train_user_df = pd.read_csv(os.path.join(conf.ROUND_ONE_TRAIN_DATA_DIR, 'user.csv'))

In [35]:
train_user_df.head()

Unnamed: 0,user_id,age,gender
0,1,4,1
1,2,10,1
2,3,7,2
3,4,5,1
4,5,4,1


In [36]:
len(train_user_df)

900000

In [37]:
train_user_df.dtypes

user_id    int64
age        int64
gender     int64
dtype: object

In [39]:
train_user_df.head()

Unnamed: 0,user_id,age,gender
0,1,4,1
1,2,10,1
2,3,7,2
3,4,5,1
4,5,4,1


In [41]:
train_user_df['y'] = list(zip(train_user_df['gender'], train_user_df['age']))

In [45]:
label_map_dict = {0: (1, 1),
                  1: (1, 2),
                  2: (1, 3),
                  3: (1, 4),
                  4: (1, 5),
                  5: (1, 6),
                  6: (1, 7),
                  7: (1, 8),
                  8: (1, 9),
                  9: (1, 10),
                  10: (2, 1),
                  11: (2, 2),
                  12: (2, 3),
                  13: (2, 4),
                  14: (2, 5),
                  15: (2, 6),
                  16: (2, 7),
                  17: (2, 8),
                  18: (2, 9),
                  19: (2, 10)}

In [49]:
label_map_dict.items()

dict_items([(0, (1, 1)), (1, (1, 2)), (2, (1, 3)), (3, (1, 4)), (4, (1, 5)), (5, (1, 6)), (6, (1, 7)), (7, (1, 8)), (8, (1, 9)), (9, (1, 10)), (10, (2, 1)), (11, (2, 2)), (12, (2, 3)), (13, (2, 4)), (14, (2, 5)), (15, (2, 6)), (16, (2, 7)), (17, (2, 8)), (18, (2, 9)), (19, (2, 10))])

In [52]:
reverse_label_map_dict = dict([(value,key)for key, value in label_map_dict.items()])

In [53]:
reverse_label_map_dict

{(1, 1): 0,
 (1, 2): 1,
 (1, 3): 2,
 (1, 4): 3,
 (1, 5): 4,
 (1, 6): 5,
 (1, 7): 6,
 (1, 8): 7,
 (1, 9): 8,
 (1, 10): 9,
 (2, 1): 10,
 (2, 2): 11,
 (2, 3): 12,
 (2, 4): 13,
 (2, 5): 14,
 (2, 6): 15,
 (2, 7): 16,
 (2, 8): 17,
 (2, 9): 18,
 (2, 10): 19}

In [54]:
train_user_df['y'] = train_user_df['y'].apply(lambda x : reverse_label_map_dict[x])

In [55]:
train_user_df.head()

Unnamed: 0,user_id,age,gender,y
0,1,4,1,3
1,2,10,1,9
2,3,7,2,16
3,4,5,1,4
4,5,4,1,3


In [56]:
train_user_df.to_feather(os.path.join(conf.DATA_DIR, 'label_round_one_df.feather'))

In [31]:
# merge train dfs
raw_train_df = train_click_log_df.merge(train_ad_df,how='left',on='creative_id')

In [32]:
raw_train_df = raw_train_df.merge(train_user_df,how='left',on='user_id')

In [27]:
raw_train_df.head()

NameError: name 'raw_train_df' is not defined

In [34]:
mask = raw_train_df['industry']==None
raw_train_df[mask]

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry,age,gender


In [35]:
# generate label by combining age + gender * 10
raw_train_df['y'] = raw_train_df['gender'] * 10 + raw_train_df['age']

In [None]:
raw_train_df

In [36]:
raw_train_df.head()

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry,age,gender,y
0,9,30920,567330,1,504423,30673.0,3,32638,319.0,2,1,12
1,65,30920,3072255,1,2642300,1261.0,2,6783,6.0,2,1,12
2,56,30920,2361327,1,2035918,1261.0,2,6783,6.0,2,1,12
3,6,309204,325532,1,292523,27081.0,3,32066,242.0,6,1,16
4,59,309204,2746730,1,2362208,,18,14682,88.0,6,1,16


In [37]:
correct_column_type_by_value_range(raw_train_df)

2020-05-08 20:35:50,339 - utils.utils - INFO - correct_column_type_by_value_range开始
2020-05-08 20:36:09,679 - utils.utils - INFO - col_types: time                    int8
user_id                int64
creative_id            int64
click_times            int16
ad_id                  int32
product_id           float64
product_category    category
advertiser_id          int32
industry            category
age                     int8
gender                  int8
y                       int8
dtype: object
2020-05-08 20:36:09,682 - utils.utils - INFO - correct_column_type_by_value_range已完成，共用时0:00:19


In [38]:
raw_train_df.dtypes

time                    int8
user_id                int64
creative_id            int64
click_times            int16
ad_id                  int32
product_id           float64
product_category    category
advertiser_id          int32
industry            category
age                     int8
gender                  int8
y                       int8
dtype: object

In [39]:
raw_train_df.reset_index(drop=True, inplace=True)

In [40]:
raw_train_df.to_feather(os.path.join(conf.DATA_DIR,'raw_train_round_one_df.feather'))

In [None]:
# merge test dfs
raw_test_df = test_click_log_df.merge(test_ad_df,how='left',on='creative_id')

In [None]:
correct_column_type_by_value_range(raw_test_df)

In [None]:
raw_test_df.head()

In [None]:
raw_test_df.dtypes

In [None]:
raw_test_df.reset_index(drop=True, inplace=True)

In [None]:
raw_test_df.to_feather(os.path.join(conf.DATA_DIR, 'raw_test_df.feather'))