In [1]:
import pandas as pd
import time
from collections import defaultdict

In [2]:
%%time
train = pd.read_csv('./train_preprocess2.csv',
                    dtype={
                        'C1_fix':object,
                        'banner_pos_fix':object,
                        'site_id_fix_fix':object,
                        'site_domain_fix_fix':object,
                        'site_category_fix_fix':object,
                        'app_id_fix_fix':object,
                        'app_domain_fix_fix':object,
                        'app_category_fix_fix':object,
                        'device_id_fix_fix':object,
                        'device_ip_fix_fix':object,
                        'device_model_fix_fix':object,
                        'device_type_fix':object,
                        'device_conn_type_fix':object,
                        'C14_fix_fix':object,
                        'C15_fix':object,
                        'C16_fix':object,
                        'C17_fix_fix':object,
                        'C18_fix':object,
                        'C19_fix_fix':object,
                        'C20_fix_fix':object,
                        'C21_fix':object,
                        'hour_fix_fix':object
                    })
train.shape

CPU times: user 2min, sys: 25.6 s, total: 2min 26s
Wall time: 2min 29s


(40428967, 24)

In [3]:
# Show the dtype pandas assigned to each column after loading.
train.dtypes

id                       float64
click                      int64
C1_fix                    object
banner_pos_fix            object
site_id_fix_fix           object
site_domain_fix_fix       object
site_category_fix_fix     object
app_id_fix_fix            object
app_domain_fix_fix        object
app_category_fix_fix      object
device_id_fix_fix         object
device_ip_fix_fix         object
device_model_fix_fix      object
device_type_fix           object
device_conn_type_fix      object
C14_fix_fix               object
C15_fix                   object
C16_fix                   object
C17_fix_fix               object
C18_fix                   object
C19_fix_fix               object
C20_fix_fix               object
C21_fix                   object
hour_fix_fix              object
dtype: object

In [4]:
# Feature columns: every column of `train` except `id` and `click`.
f_cols = list(train.columns)
for non_feature in ('id', 'click'):
    f_cols.remove(non_feature)

In [5]:
%%time
train[f_cols].nunique()

CPU times: user 1min 35s, sys: 32.3 s, total: 2min 7s
Wall time: 2min 10s


C1_fix                      4
banner_pos_fix              6
site_id_fix_fix           475
site_domain_fix_fix       411
site_category_fix_fix       6
app_id_fix_fix            277
app_domain_fix_fix         12
app_category_fix_fix        4
device_id_fix_fix          23
device_ip_fix_fix        7638
device_model_fix_fix     1578
device_type_fix             3
device_conn_type_fix        2
C14_fix_fix               652
C15_fix                     5
C16_fix                     6
C17_fix_fix               152
C18_fix                     2
C19_fix_fix                17
C20_fix_fix                45
C21_fix                    20
hour_fix_fix               24
dtype: int64

## 模型訓練

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.preprocessing import StandardScaler

In [4]:
%%time
train_distinct = train.drop(columns=['id']).drop_duplicates()
train_distinct.shape

CPU times: user 1min 22s, sys: 36.9 s, total: 1min 58s
Wall time: 2min 2s


(10570684, 23)

In [5]:
# Class balance of `click` after de-duplication.
train_distinct['click'].value_counts()

0    7608952
1    2961732
Name: click, dtype: int64

In [6]:
%%time
train_sample = train_distinct.sample(frac=0.1, replace=True, random_state=42)
train_sample.shape

CPU times: user 4.87 s, sys: 102 ms, total: 4.97 s
Wall time: 5.05 s


(1057068, 23)

In [7]:
# Class balance of `click` in the subsample.
train_sample['click'].value_counts()

0    760412
1    296656
Name: click, dtype: int64

In [10]:
# Distinct values per feature column in the subsample.
# Relies on `f_cols` having been built from `train.columns` in an earlier cell.
train_sample.loc[:, f_cols].nunique()

C1_fix                      4
banner_pos_fix              6
site_id_fix_fix           474
site_domain_fix_fix       410
site_category_fix_fix       6
app_id_fix_fix            273
app_domain_fix_fix         12
app_category_fix_fix        4
device_id_fix_fix          22
device_ip_fix_fix        7616
device_model_fix_fix     1578
device_type_fix             3
device_conn_type_fix        2
C14_fix_fix               652
C15_fix                     5
C16_fix                     6
C17_fix_fix               152
C18_fix                     2
C19_fix_fix                17
C20_fix_fix                45
C21_fix                    20
hour_fix_fix               24
dtype: int64

In [11]:
# Preview the first five sampled rows.
train_sample.head(n=5)

Unnamed: 0,click,C1_fix,banner_pos_fix,site_id_fix_fix,site_domain_fix_fix,site_category_fix_fix,app_id_fix_fix,app_domain_fix_fix,app_category_fix_fix,device_id_fix_fix,...,device_conn_type_fix,C14_fix_fix,C15_fix,C16_fix,C17_fix_fix,C18_fix,C19_fix_fix,C20_fix_fix,C21_fix,hour_fix_fix
22319365,1,1005,1,0a742914,510bd839,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,0,22147,DNP,DNP,2551,2,167,-1,23,13
22880423,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,0,20108,DNP,DNP,2299,2,1327,-1,52,16
38135508,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,0,22261,DNP,DNP,2545,DNP,431,-1,221,11
13940040,0,1005,0,DNP,DNP,DNP,DNP,DNP,DNP,a99f214a,...,0,22188,DNP,DNP,DNP,2,167,-1,23,5
7392491,1,1005,1,DNP,DNP,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,0,DNP,DNP,DNP,DNP,DNP,35,DNP,221,12


In [8]:
# Split the subsample into the feature frame `x` and the target `y`.
# `y` is deliberately selected with a list so it stays a one-column DataFrame.
x = train_sample.drop('click', axis=1)
y = train_sample.loc[:, ['click']]

In [9]:
%%time
x = pd.get_dummies(x)
x.shape, y.shape

CPU times: user 4min 8s, sys: 31.9 s, total: 4min 39s
Wall time: 4min 50s


((1057068, 11333), (1057068, 1))

In [11]:
%%time
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

CPU times: user 1min 46s, sys: 55.6 s, total: 2min 41s
Wall time: 2min 52s


In [None]:
%%time
para_search = {'criterion':['gini','entropy'],'max_depth':[i for i in range(10, 20, 2)]}
gs = GridSearchCV(dt(), param_grid=para_search, scoring='recall', verbose=1, n_jobs=-1)
gs.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
