In [1]:
import pandas as pd
import time
from collections import defaultdict

In [2]:
%%time
train = pd.read_csv('./train_preprocess2.csv',
                    dtype={
                        'C1_fix':object,
                        'banner_pos_fix':object,
                        'site_id_fix_fix':object,
                        'site_domain_fix_fix':object,
                        'site_category_fix_fix':object,
                        'app_id_fix_fix':object,
                        'app_domain_fix_fix':object,
                        'app_category_fix_fix':object,
                        'device_id_fix_fix':object,
                        'device_ip_fix_fix':object,
                        'device_model_fix_fix':object,
                        'device_type_fix':object,
                        'device_conn_type_fix':object,
                        'C14_fix_fix':object,
                        'C15_fix':object,
                        'C16_fix':object,
                        'C17_fix_fix':object,
                        'C18_fix':object,
                        'C19_fix_fix':object,
                        'C20_fix_fix':object,
                        'C21_fix':object,
                        'hour_fix_fix':object
                    })
train.shape

CPU times: user 1min 56s, sys: 25.2 s, total: 2min 21s
Wall time: 2min 24s


(40428967, 24)

In [3]:
# Show the dtype each column of `train` ended up with after loading.
train.dtypes

id                       float64
click                      int64
C1_fix                    object
banner_pos_fix            object
site_id_fix_fix           object
site_domain_fix_fix       object
site_category_fix_fix     object
app_id_fix_fix            object
app_domain_fix_fix        object
app_category_fix_fix      object
device_id_fix_fix         object
device_ip_fix_fix         object
device_model_fix_fix      object
device_type_fix           object
device_conn_type_fix      object
C14_fix_fix               object
C15_fix                   object
C16_fix                   object
C17_fix_fix               object
C18_fix                   object
C19_fix_fix               object
C20_fix_fix               object
C21_fix                   object
hour_fix_fix              object
dtype: object

In [4]:
# Feature columns: every column of `train` except `id` and `click`
# (`click` is used as the label further down).
f_cols = list(train.columns)
for non_feature in ('id', 'click'):
    # list.remove raises ValueError if the column is missing, which is the
    # desired fail-fast behaviour here.
    f_cols.remove(non_feature)

In [5]:
%%time
# Count the distinct values in each feature column of the full training frame.
train[f_cols].nunique()

CPU times: user 1min 36s, sys: 19.7 s, total: 1min 55s
Wall time: 1min 57s


C1_fix                     4
banner_pos_fix             4
site_id_fix_fix           27
site_domain_fix_fix       25
site_category_fix_fix      4
app_id_fix_fix            17
app_domain_fix_fix         8
app_category_fix_fix       3
device_id_fix_fix          2
device_ip_fix_fix         14
device_model_fix_fix      95
device_type_fix            3
device_conn_type_fix       2
C14_fix_fix              116
C15_fix                    2
C16_fix                    3
C17_fix_fix               64
C18_fix                    2
C19_fix_fix               14
C20_fix_fix               22
C21_fix                   17
hour_fix_fix              24
dtype: int64

## 模型訓練

In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.preprocessing import StandardScaler

In [7]:
%%time
train_distinct = train.drop(columns=['id']).drop_duplicates()
train_distinct.shape

CPU times: user 1min 17s, sys: 33.2 s, total: 1min 51s
Wall time: 1min 54s


(3234119, 23)

In [22]:
# Label distribution of the deduplicated rows.
train_distinct['click'].value_counts()

0    2170735
1    1063384
Name: click, dtype: int64

In [33]:
%%time
train_sample = train_distinct.sample(frac=0.1, replace=True, random_state=42)
train_sample.shape

CPU times: user 1.19 s, sys: 40.8 ms, total: 1.23 s
Wall time: 1.24 s


(323412, 23)

In [34]:
# Label distribution of the sampled rows.
train_sample['click'].value_counts()

0    216686
1    106726
Name: click, dtype: int64

In [35]:
# Count the distinct values in each feature column of the sample, for
# comparison with the same counts on the full training frame.
train_sample[f_cols].nunique()

C1_fix                     4
banner_pos_fix             4
site_id_fix_fix           27
site_domain_fix_fix       25
site_category_fix_fix      4
app_id_fix_fix            17
app_domain_fix_fix         8
app_category_fix_fix       3
device_id_fix_fix          2
device_ip_fix_fix         14
device_model_fix_fix      95
device_type_fix            3
device_conn_type_fix       2
C14_fix_fix              116
C15_fix                    2
C16_fix                    3
C17_fix_fix               64
C18_fix                    2
C19_fix_fix               14
C20_fix_fix               22
C21_fix                   17
hour_fix_fix              24
dtype: int64

In [36]:
%%time
x = train_sample.drop(columns=['click'])
y = train_sample[['click']]
x = pd.get_dummies(x)
x.shape, y.shape

CPU times: user 1.76 s, sys: 181 ms, total: 1.94 s
Wall time: 1.96 s


((323412, 472), (323412, 1))

In [28]:
%%time
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

CPU times: user 3.69 s, sys: 550 ms, total: 4.24 s
Wall time: 4.27 s


In [29]:
%%time
para_search = {'criterion':['gini','entropy'],'max_depth':[i for i in range(10, 20, 2)]}
gs = GridSearchCV(dt(), param_grid=para_search, scoring='recall', verbose=1, n_jobs=-1)
gs.fit(x_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: user 1min 23s, sys: 1.42 s, total: 1min 25s
Wall time: 32min 52s


GridSearchCV(estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [20, 25, 30, 35, 40, 45]},
             scoring='recall', verbose=1)

In [32]:
# Best parameter combination found by the grid search and its score
# (the search was configured with scoring='recall').
# If the best max_depth sits on the edge of the searched range, consider
# widening the grid.
gs.best_params_, gs.best_score_

({'criterion': 'gini', 'max_depth': 45}, 0.3063778053963659)