In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import warnings
from sklearn.model_selection import train_test_split
# warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')

%matplotlib inline

In [31]:
talking = pd.read_csv('data/param_tuning18m.csv', parse_dates = ['click_time'])

In [32]:
len(talking)

18000000

In [33]:
heads = talking.columns
print(heads)
for i in list(range(0,5)) + list(range(-1, -6, -1)):
    talking[heads[i]] = talking[heads[i]].astype('category')
talking.info()

Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time',
       'is_attributed'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000000 entries, 0 to 17999999
Data columns (total 8 columns):
ip                 category
app                category
device             category
os                 category
channel            category
click_time         category
attributed_time    category
is_attributed      category
dtypes: category(8)
memory usage: 380.5 MB


In [34]:
# down sample the train set
df_majority = talking[talking["is_attributed"]==0]
df_minority = talking[talking["is_attributed"]==1]

df_majority_downsampled = df_majority.sample(replace=False,    # sample without replacement
                               frac=len(df_minority)/len(df_majority),
                               random_state=42) # reproducible results

balanced_train = pd.concat([df_majority_downsampled, df_minority])
# balanced_train = balanced_train.sample(frac=1)

# balanced_train = balanced_train.drop(columns=['click_time','attributed_time'])
balanced_train["is_attributed"] = balanced_train["is_attributed"].astype("int64")

balanced_train['day'] = balanced_train['click_time'].dt.day
balanced_train['hour'] = balanced_train['click_time'].dt.hour
balanced_train['minute'] = balanced_train['click_time'].dt.minute
balanced_train['second'] = balanced_train['click_time'].dt.second
balanced_train.drop(['click_time'], axis=1, inplace=True)
balanced_train.head()
X = balanced_train.drop(['is_attributed', 'attributed_time'], axis=1)
y = balanced_train['is_attributed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [35]:
X_train.shape
y_train.shape
X_test.shape
y_test.shape

(26763,)

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve
from sklearn import datasets, neighbors, linear_model

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score

import seaborn as sns; sns.set()
import matplotlib.pyplot as plt


In [37]:
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
def grid_search_rfc(X_train, X_test, y_train, y_test):
    rfc = RandomForestClassifier()   
    grid_params = {'n_estimators': [10, 50, 100],
                   'max_features': ['auto'],
                   'max_depth' : range(5,10),
                   'criterion' :['gini', 'entropy']}
    gs = GridSearchCV(estimator=rfc,  
                      param_grid=grid_params,
                      scoring='roc_auc',
                      cv=5)
    gs.fit(X_train, y_train)
    y_pred = gs.predict(X_test)
    roc_auc = roc_auc_score(y_pred, y_test)
    f1 = f1_score(y_pred, y_test)
    best_params = gs.best_params_
    return roc_auc, f1, best_params


In [40]:
grid_search_rfc(X_train, X_test, y_train, y_test)

(0.9156371461561522,
 0.900346411020704,
 {'criterion': 'gini',
  'max_depth': 9,
  'max_features': 'auto',
  'n_estimators': 100})

In [41]:
def grid_search_gbc(X_train, X_test, y_train, y_test):
    gbc = GradientBoostingClassifier()
    grid_params = {'n_estimators': [10, 50, 100],
                   'learning_rate': [0.01, 0.05, 0.1],
                   'criterion' :['friedman_mse', 'mse']}
    gs = GridSearchCV(estimator=gbc,  
                      param_grid=grid_params,
                      scoring='roc_auc',
                      cv=5)
    gs.fit(X_train, y_train)
    y_pred = gs.predict(X_test)
    roc_auc = roc_auc_score(y_pred, y_test)
    f1 = f1_score(y_pred, y_test)
    best_params = gs.best_params_
    return roc_auc, f1, best_params


In [42]:
grid_search_gbc(X_train, X_test, y_train, y_test)

(0.9160196912128187,
 0.905075859877671,
 {'criterion': 'friedman_mse', 'learning_rate': 0.1, 'n_estimators': 100})

In [46]:
def grid_search_xgb(X_train, X_test, y_train, y_test):
    xg = XGBClassifier()
    grid_params = {'n_estimators': [100, 200, 400],
                   'learning_rate': [0.1, 0.2, 0.4]}
    gs = GridSearchCV(estimator=xg,  
                      param_grid=grid_params,
                      scoring='roc_auc',
                      cv=5)
    gs.fit(X_train.values, y_train)
    roc_auc = gs.score(X_test.values, y_test)
    best_params = gs.best_params_
    return roc_auc, best_params

In [47]:
grid_search_xgb(X_train, X_test, y_train, y_test)


(0.9672566340249282, {'learning_rate': 0.2, 'n_estimators': 400})

In [48]:
def grid_search_lgb(X_train, X_test, y_train, y_test):
    lg = LGBMClassifier()    
    grid_params = {'n_estimators': [10, 100, 400],
                   'learning_rate': [0.01, 0.1, 0.5]}              
    gs = GridSearchCV(estimator=lg,  
                      param_grid=grid_params,
                      scoring='roc_auc',
                      cv=5)
    gs.fit(X_train.values, y_train)
    roc_auc = gs.score(X_test.values, y_test)
    best_params = gs.best_params_    
    return roc_auc, best_params

In [None]:
grid_search_lgb(X_train, X_test, y_train, y_test)

In [49]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [50]:
y_predict = rfc.predict(X_test)
roc_auc_score(y_test, y_predict)

0.9104387660889159

In [51]:
df_tmp = pd.read_csv('data/test_supplement.csv')
df_tmp.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,43570,3,1,18,379,2017-11-09 14:23:39
1,1,80528,3,1,13,379,2017-11-09 14:23:51
2,2,32323,3,1,13,379,2017-11-09 14:25:57
3,3,42887,3,1,17,379,2017-11-09 14:26:03
4,4,119289,58,1,30,120,2017-11-09 14:26:41


In [52]:
X.head()

Unnamed: 0,ip,app,device,os,channel,day,hour,minute,second
1056799,115078,2,1,19,122,8,4,15,26
8558776,119592,23,1,13,153,6,22,56,13
9891634,232,8,1,19,145,8,13,16,5
16500658,182925,14,1,43,134,8,23,56,26
5963873,114276,3,1,17,130,8,0,43,48


In [53]:
clf = XGBClassifier(learning_rate = 0.2, n_estimators =400).fit(X.values,y)
X.values

array([[115078, 2, 1, ..., 4, 15, 26],
       [119592, 23, 1, ..., 22, 56, 13],
       [232, 8, 1, ..., 13, 16, 5],
       ...,
       [10274, 20, 1, ..., 22, 48, 58],
       [296896, 10, 1, ..., 10, 46, 41],
       [241180, 19, 0, ..., 9, 56, 36]], dtype=object)

In [54]:
df_tmp2 = pd.read_csv('data/test.csv', parse_dates = ['click_time'])
df_tmp2.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


In [55]:
df_sub = pd.read_csv('data/sample_submission.csv')
df_sub.head()

Unnamed: 0,click_id,is_attributed
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [56]:
df_tmp2['day'] = df_tmp2['click_time'].dt.day
df_tmp2['hour'] = df_tmp2['click_time'].dt.hour
df_tmp2['minute'] = df_tmp2['click_time'].dt.minute
df_tmp2['second'] = df_tmp2['click_time'].dt.second
df_tmp2.drop(['click_time','click_id'], axis=1, inplace=True)

In [57]:
df_tmp2.head()

Unnamed: 0,ip,app,device,os,channel,day,hour,minute,second
0,5744,9,1,3,107,10,4,0,0
1,119901,9,1,3,466,10,4,0,0
2,72287,21,1,19,128,10,4,0,0
3,78477,15,1,13,111,10,4,0,0
4,123080,12,1,13,328,10,4,0,0


In [None]:
y_predict_final = clf.predict(df_tmp2.values)

In [None]:
df_sub['is_attributed']=y_predict_final

In [None]:
df_sub.to_csv('result.csv',sep=',', encoding='utf-8', index=False)

In [None]:
df_sub.shape

# df_sub is submitted to kaggle competition, result is about 0.9