In [1]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
import xgboost as xgb

In [2]:
n_rows=300000
df = pd.read_csv("train.csv", nrows=n_rows)

In [3]:
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [4]:
Y = df['click']

In [5]:
X = df.drop(['click','id', 'hour', 'device_id', 'device_ip'],axis=1)

In [6]:
X.head()

Unnamed: 0,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,44956a24,1,2,15706,320,50,1722,0,35,-1,79
1,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,711ee120,1,0,15704,320,50,1722,0,35,100084,79
2,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,8a4875bd,1,0,15704,320,50,1722,0,35,100084,79
3,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,6332421a,1,0,15706,320,50,1722,0,35,100084,79
4,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,07d7df22,779d90c2,1,0,18993,320,50,2161,0,35,-1,157


In [7]:
X.info() #only have categorical features

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   C1                300000 non-null  int64 
 1   banner_pos        300000 non-null  int64 
 2   site_id           300000 non-null  object
 3   site_domain       300000 non-null  object
 4   site_category     300000 non-null  object
 5   app_id            300000 non-null  object
 6   app_domain        300000 non-null  object
 7   app_category      300000 non-null  object
 8   device_model      300000 non-null  object
 9   device_type       300000 non-null  int64 
 10  device_conn_type  300000 non-null  int64 
 11  C14               300000 non-null  int64 
 12  C15               300000 non-null  int64 
 13  C16               300000 non-null  int64 
 14  C17               300000 non-null  int64 
 15  C18               300000 non-null  int64 
 16  C19               300000 non-null  int

In [8]:
X.shape

(300000, 19)

In [9]:
n_train = int(n_rows * 0.9)
X_train = X[:n_train]
Y_train = Y[:n_train]
X_test = X[n_train:]
Y_test = Y[n_train:]

In [10]:
ohe = OneHotEncoder(handle_unknown='ignore')

# Decision Tree

In [11]:
decisionTree = DecisionTreeClassifier(criterion='gini')
clf1 = make_pipeline(ohe,decisionTree)

In [12]:
print(clf1)

Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                ('decisiontreeclassifier', DecisionTreeClassifier())])


In [14]:
parameters = {
    'decisiontreeclassifier__max_depth': [2,4,8,16,32,64],
    'decisiontreeclassifier__min_samples_split': [2,5,10,25,50,100],
    'decisiontreeclassifier__class_weight' : [None, 'balanced']
}

grid_search = GridSearchCV(clf1, parameters, cv=5, n_jobs=-1, scoring='roc_auc',verbose=10)
grid_search.fit(X_train,Y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [15]:
print(grid_search.best_params_)
decision_tree_best = grid_search.best_estimator_
pos_prob = decision_tree_best.predict_proba(X_test)[:, 1]
print(f'The Decision Tree ROC AUC on testing set is: {roc_auc_score(Y_test,pos_prob):.3f}')

{'decisiontreeclassifier__class_weight': 'balanced', 'decisiontreeclassifier__max_depth': 32, 'decisiontreeclassifier__min_samples_split': 100}
The Decision Tree ROC AUC on testing set is: 0.748


# Random Forest

In [16]:
randomForest = RandomForestClassifier(criterion='gini')
clf2 = make_pipeline(ohe,randomForest)

In [17]:
print(clf2)

Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                ('randomforestclassifier', RandomForestClassifier())])


In [19]:
parameters = {
    'randomforestclassifier__n_estimators': [100,200],
    'randomforestclassifier__max_depth': [2,4,8,16,32],
    'randomforestclassifier__min_samples_split': [2,5,10,25,50],
    'randomforestclassifier__class_weight' : [None, 'balanced'],
    'randomforestclassifier__max_features': ['sqrt', 'log2', None],

}

grid_search = GridSearchCV(clf2, parameters, cv=3, n_jobs=-1 , scoring='roc_auc',verbose=10)
grid_search.fit(X_train,Y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [20]:
print(grid_search.best_params_)
decision_tree_best = grid_search.best_estimator_
pos_prob = decision_tree_best.predict_proba(X_test)[:, 1]
print(f'The Random Forest ROC AUC on testing set is: {roc_auc_score(Y_test,pos_prob):.3f}')

{'randomforestclassifier__class_weight': 'balanced', 'randomforestclassifier__max_depth': 32, 'randomforestclassifier__max_features': None, 'randomforestclassifier__min_samples_split': 50, 'randomforestclassifier__n_estimators': 100}
The Random Forest ROC AUC on testing set is: 0.765


# XGBoost

In [21]:
xgboost = xgb.XGBClassifier()
clf3 = make_pipeline(ohe,xgboost)

In [22]:
print(clf3)

Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                ('xgbclassifier',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=None, gpu_id=None,
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None, learning_rate=0.1,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=None, max_leaves=None,
                               min_child_weight=None, missing=nan,
                               monotone_cons

In [23]:
parameters = {
    'xgbclassifier__n_estimators': [200,500,1000,2000],
    'xgbclassifier__max_depth': [1,2,4,8],
    'xgbclassifier__learning_rate': [10**-1,10**-2,10**-3],

}

grid_search = GridSearchCV(clf3, parameters, cv=3, n_jobs=-1 , scoring='roc_auc',verbose=10)
grid_search.fit(X_train,Y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [24]:
print(grid_search.best_params_)
decision_tree_best = grid_search.best_estimator_
pos_prob = decision_tree_best.predict_proba(X_test)[:, 1]
print(f'The XGBoost ROC AUC on testing set is: {roc_auc_score(Y_test,pos_prob):.3f}')

{'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 8, 'xgbclassifier__n_estimators': 1000}
The XGBoost ROC AUC on testing set is: 0.772
