In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load Datasets

In [2]:
n_rows = 300000
df = pd.read_csv("train.csv", nrows= n_rows)
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [3]:
df.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')

In [4]:
df.shape

(300000, 24)

In [5]:
df.isna().sum()

id                  0
click               0
hour                0
C1                  0
banner_pos          0
site_id             0
site_domain         0
site_category       0
app_id              0
app_domain          0
app_category        0
device_id           0
device_ip           0
device_model        0
device_type         0
device_conn_type    0
C14                 0
C15                 0
C16                 0
C17                 0
C18                 0
C19                 0
C20                 0
C21                 0
dtype: int64

In [6]:
y = df["click"]
X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1)

In [7]:
df.dtypes

id                  float64
click                 int64
hour                  int64
C1                    int64
banner_pos            int64
site_id              object
site_domain          object
site_category        object
app_id               object
app_domain           object
app_category         object
device_id            object
device_ip            object
device_model         object
device_type           int64
device_conn_type      int64
C14                   int64
C15                   int64
C16                   int64
C17                   int64
C18                   int64
C19                   int64
C20                   int64
C21                   int64
dtype: object

# Split Data
#### Chronogical so we need to split manually not randomly

In [8]:
n_train = int(n_rows* 0.9)
X_train = X[:n_train]
Y_train = y[:n_train]
X_test = X[n_train:]
Y_test = y[n_train:]

# One Hot encoding

In [9]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown="ignore")

In [10]:
X_train_enc = enc.fit_transform(X_train)
X_test_enc = enc.transform(X_test)

In [11]:
print(f"X_train encoded shape: {X_train_enc.shape}\n X_test encoded shape: {X_test_enc.shape}")

X_train encoded shape: (270000, 8204)
 X_test encoded shape: (30000, 8204)


# Model

In [12]:
from sklearn.tree import DecisionTreeClassifier
parameters = {"max_depth": [3,10, 20, 30,None]}
decision_tree = DecisionTreeClassifier(criterion="gini", min_samples_split=30)

In [13]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(decision_tree, parameters, n_jobs=-1, cv=5, scoring="roc_auc")

In [14]:
grid_search.fit(X_train_enc, Y_train)

0,1,2
,estimator,DecisionTreeC...ples_split=30)
,param_grid,"{'max_depth': [3, 10, ...]}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,30
,min_samples_split,30
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [15]:
from sklearn.metrics import roc_auc_score
decision_tree_best = grid_search.best_estimator_
predict_proba = decision_tree_best.predict_proba(X_test_enc)
proba = predict_proba[:, 1]
print("The ROC AUC on testing set is : {:.3f}".format(roc_auc_score(Y_test, proba)))


The ROC AUC on testing set is : 0.731


## Random Forest model

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
random_forest = RandomForestClassifier(n_estimators=100, criterion="gini", min_samples_split=30, n_jobs=-1)

In [18]:
parameters = {"max_depth": [3,10, 20, 30,None]}
grid_search = GridSearchCV(random_forest,n_jobs=-1,param_grid=parameters,  cv=5, scoring="roc_auc")
grid_search.fit(X_train_enc, Y_train)

0,1,2
,estimator,"RandomForestC...30, n_jobs=-1)"
,param_grid,"{'max_depth': [3, 10, ...]}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,30
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [19]:
random_forest_best = grid_search.best_estimator_
predict_proba = random_forest_best.predict_proba(X_test_enc)[:, 1]
print(f"ROC AUC on this test set is {roc_auc_score(Y_test, predict_proba):.3f}")

ROC AUC on this test set is 0.758


###  Import XGBoost to build one tree at a time and combines the results along the way

In [20]:
import xgboost as xgb
model = xgb.XGBClassifier(learning_rate=0.1, max_depth=10, n_estimators=1000)


In [21]:
model.fit(X_train_enc, Y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [22]:
predict_proba = model.predict_proba(X_test_enc)[:, 1]
print(f"ROC AUC on this test set is {roc_auc_score(Y_test, predict_proba):.3f}")

ROC AUC on this test set is 0.770


As we see the ROC AUC with xgboost is better than the random forest model