In [1]:
# Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Model Classifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Model selection
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, balanced_accuracy_score, roc_auc_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
# from sklearn.metrics import (precision_recall_curve, auc, roc_curve, recall_score, classification_report, precision_recall_fscore_support)

# Model persistence
import pickle

# Others
import warnings
import gc

# Visualization
import seaborn as sns
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import warnings
from random import sample
# warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')

%matplotlib inline

# Load test data and feature engineer it

In [3]:
final_test = pd.read_csv('./projdata/one_time_test2m.csv', parse_dates = ['click_time'])

In [4]:
def fea_eng(Y):
    X = Y
    X['day'] = X['click_time'].dt.day
    X['hour'] = X['click_time'].dt.hour
    X['minute'] = X['click_time'].dt.minute
    X['second'] = X['click_time'].dt.second
    heads = X.columns
    print(heads)
    for i in list(range(0,5)) + [6]:
        X[heads[i]] = X[heads[i]].astype('category')
    return X

In [5]:
final = fea_eng(final_test)

Index(['Unnamed: 0', 'ip', 'app', 'device', 'os', 'channel', 'click_time',
       'attributed_time', 'is_attributed', 'day', 'hour', 'minute', 'second'],
      dtype='object')


In [6]:
# down sample the train set
df_majority = final[final.is_attributed==0]
df_minority = final[final.is_attributed==1]

df_majority_downsampled = df_majority.sample(replace=False,    # sample without replacement
                                frac=len(df_minority)/len(df_majority),     
                                random_state=42) # reproducible results

balanced = pd.concat([df_majority_downsampled, df_minority])
balanced = balanced.sample(frac=1)

# balanced = balanced.drop(columns=['click_time','attributed_time'])
balanced = final.drop(columns=['click_time','attributed_time'])
balanced['is_attributed'] = balanced['is_attributed'].astype('int64')

In [7]:
balanced.columns

Index(['Unnamed: 0', 'ip', 'app', 'device', 'os', 'channel', 'is_attributed',
       'day', 'hour', 'minute', 'second'],
      dtype='object')

In [8]:
test_fe = balanced

In [9]:
test_fe[['day', 'hour', 'minute', 'second']] = test_fe[['day', 'hour', 'minute', 'second']].astype('float')
test_fe[['ip', 'app', 'device', 'os', 'channel']] = test_fe[['ip', 'app', 'device', 'os', 'channel']].astype('category')

In [10]:
# Define all the groupby transformations
groupby_aggregations = [
    
    # V1 - GroupBy Features #
    #########################    
    # Variance in day, for ip-app-channel
    {'groupby': ['ip','app','channel'], 'select': 'day', 'agg': 'var'},
    # Variance in hour, for ip-app-os
    {'groupby': ['ip','app','os'], 'select': 'hour', 'agg': 'var'},
    # Count, for ip-day-hour
    {'groupby': ['ip','day','hour'], 'select': 'channel', 'agg': 'count'},
    # Count, for ip-app
    {'groupby': ['ip', 'app'], 'select': 'channel', 'agg': 'count'},        
    # Count, for ip-app-os
    {'groupby': ['ip', 'app', 'os'], 'select': 'channel', 'agg': 'count'},
    # Count, for ip-app-day-hour
    {'groupby': ['ip','app','day','hour'], 'select': 'channel', 'agg': 'count'},
    # Mean hour, for ip-app-channel
    {'groupby': ['ip','app','channel'], 'select': 'hour', 'agg': 'mean'}, 
    
    # V2 - GroupBy Features #
    #########################
    # Average clicks on app by distinct users; is it an app they return to?
    {'groupby': ['app'], 
     'select': 'ip', 
     'agg': lambda x: np.divide(float(len(x)),float(len(x.unique()))), 
     'agg_name': 'AvgViewPerDistinct'
    },
    # How popular is the app or channel?
    {'groupby': ['app'], 'select': 'channel', 'agg': 'count'},
    {'groupby': ['channel'], 'select': 'app', 'agg': 'count'},
    
    # V3 - GroupBy Features                                              #
    # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #
    ###################################################################### 
    {'groupby': ['ip'], 'select': 'channel', 'agg': 'nunique'}, 
    {'groupby': ['ip'], 'select': 'app', 'agg': 'nunique'}, 
    {'groupby': ['ip','day'], 'select': 'hour', 'agg': 'nunique'}, 
    {'groupby': ['ip','app'], 'select': 'os', 'agg': 'nunique'}, 
    {'groupby': ['ip'], 'select': 'device', 'agg': 'nunique'}, 
    {'groupby': ['app'], 'select': 'channel', 'agg': 'nunique'}, 
    {'groupby': ['ip', 'device', 'os'], 'select': 'app', 'agg': 'nunique'}, 
    {'groupby': ['ip','device','os'], 'select': 'app', 'agg': 'cumcount'}, 
    {'groupby': ['ip'], 'select': 'app', 'agg': 'cumcount'}, 
    {'groupby': ['ip'], 'select': 'os', 'agg': 'cumcount'}, 
    {'groupby': ['ip','day','channel'], 'select': 'hour', 'agg': 'var'}    
]



new_features =[]
# Apply all the groupby transformations
for spec in groupby_aggregations:
    
    # Name of the aggregation we're applying
    agg_name = spec['agg_name'] if 'agg_name' in spec else spec['agg']
    
    # Name of new feature
    new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), agg_name, spec['select'])
    new_features.append(new_feature)
    
    # Info
    print("Grouping by {}, and aggregating {} with {}".format(
        spec['groupby'], spec['select'], agg_name
    ))
    
    # Unique list of features to select
    all_features = list(set(spec['groupby'] + [spec['select']]))
    
    # Perform the groupby
    gp = test_fe[all_features]. \
        groupby(spec['groupby'])[spec['select']]. \
        agg(spec['agg']). \
        reset_index(). \
        rename(index=str, columns={spec['select']: new_feature})
        
    # Merge back to X_total
    if 'cumcount' == spec['agg']:
        test_fe[new_feature] = gp[0].values
    else:
        # train_fe[spec['groupby']] = train_fe[spec['groupby']].astype('category')
        test_fe = test_fe.merge(gp, on=spec['groupby'], how='left')
        
     # Clear memory
    del gp
    gc.collect()

test_fe.head()

Grouping by ['ip', 'app', 'channel'], and aggregating day with var
Grouping by ['ip', 'app', 'os'], and aggregating hour with var
Grouping by ['ip', 'day', 'hour'], and aggregating channel with count
Grouping by ['ip', 'app'], and aggregating channel with count
Grouping by ['ip', 'app', 'os'], and aggregating channel with count
Grouping by ['ip', 'app', 'day', 'hour'], and aggregating channel with count
Grouping by ['ip', 'app', 'channel'], and aggregating hour with mean
Grouping by ['app'], and aggregating ip with AvgViewPerDistinct
Grouping by ['app'], and aggregating channel with count
Grouping by ['channel'], and aggregating app with count
Grouping by ['ip'], and aggregating channel with nunique
Grouping by ['ip'], and aggregating app with nunique
Grouping by ['ip', 'day'], and aggregating hour with nunique
Grouping by ['ip', 'app'], and aggregating os with nunique
Grouping by ['ip'], and aggregating device with nunique
Grouping by ['app'], and aggregating channel with nunique
Grou

Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,minute,...,ip_nunique_app,ip_day_nunique_hour,ip_app_nunique_os,ip_nunique_device,app_nunique_channel,ip_device_os_nunique_app,ip_device_os_cumcount_app,ip_cumcount_app,ip_cumcount_os,ip_day_channel_var_hour
0,179330694,32069,9,1,13,134,0,9.0,14.0,14.0,...,5,4,2,1,38,3,0,0,0,
1,54470391,147012,29,1,19,101,0,7.0,14.0,14.0,...,1,1,1,1,16,1,0,0,0,
2,100694702,84896,11,1,13,469,0,8.0,9.0,51.0,...,46,24,20,9,15,25,0,0,0,25.563636
3,181781137,55957,14,1,13,467,0,9.0,14.0,58.0,...,23,14,9,2,33,18,0,0,0,4.5
4,100834427,847,12,1,22,245,0,8.0,9.0,54.0,...,11,7,4,2,30,4,0,0,0,13.766667


In [11]:
test_fe['ip_app_channel_var_day'] = test_fe['ip_app_channel_var_day'].fillna(0) 
test_fe['ip_app_os_var_hour'] = test_fe['ip_app_os_var_hour'].fillna(0) 
test_fe['ip_day_channel_var_hour'] = test_fe['ip_day_channel_var_hour'].fillna(0)
# test_fe = test_fe.fillna(0)

In [12]:
test_fe.head()

Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,minute,...,ip_nunique_app,ip_day_nunique_hour,ip_app_nunique_os,ip_nunique_device,app_nunique_channel,ip_device_os_nunique_app,ip_device_os_cumcount_app,ip_cumcount_app,ip_cumcount_os,ip_day_channel_var_hour
0,179330694,32069,9,1,13,134,0,9.0,14.0,14.0,...,5,4,2,1,38,3,0,0,0,0.0
1,54470391,147012,29,1,19,101,0,7.0,14.0,14.0,...,1,1,1,1,16,1,0,0,0,0.0
2,100694702,84896,11,1,13,469,0,8.0,9.0,51.0,...,46,24,20,9,15,25,0,0,0,25.563636
3,181781137,55957,14,1,13,467,0,9.0,14.0,58.0,...,23,14,9,2,33,18,0,0,0,4.5
4,100834427,847,12,1,22,245,0,8.0,9.0,54.0,...,11,7,4,2,30,4,0,0,0,13.766667


In [14]:
X = test_fe.drop(['is_attributed'],axis=1)
y = test_fe[['is_attributed']]
X.columns

Index(['Unnamed: 0', 'ip', 'app', 'device', 'os', 'channel', 'day', 'hour',
       'minute', 'second', 'ip_app_channel_var_day', 'ip_app_os_var_hour',
       'ip_day_hour_count_channel', 'ip_app_count_channel',
       'ip_app_os_count_channel', 'ip_app_day_hour_count_channel',
       'ip_app_channel_mean_hour', 'app_AvgViewPerDistinct_ip',
       'app_count_channel', 'channel_count_app', 'ip_nunique_channel',
       'ip_nunique_app', 'ip_day_nunique_hour', 'ip_app_nunique_os',
       'ip_nunique_device', 'app_nunique_channel', 'ip_device_os_nunique_app',
       'ip_device_os_cumcount_app', 'ip_cumcount_app', 'ip_cumcount_os',
       'ip_day_channel_var_hour'],
      dtype='object')

# Load training data and feature engineer it

In [15]:
talking = pd.read_csv('./projdata/param_tuning18m.csv') 
talking['click_time'] = pd.to_datetime(talking['click_time'], errors='coerce')

In [16]:
# transform click_time into day, hour, minute and second
talking['day'] = talking['click_time'].dt.day
talking['hour'] = talking['click_time'].dt.hour
talking['minute'] = talking['click_time'].dt.minute
talking['second'] = talking['click_time'].dt.second
talking.head()

Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second
0,92895533,83604,64,1,13,459,2017-11-08 07:18:18,,0,8,7,18,18
1,22237114,28021,3,1,13,409,2017-11-07 03:54:37,,0,7,3,54,37
2,173801870,44456,3,1,15,480,2017-11-09 12:39:05,,0,9,12,39,5
3,44049831,134076,9,1,13,258,2017-11-07 10:55:26,,0,7,10,55,26
4,80845475,138561,6,1,13,459,2017-11-08 03:33:21,,0,8,3,33,21


In [17]:
# ip, app, device, os, channel, is_attributed and the time features we just created
# are encoded to be integers, we tranform them into categorical data
heads = talking.columns
print(heads)
for i in list(range(0,5)) + [6]:
    talking[heads[i]] = talking[heads[i]].astype('category')
talking.info()

Index(['Unnamed: 0', 'ip', 'app', 'device', 'os', 'channel', 'click_time',
       'attributed_time', 'is_attributed', 'day', 'hour', 'minute', 'second'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000000 entries, 0 to 17999999
Data columns (total 13 columns):
Unnamed: 0         category
ip                 category
app                category
device             category
os                 category
channel            int64
click_time         category
attributed_time    object
is_attributed      int64
day                int64
hour               int64
minute             int64
second             int64
dtypes: category(6), int64(6), object(1)
memory usage: 2.0+ GB


In [18]:
# down sample the train set
df_majority = talking[talking.is_attributed==0]
df_minority = talking[talking.is_attributed==1]

df_majority_downsampled = df_majority.sample(replace=False,    # sample without replacement
                                frac=len(df_minority)/len(df_majority),     
                                random_state=42) # reproducible results

balanced_train = pd.concat([df_majority_downsampled, df_minority])
balanced_train = balanced_train.sample(frac=1)

balanced_train = balanced_train.drop(columns=['click_time','attributed_time'])
balanced_train['is_attributed'] = balanced_train['is_attributed'].astype('int64')

In [19]:
# split 20m into 2m train data and 18m validation data.
train = balanced_train.sample(frac=0.1, replace=False, random_state = 42)
valid = balanced_train.drop(train.index)

In [21]:
train_fe = train

In [22]:
train_fe[['day', 'hour', 'minute', 'second']] = train_fe[['day', 'hour', 'minute', 'second']].astype('float')
train_fe[['ip', 'app', 'device', 'os', 'channel']] = train_fe[['ip', 'app', 'device', 'os', 'channel']].astype('category')

In [23]:
# Define all the groupby transformations
groupby_aggregations = [
    
    # V1 - GroupBy Features #
    #########################    
    # Variance in day, for ip-app-channel
    {'groupby': ['ip','app','channel'], 'select': 'day', 'agg': 'var'},
    # Variance in hour, for ip-app-os
    {'groupby': ['ip','app','os'], 'select': 'hour', 'agg': 'var'},
    # Count, for ip-day-hour
    {'groupby': ['ip','day','hour'], 'select': 'channel', 'agg': 'count'},
    # Count, for ip-app
    {'groupby': ['ip', 'app'], 'select': 'channel', 'agg': 'count'},        
    # Count, for ip-app-os
    {'groupby': ['ip', 'app', 'os'], 'select': 'channel', 'agg': 'count'},
    # Count, for ip-app-day-hour
    {'groupby': ['ip','app','day','hour'], 'select': 'channel', 'agg': 'count'},
    # Mean hour, for ip-app-channel
    {'groupby': ['ip','app','channel'], 'select': 'hour', 'agg': 'mean'}, 
    
    # V2 - GroupBy Features #
    #########################
    # Average clicks on app by distinct users; is it an app they return to?
    {'groupby': ['app'], 
     'select': 'ip', 
     'agg': lambda x: np.divide(float(len(x)),float(len(x.unique()))), 
     'agg_name': 'AvgViewPerDistinct'
    },
    # How popular is the app or channel?
    {'groupby': ['app'], 'select': 'channel', 'agg': 'count'},
    {'groupby': ['channel'], 'select': 'app', 'agg': 'count'},
    
    # V3 - GroupBy Features                                              #
    # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #
    ###################################################################### 
    {'groupby': ['ip'], 'select': 'channel', 'agg': 'nunique'}, 
    {'groupby': ['ip'], 'select': 'app', 'agg': 'nunique'}, 
    {'groupby': ['ip','day'], 'select': 'hour', 'agg': 'nunique'}, 
    {'groupby': ['ip','app'], 'select': 'os', 'agg': 'nunique'}, 
    {'groupby': ['ip'], 'select': 'device', 'agg': 'nunique'}, 
    {'groupby': ['app'], 'select': 'channel', 'agg': 'nunique'}, 
    {'groupby': ['ip', 'device', 'os'], 'select': 'app', 'agg': 'nunique'}, 
    {'groupby': ['ip','device','os'], 'select': 'app', 'agg': 'cumcount'}, 
    {'groupby': ['ip'], 'select': 'app', 'agg': 'cumcount'}, 
    {'groupby': ['ip'], 'select': 'os', 'agg': 'cumcount'}, 
    {'groupby': ['ip','day','channel'], 'select': 'hour', 'agg': 'var'}    
]



new_features =[]
# Apply all the groupby transformations
for spec in groupby_aggregations:
    
    # Name of the aggregation we're applying
    agg_name = spec['agg_name'] if 'agg_name' in spec else spec['agg']
    
    # Name of new feature
    new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), agg_name, spec['select'])
    new_features.append(new_feature)
    
    # Info
    print("Grouping by {}, and aggregating {} with {}".format(
        spec['groupby'], spec['select'], agg_name
    ))
    
    # Unique list of features to select
    all_features = list(set(spec['groupby'] + [spec['select']]))
    
    # Perform the groupby
    gp = train_fe[all_features]. \
        groupby(spec['groupby'])[spec['select']]. \
        agg(spec['agg']). \
        reset_index(). \
        rename(index=str, columns={spec['select']: new_feature})
        
    # Merge back to X_total
    if 'cumcount' == spec['agg']:
        train_fe[new_feature] = gp[0].values
    else:
        # train_fe[spec['groupby']] = train_fe[spec['groupby']].astype('category')
        train_fe = train_fe.merge(gp, on=spec['groupby'], how='left')
        
     # Clear memory
    del gp
    gc.collect()

train_fe.head()

Grouping by ['ip', 'app', 'channel'], and aggregating day with var
Grouping by ['ip', 'app', 'os'], and aggregating hour with var
Grouping by ['ip', 'day', 'hour'], and aggregating channel with count
Grouping by ['ip', 'app'], and aggregating channel with count
Grouping by ['ip', 'app', 'os'], and aggregating channel with count
Grouping by ['ip', 'app', 'day', 'hour'], and aggregating channel with count
Grouping by ['ip', 'app', 'channel'], and aggregating hour with mean
Grouping by ['app'], and aggregating ip with AvgViewPerDistinct
Grouping by ['app'], and aggregating channel with count
Grouping by ['channel'], and aggregating app with count
Grouping by ['ip'], and aggregating channel with nunique
Grouping by ['ip'], and aggregating app with nunique
Grouping by ['ip', 'day'], and aggregating hour with nunique
Grouping by ['ip', 'app'], and aggregating os with nunique
Grouping by ['ip'], and aggregating device with nunique
Grouping by ['app'], and aggregating channel with nunique
Grou

Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,minute,...,ip_nunique_app,ip_day_nunique_hour,ip_app_nunique_os,ip_nunique_device,app_nunique_channel,ip_device_os_nunique_app,ip_device_os_cumcount_app,ip_cumcount_app,ip_cumcount_os,ip_day_channel_var_hour
0,17665809,50433,12,1,19,178,0,7.0,2.0,27.0,...,2,1,1,1,24,1,0,0,0,
1,18064910,72357,11,1,13,319,0,7.0,2.0,35.0,...,2,2,1,1,10,1,0,0,0,
2,152809159,298842,5,1,7,113,1,9.0,6.0,6.0,...,1,1,1,1,2,1,0,0,0,
3,179835675,298180,10,1,49,113,1,9.0,14.0,23.0,...,1,1,1,1,5,1,0,0,0,
4,63769839,266359,19,980,38,213,1,7.0,18.0,35.0,...,1,1,1,1,8,1,0,0,0,


In [24]:
base_features = ['ip', 'app', 'device', 'os', 'channel']

In [25]:
train_fe['ip_app_channel_var_day'] = train_fe['ip_app_channel_var_day'].fillna(0) 
train_fe['ip_app_os_var_hour'] = train_fe['ip_app_os_var_hour'].fillna(0) 
train_fe['ip_day_channel_var_hour'] = train_fe['ip_day_channel_var_hour'].fillna(0)

In [26]:
train_gbc = train_fe[['channel','ip','channel_count_app','app','app_count_channel','is_attributed']]
# valid_gbc = train_fe[['channel','ip','channel_count_app','app','app_count_channel','is_attributed']]
x_full_train, x_full_test, y_full_train, y_full_test = train_test_split(train_fe.drop(['is_attributed'],axis=1),train_fe['is_attributed'], test_size=0.2, random_state=42)
x_reduced_train, x_reduced_test, y_reduced_train, y_reduced_test = train_test_split(train_gbc.drop(['is_attributed'],axis=1),train_gbc['is_attributed'], test_size=0.2, random_state=42)

In [27]:
train_gbc.head()

Unnamed: 0,channel,ip,channel_count_app,app,app_count_channel,is_attributed
0,178,50433,143,12,615,0
1,319,72357,22,11,141,0
2,113,298842,542,5,283,1
3,113,298180,542,10,338,1
4,213,266359,1292,19,1314,1


In [28]:
gbc_full = GradientBoostingClassifier()
gbc_reduced = GradientBoostingClassifier()

In [29]:
x_full_train.head()

Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,day,hour,minute,second,...,ip_nunique_app,ip_day_nunique_hour,ip_app_nunique_os,ip_nunique_device,app_nunique_channel,ip_device_os_nunique_app,ip_device_os_cumcount_app,ip_cumcount_app,ip_cumcount_os,ip_day_channel_var_hour
6599,105226872,275356,84,0,24,101,8.0,11.0,12.0,42.0,...,1,1,1,1,1,1,0,0,0,0.0
5948,44110841,99160,9,1,13,466,7.0,10.0,56.0,27.0,...,1,1,1,1,23,1,0,0,0,0.0
1420,118866975,201182,9,1,13,232,8.0,15.0,1.0,54.0,...,5,5,2,2,23,1,0,0,0,0.0
2389,105071056,68702,35,1,13,274,8.0,11.0,9.0,51.0,...,1,1,1,1,2,1,0,0,0,0.0
2404,104457349,175035,18,1,15,121,8.0,10.0,58.0,44.0,...,1,1,1,1,9,1,0,0,0,0.0


In [30]:
gbc_full.fit(x_full_train, y_full_train)
roc_auc_score(gbc_full.predict(x_full_test),y_full_test)

0.9163272405287458

In [31]:
gbc_reduced.fit(x_reduced_train, y_reduced_train)
roc_auc_score(gbc_reduced.predict(x_reduced_test),y_reduced_test)

0.9157955328375449

In [32]:
roc_auc_score(gbc_full.predict(x_full_test),y_full_test)

0.9163272405287458

In [33]:
train_lgb = train_fe[['ip',
 'os',
 'channel_count_app',
 'minute',
 'app',
 'second',
 'channel',
 'hour',
 'app_count_channel',
 'app_AvgViewPerDistinct_ip',
 'is_attributed']]
valid_lgb = train_fe[['ip',
 'os',
 'channel_count_app',
 'minute',
 'app',
 'second',
 'channel',
 'hour',
 'app_count_channel',
 'app_AvgViewPerDistinct_ip',
 'is_attributed']]

In [34]:
X_train = train_lgb.drop(['is_attributed'],axis=1)
y_train = train_lgb[['is_attributed']]
X_test = valid_lgb.drop(['is_attributed'],axis=1)
y_test = valid_lgb[['is_attributed']]

In [35]:
lgb_full = LGBMClassifier()
lgb_reduced = LGBMClassifier()

In [36]:
roc_auc_score(lgb_full.fit(X_train.values, y_train).predict(X_test.values), y_test)
roc_auc_score(lgb_reduced.fit(X_train.values, y_train).predict(X_test.values), y_test)

0.9506644949626099

In [37]:
train_rfc = train_fe[['app',
 'channel_count_app',
 'app_nunique_channel',
 'app_count_channel',
 'ip',
 'channel',
 'app_AvgViewPerDistinct_ip',
 'second',
 'minute',
 'os',
 'is_attributed']]
valid_rfc = train_fe[['app',
 'channel_count_app',
 'app_nunique_channel',
 'app_count_channel',
 'ip',
 'channel',
 'app_AvgViewPerDistinct_ip',
 'second',
 'minute',
 'os',
 'is_attributed']]

In [38]:
X_train = train_rfc.drop(['is_attributed'],axis=1)
y_train = train_rfc[['is_attributed']]
X_test = valid_rfc.drop(['is_attributed'],axis=1)
y_test = valid_rfc[['is_attributed']]

In [39]:
roc_auc_score(gbc_full.predict(X),y)

0.5024880623300575