## Introduction

The goal of this notebook is to demonstrate the use of the various features we have seen in the previous notebooks with a LightGBM classifier.

At the end of this notebook, you will be able to:
 - Use all features we have created in our previous notebooks
 - Train a LightGBM classifier
 
Hope you will enjoy this notebook and remember to come back to us and ask questions.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import time
import gc
gc.enable()

## Load the data (only 10 million rows)

In [2]:
# Columns parsed from the CSV; everything else in the file is ignored
cols = ['ip', 'app', 'os', 'device', 'channel', 'click_time', 'is_attributed']
# Compact unsigned integer types for the numeric columns
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8'
}
# Path of the zipped training file
file_path = "../input/train.csv.zip"
# Load the FIRST 10 million rows: nrows counts from the top of the file.
# To load the last rows instead, replace nrows with a skiprows range over the leading
# data rows (row 0 is the header), as the original author suggested:
#     skiprows=range(1, 184903890 - 10000000)
train_df = pd.read_csv(file_path, usecols=cols, dtype=dtypes, nrows=10000000)

In [3]:
# Report the memory footprint of train_df in GiB (bytes / 2**30).
# NOTE(review): memory_usage() is called without deep=True, so the payload of any
# object-dtype column is presumably not counted — confirm if an exact figure matters.
'Memory usage for train_df %5.3f GB' % (train_df.memory_usage().sum() / 2 ** 30)

'Memory usage for train_df 0.196 GB'

## Create AverageManager class

Here is a modified version of the AverageManager that simply returns counts instead of averages.

The biggest issue with target averaging is overfitting: because it uses the target information, it can be viewed as a leak.

In [4]:
class AverageManager(object):
    """
    Accumulate per-group target statistics and encode rows with them.

    For every feature group given at construction time the manager keeps a table holding the
    target ``sum`` and the row ``count`` of each distinct key. In this modified version,
    ``apply_averages`` returns the group ``count`` (number of occurrences) rather than the
    target average.
    """
    def __init__(self, features, target):
        """
        Init an average manager for the given features and the specified target
        :param features : list of feature groups; each group is a list (or tuple) of column
                          names the data will be grouped by
        :param target : name of the target column in the DataFrames
        :raises ValueError : if one of the groups is not a list or a tuple (e.g. a bare string)
        """
        # Check features : a bare string would silently be split into characters by tuple()
        for f_ in features:
            if not isinstance(f_, (list, tuple)):
                raise ValueError('Features are expected to be provided as a list')
        # averages maps each feature group to its sum/count table (None until fitted)
        self.averages = {
            tuple(f_): None for f_ in features
        }
        # Prior contains the running target sum and number of samples seen so far
        self.prior = {'cum_sum': 0.0, 'nb_samples': 0.0}
        # Contains the name of the target column in the DataFrames
        self.target = target

    def update_averages(self, df):
        """
        Update the stored statistics using the samples available in df.

        Can be called several times (e.g. once per chunk): sums and counts are accumulated.
        :param df : DataFrame containing the grouping columns and the target column
        """
        # update prior
        self.prior['cum_sum'] += df[self.target].sum()
        self.prior['nb_samples'] += df.shape[0]

        for f_ in self.averages.keys():
            # Target sum and row count for every distinct key of this group
            the_group = df.groupby(list(f_))[self.target].agg(['sum', 'count'])

            if self.averages[f_] is None:
                self.averages[f_] = the_group
            else:
                # pandas .add method makes sure keys that are not in both the_group and the
                # current table take a value of 0 before the addition takes place
                self.averages[f_] = the_group.add(self.averages[f_], fill_value=0.0)

            del the_group
            gc.collect()

    def apply_averages(self, df):
        """
        Encode df with the occurrence count of each row's key, one column per feature group.

        Neither df nor the fitted tables are modified.
        :param df : DataFrame containing the grouping columns
        :return : DataFrame indexed like df with one float32 column per group, named
                  '_' + '_'.join(group). Keys never seen by update_averages are filled with
                  the global target mean (cum_sum / nb_samples), as in the original version.
        :raises ValueError : if update_averages has not been called yet
        """
        # Check averages are fitted before doing any work
        for stats in self.averages.values():
            if stats is None:
                raise ValueError('Averages have not been fitted yet')

        # Fill value for unseen keys; guarded so an empty fit does not divide by zero
        nb_samples = self.prior['nb_samples']
        prior = self.prior['cum_sum'] / nb_samples if nb_samples else np.nan

        encoded = pd.DataFrame(index=df.index)
        for f_, stats in self.averages.items():
            keys = list(f_)
            feat_name = '_' + '_'.join(keys)
            # Look the counts up with a left join on the key columns: row order and index of
            # df are preserved and no temporary string key has to be built
            counts = stats['count'].rename(feat_name)
            looked_up = df[keys].join(counts, on=keys)[feat_name]
            # Assign the filled result explicitly (no chained in-place fillna)
            encoded[feat_name] = looked_up.astype(np.float32).fillna(prior).values
            del counts, looked_up
            gc.collect()

        return encoded
        

def add_str_feature(df_, features, name):
    """
    Add (in place) a string column `name` to df_ that concatenates the given columns.

    Each value of each column in `features` is converted with str() and followed by an
    underscore, so the key for values 1 and 3 is '1_3_' (note the trailing underscore).

    The original author reports that building the key with series additions is roughly
    twice as fast as the row-wise equivalent:
    df[feat_name] = df[list(f_)].apply(lambda row: '_'.join(row.astype(str)), axis=1, raw=True)
    """
    # Start from an empty string and append one "<value>_" piece per column
    df_[name] = ''
    for column in features:
        piece = df_[column].apply(str)
        df_[name] = df_[name] + piece + '_'

In [5]:
def get_working_features():
    """Return the feature groups to aggregate on: five single columns, then four column pairs."""
    single_columns = [[column] for column in ('ip', 'device', 'channel', 'os', 'app')]
    column_pairs = [
        ['app', 'channel'],
        ['app', 'os'],
        ['os', 'channel'],
        ['ip', 'app'],
    ]
    return single_columns + column_pairs


def get_target_feature():
    """Return the name of the target column used throughout the notebook."""
    target_column = 'is_attributed'
    return target_column

## Train LightGBM with raw features

In [6]:
# 5-fold shuffled cross-validation with a fixed seed so the splits are reproducible
folds = KFold(n_splits=5, shuffle=True, random_state=1)

# Only the five raw categorical identifiers are used as inputs in this experiment
raw_features = ['ip', 'app', 'channel', 'os', 'device']
target_col = get_target_feature()

# Evaluate a LightGBM classifier fold by fold, keeping the out-of-fold predictions
scores = []
oof_predictions = np.zeros(train_df.shape[0])
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_df)):
    clf = LGBMClassifier(num_leaves=31)
    clf.fit(
        train_df[raw_features].iloc[trn_idx].values,
        train_df[target_col].iloc[trn_idx].values
    )
    # Column 1 of predict_proba is the probability of the positive class
    val_proba = clf.predict_proba(train_df[raw_features].iloc[val_idx].values)
    oof_predictions[val_idx] = val_proba[:, 1]
    fold_auc = roc_auc_score(train_df[target_col].iloc[val_idx].values, oof_predictions[val_idx])
    scores.append(fold_auc)
    # Fold numbers are printed 0-based in this cell
    print("Fold %2d AUC score : %.6f" % (n_fold, fold_auc))
    del clf, val_proba
    gc.collect()

# AUC over all out-of-fold predictions, plus mean / std of the per-fold AUCs
oof_score = roc_auc_score(train_df[target_col], oof_predictions)
print("LGBM with raw features OOF AUC %.6f AVG AUC %.6f +/- %.6f"
      % (oof_score, np.mean(scores), np.std(scores)))

Fold  0 AUC score : 0.950219
Fold  1 AUC score : 0.953623
Fold  2 AUC score : 0.949385
Fold  3 AUC score : 0.949467
Fold  4 AUC score : 0.948756
LGBM with raw features OOF AUC 0.950156 AVG AUC 0.950290 +/- 0.001730


## Train LightGBM with raw features and count features

In [7]:
# Build the manager for the configured feature groups and target
count_manager = AverageManager(features=get_working_features(), target=get_target_feature())
# Accumulate the per-group statistics over the whole frame
count_manager.update_averages(train_df)
# Look up, for every row, the occurrence count of each of its feature groups
# (this AverageManager variant returns counts, not target averages)
count_features = count_manager.apply_averages(train_df)
# Append the count columns next to the raw columns
train_df = pd.concat([train_df, count_features], axis=1)

# The manager and the intermediate frame are no longer needed
del count_features, count_manager
gc.collect()

77

In [8]:
# Same 5-fold shuffled split (same seed) as the raw-feature experiment
folds = KFold(n_splits=5, shuffle=True, random_state=1)

# Use every column of train_df except the target and the timestamp columns
excluded_features = ['click_time', 'is_attributed', 'attributed_time']
raw_features = [column for column in train_df.columns if column not in excluded_features]
target_col = get_target_feature()

# Evaluate a LightGBM classifier fold by fold, keeping the out-of-fold predictions
scores = []
oof_predictions = np.zeros(train_df.shape[0])
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_df)):
    clf = LGBMClassifier(num_leaves=31)
    clf.fit(
        train_df[raw_features].iloc[trn_idx].values,
        train_df[target_col].iloc[trn_idx].values
    )
    # Column 1 of predict_proba is the probability of the positive class
    val_proba = clf.predict_proba(train_df[raw_features].iloc[val_idx].values)
    oof_predictions[val_idx] = val_proba[:, 1]
    fold_auc = roc_auc_score(train_df[target_col].iloc[val_idx].values, oof_predictions[val_idx])
    scores.append(fold_auc)
    print("Fold %2d AUC score : %.6f" % (n_fold + 1, fold_auc))
    del clf, val_proba
    gc.collect()

# AUC over all out-of-fold predictions, plus mean / std of the per-fold AUCs
oof_score = roc_auc_score(train_df[target_col], oof_predictions)
print("LGBM with raw features OOF AUC %.6f AVG AUC %.6f +/- %.6f"
      % (oof_score, np.mean(scores), np.std(scores)))

Fold  1 AUC score : 0.964646
Fold  2 AUC score : 0.967757
Fold  3 AUC score : 0.966901
Fold  4 AUC score : 0.965455
Fold  5 AUC score : 0.962971
LGBM with raw features OOF AUC 0.965370 AVG AUC 0.965546 +/- 0.001684


## Train LightGBM with raw features, count features and time features

#### Compute click_rate feature

In [9]:
# Parse the click_time column into pandas datetimes and keep it in place
click_timestamps = pd.to_datetime(train_df['click_time'])
train_df['click_time'] = click_timestamps
# Day of month of each click
train_df['day'] = click_timestamps.dt.day
del click_timestamps

In [10]:
# Build a string key "<ip>_<app>_<day>" identifying each (ip, app, day) triple.
# The pieces are appended one at a time so only one converted column is alive at once.
ip_app_day_key = train_df['ip'].apply(str) + '_'
ip_app_day_key = ip_app_day_key + train_df['app'].apply(str) + '_'
ip_app_day_key = ip_app_day_key + train_df['day'].apply(str)
train_df['ip_app_day'] = ip_app_day_key
del ip_app_day_key

# First click, last click and number of clicks for each (ip, app, day) key
ip_day_stats = train_df.groupby('ip_app_day')['click_time'].agg(['min', 'max', 'count'])

# Convert the datetime bounds (nanoseconds since epoch) to seconds
for bound in ('max', 'min'):
    ip_day_stats[bound] = ip_day_stats[bound].astype(np.int64) // 1e9
# Elapsed seconds between first and last click divided by the number of clicks,
# i.e. seconds per click for that (ip, app, day)
elapsed_seconds = ip_day_stats['max'] - ip_day_stats['min']
ip_day_stats['click_rate_per_ip_app_day'] = elapsed_seconds / ip_day_stats['count']
del elapsed_seconds
# Replace the string key in train_df by the computed value for that key
train_df['ip_app_day'] = train_df['ip_app_day'].map(ip_day_stats['click_rate_per_ip_app_day'])

#### Compute time_difference feature

In [11]:
# Convert the datetime click_time to whole seconds since the epoch (stored as int32)
epoch_seconds = train_df['click_time'].astype(np.int64) // 10 ** 9
train_df['click_time'] = epoch_seconds.astype(np.int32)
# For each (ip, app, device, os) group: click time of the previous row minus the current one.
# The first row of every group has no previous click (NaN), which is replaced by 0.
# NOTE(review): the sign/meaning assumes rows are in chronological order — confirm.
previous_click = train_df.groupby(['ip', 'app', 'device', 'os'])['click_time'].shift(1)
click_gap = previous_click - train_df['click_time']
train_df['time_diff'] = click_gap.astype(np.float32).fillna(0)
del epoch_seconds, previous_click, click_gap

In [12]:
# click_time has been turned into features; remove the column and reclaim its memory
train_df.drop(columns=['click_time'], inplace=True)
gc.collect()

78

#### Train LightGBM

In [14]:
# Same 5-fold shuffled split (same seed) as the previous experiments
folds = KFold(n_splits=5, shuffle=True, random_state=1)

# Use every column of train_df except the target and the timestamp columns
excluded_features = ['click_time', 'is_attributed', 'attributed_time']
raw_features = [column for column in train_df.columns if column not in excluded_features]
target_col = get_target_feature()

# Evaluate a LightGBM classifier fold by fold, keeping the out-of-fold predictions
scores = []
oof_predictions = np.zeros(train_df.shape[0])
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_df)):
    # NOTE(review): per the LightGBM docs, subsample presumably only takes effect when
    # subsample_freq > 0 — confirm
    clf = LGBMClassifier(
        n_estimators=200,
        num_leaves=31,
        colsample_bytree=.8,
        subsample=.8
    )
    # The validation fold is used both for early stopping and for the score below,
    # so the reported AUC is slightly optimistic
    val_x = train_df[raw_features].iloc[val_idx].values
    val_y = train_df[target_col].iloc[val_idx].values
    # NOTE(review): verbose / early_stopping_rounds are passed to fit() here; newer
    # LightGBM releases may expect callbacks instead — confirm against the installed version
    clf.fit(
        train_df[raw_features].iloc[trn_idx].values,
        train_df[target_col].iloc[trn_idx].values,
        eval_set=[(val_x, val_y)],
        eval_metric='auc',
        verbose=False,
        early_stopping_rounds=50
    )
    # Column 1 of predict_proba is the probability of the positive class
    oof_predictions[val_idx] = clf.predict_proba(val_x)[:, 1]
    fold_auc = roc_auc_score(val_y, oof_predictions[val_idx])
    scores.append(fold_auc)
    print("Fold %2d AUC score : %.6f" % (n_fold + 1, fold_auc))
    del clf, val_x, val_y
    gc.collect()

# AUC over all out-of-fold predictions, plus mean / std of the per-fold AUCs
oof_score = roc_auc_score(train_df[target_col], oof_predictions)
print("LGBM with raw features OOF AUC %.6f AVG AUC %.6f +/- %.6f"
      % (oof_score, np.mean(scores), np.std(scores)))

Fold  1 AUC score : 0.962632
Fold  2 AUC score : 0.965644
Fold  3 AUC score : 0.966927
Fold  4 AUC score : 0.963998
Fold  5 AUC score : 0.962223
LGBM with raw features OOF AUC 0.962868 AVG AUC 0.964285 +/- 0.001783


This last LightGBM model performs worse than the previous one, but keep in mind that we are only using the first 10 million rows, which is certainly not enough.