In [1]:
import os
import gc
import time
import psutil
import warnings
import numpy as np
import pandas as pd

from utils import timer, reduce_memory_usage
from feature_engineering import extract_datetime_components, add_group_stats

warnings.filterwarnings('ignore')

In [2]:
with timer('Load full training data'):
    # Row offsets used below to carve single days out of train.csv
    # (presumably first/last row of each day — verify against the raw file).
    nov_7_start = 9308568
    nov_7_end   = 68941877

    nov_8_start = 68941878
    nov_8_end   = 131886952

    # Compact unsigned dtypes for every column that may be read.
    read_dtypes = {
        'ip':            'uint32',
        'app':           'uint16',
        'device':        'uint16',
        'os':            'uint16',
        'channel':       'uint16',
        'is_attributed': 'uint8',
        'click_id':      'uint32'
    }

    train_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']

    # Reader options common to both sources.
    csv_options = {'dtype': read_dtypes, 'usecols': train_cols}

    # read data on 11/07: reuse the cached extract when present,
    # otherwise slice that day out of the full archive.
    copy_filename = 'train_nov_7.csv'
    if os.path.exists(copy_filename):
        train = pd.read_csv(copy_filename, **csv_options)
    else:
        # skiprows starts at 1 so the header line (row 0) is kept.
        # NOTE(review): nrows = end - start stops one row before nov_7_end
        # if that bound is meant to be inclusive — confirm.
        train = pd.read_csv('train.csv.zip',
                            skiprows=range(1, nov_7_start + 1),
                            nrows=nov_7_end - nov_7_start,
                            **csv_options)
    print(f'Training data size: {train.shape}')

Training data size: (59633309, 7)
[Load full training data done in 75.669 s.]


In [3]:
with timer('Save a copy of current training data'):
    # Write the extract to disk only when no file with that name exists yet.
    if os.path.exists(copy_filename):
        print(f'{copy_filename} already saved.')
    else:
        train.to_csv(copy_filename, index=False)

train_nov_7.csv already saved.
[Save a copy of current training data done in 0.001 s.]


In [4]:
# Restrict the following feature-engineering cells to the first `sample_size` rows of `train`.
sample_size = 15_000_000
# Take an explicit copy: a bare .iloc slice may be a view of `train`, so adding
# columns to it later has ambiguous semantics (SettingWithCopyWarning, which
# this notebook silences) and makes re-running the cell unreliable.
train_subset = train.iloc[:sample_size].copy()
print(train_subset.shape)

(15000000, 7)


In [5]:
with timer('Extract time components'):
    # Columns handed to the project helper; the frame it returns replaces train_subset.
    datetime_cols = ['click_time']
    train_subset = extract_datetime_components(train_subset, cols=datetime_cols)

[Extract time components done in 18.916 s.]


In [6]:
# Show column dtypes and memory usage of the subset after the datetime step.
train_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000000 entries, 0 to 14999999
Data columns (total 11 columns):
 #   Column            Dtype         
---  ------            -----         
 0   ip                uint32        
 1   app               uint16        
 2   device            uint16        
 3   os                uint16        
 4   channel           uint16        
 5   click_time        datetime64[ns]
 6   is_attributed     uint8         
 7   click_time_year   int16         
 8   click_time_month  int8          
 9   click_time_day    int8          
 10  click_time_hour   int8          
dtypes: datetime64[ns](1), int16(1), int8(3), uint16(4), uint32(1), uint8(1)
memory usage: 371.9 MB


In [7]:
from itertools import combinations, permutations

# Categorical columns whose combinations are enumerated below.
categorical_features = ['ip', 'os', 'app', 'device', 'channel']

# Print every way of choosing 4 of the columns, one tuple per line.
print(*combinations(categorical_features, 4), sep='\n')

('ip', 'os', 'app', 'device')
('ip', 'os', 'app', 'channel')
('ip', 'os', 'device', 'channel')
('ip', 'app', 'device', 'channel')
('os', 'app', 'device', 'channel')


In [8]:
# Each pair is (group-by key columns, column that gets counted within the group).
_count_specs = [
    # 2 way combination
    (['ip'], 'os'),
    (['os'], 'ip'),
    (['app'], 'ip'),
    (['device'], 'ip'),
    (['channel'], 'ip'),

    # 3 way combination
    (['ip', 'os'], 'app'),
    (['ip', 'app'], 'os'),
    (['ip', 'device'], 'os'),
    (['ip', 'channel'], 'os'),

    # 4 way combination
    (['ip', 'os', 'app'], 'device'),
    (['ip', 'os', 'device'], 'app'),
    (['ip', 'os', 'channel'], 'app'),
    (['ip', 'app', 'device'], 'os'),
    (['ip', 'app', 'channel'], 'os'),

    # 5 way combination
    (['ip', 'os', 'app', 'device'], 'channel'),
    (['ip', 'os', 'app', 'channel'], 'device'),
    (['ip', 'os', 'device', 'channel'], 'app'),
    (['ip', 'app', 'device', 'channel'], 'os'),
]

# Expand into the (keys, [(value column, aggregation method)]) shape that the
# feature-extraction loop iterates over; every entry uses the 'count' method.
groupby_extraction_setting = [
    (key_cols, [(counted_col, 'count')])
    for key_cols, counted_col in _count_specs
]

In [9]:
with timer('Extract simple groupby count features'):
    # Walk every (group keys, [(value column, method), ...]) setting in order.
    # add_group_stats returns a pair: the first item replaces train_subset,
    # the second is not used here.
    for cols, stat_specs in groupby_extraction_setting:
        for value, method in stat_specs:
            train_subset, _ = add_group_stats(train_subset, cols, value, method)

[Extract simple groupby count features done in 596.606 s.]


In [10]:
with timer('Extract groupby timedelta features'):
    # Key sets within which consecutive clicks are compared.
    groupby_extraction_for_timedetla = [
        ['ip', 'device', 'os'],
        ['ip', 'app', 'channel']
    ]

    for groupby_cols in groupby_extraction_for_timedetla:
        prefix = '_'.join(groupby_cols)

        # Stable sort: rows sharing a click_time keep their original relative
        # order, so which row is treated as "previous" is reproducible.
        clicks_sorted = train_subset[groupby_cols + ['click_time']].sort_values(
            'click_time', kind='mergesort')

        # Time of the preceding click within the same group
        # (missing for the first click of each group).
        prev_click_time = clicks_sorted.groupby(groupby_cols)['click_time'].shift(1)

        # total_seconds() yields the full gap; .dt.seconds would return only
        # the seconds component and silently drop whole days.
        # The assignment aligns on the row index, so each value lands on the
        # row it was computed for despite the sort.
        train_subset[f'{prefix}_click_time_prev_diff'] = (
            clicks_sorted['click_time'] - prev_click_time
        ).dt.total_seconds()

[Extract groupby timedelta features done in 42.853 s.]


In [11]:
# Show column dtypes and memory usage again now that the group-by features were added.
train_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000000 entries, 0 to 14999999
Data columns (total 31 columns):
 #   Column                               Dtype         
---  ------                               -----         
 0   ip                                   uint32        
 1   app                                  uint16        
 2   device                               uint16        
 3   os                                   uint16        
 4   channel                              uint16        
 5   click_time                           datetime64[ns]
 6   is_attributed                        uint8         
 7   click_time_year                      int16         
 8   click_time_month                     int8          
 9   click_time_day                       int8          
 10  click_time_hour                      int8          
 11  ip_count                             int32         
 12  os_count                             int32         
 13  app_count                

In [17]:
with timer('Save training features'):
    features_filename = 'train_features_nov_7.csv'
    features_dtype_filename = 'train_features_dtypes_nov_7.csv'
    # Nothing is written when the features file is already on disk.
    if os.path.exists(features_filename):
        print(f'{features_filename} already saved.')
    else:
        train_subset.to_csv(features_filename, index=False)
        # Store a two-column (features, dtype) table next to the CSV,
        # since the CSV itself does not carry the column dtypes.
        train_subset_dtypes = pd.DataFrame({
            'features': train_subset.columns,
            'dtype': train_subset.dtypes.values,
        })
        train_subset_dtypes.to_csv(features_dtype_filename, index=False)

[Save training features done in 442.977 s.]
