In [1]:
import pickle
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [2]:
times = ['time%s' % i for i in range(1, 11)]
sites = ['site%s' % i for i in range(1, 11)]

train = pd.read_csv('../../data/alice/train_sessions.csv', parse_dates = times, index_col='session_id')
test = pd.read_csv('../../data/alice/test_sessions.csv', parse_dates = times, index_col='session_id')

train.sort_values(by='time1', inplace=True)

idx = train.shape[0]
data = pd.concat([train, test], sort=False) # leave train.target for eda

train.shape, test.shape, data.shape

((253561, 21), (82797, 20), (336358, 21))

In [3]:
data[sites] = data[sites].fillna(0).astype(np.uint16) # float->int (55.0 -> 55)

# for each row combine site_ids into one string separated by space
data['words'] = data[sites].astype(np.str).apply(' '.join, axis=1)

#words = CountVectorizer(max_features=50000, ngram_range=(1, 3)).fit_transform(data['words'])
words = TfidfVectorizer(max_features=50000, ngram_range=(1, 3)).fit_transform(data['words'])

data.drop(['words'], inplace=True, axis=1)
words

<336358x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 4433718 stored elements in Compressed Sparse Row format>

In [4]:
model = LogisticRegression(random_state=17, solver='liblinear')
time_split = TimeSeriesSplit(n_splits=10)

X_train = words[:idx]
y_train = train.target

cv_scores = cross_val_score(model, X_train, y_train, cv=time_split, scoring='roc_auc')
cv_scores, cv_scores.mean()

(array([0.81423928, 0.65396238, 0.87473037, 0.93492792, 0.84785245,
        0.88841651, 0.92460283, 0.87528125, 0.92859017, 0.92143771]),
 0.866404088770744)

In [8]:
data['min'] = data[times].min(axis=1)
data['max'] = data[times].max(axis=1)
data['seconds'] = ((data['max'] - data['min']) / np.timedelta64(1, 's'))
data['minutes'] = ((data['max'] - data['min']) / np.timedelta64(1, 'm')).round(2)
data.drop(['min','max'], inplace=True, axis=1)

data['month'] = data['time1'].apply(lambda ts: ts.month+(12*(ts.year-2013))).astype(np.int8)
data['yyyymm'] = data['time1'].apply(lambda ts: 100 * ts.year + ts.month).astype(np.int32) # wtf! why this works?
data['mm'] = data['time1'].apply(lambda ts: ts.month).astype(np.int8)
data['yyyy'] = data['time1'].apply(lambda ts: ts.year).astype(np.int32)

data['dayofweek'] = data['time1'].apply(lambda ts: ts.dayofweek).astype(np.int8)
data['weekend'] = data['time1'].apply(lambda ts: ts.dayofweek > 5).astype(np.int8)

data['hour'] = data['time1'].apply(lambda ts: ts.hour).astype(np.int8)

In [10]:
hosts = pd.read_pickle('../../data/alice/site_dic.pkl')
hosts = pd.DataFrame(data=list(hosts.keys()), index=list(hosts.values()), columns=['name']) # switch key and value

hosts['split'] = hosts['name'].str.split('.')
hosts['len'] = hosts['split'].map(lambda x: len(x)).astype(np.int8)
hosts['domain'] = hosts['split'].map(lambda x: x[-1])

hosts.drop(['name','split'], inplace=True, axis=1)
hosts.index.rename('site1', inplace=True) # rename index for the future merge
data = pd.merge(data, hosts, how='left', on='site1')

In [11]:
data['short'] = data['minutes'].map(lambda x: x < 0.8).astype(np.int8)
data['long'] = data['minutes'].map(lambda x: x >= 0.8).astype(np.int8)

In [12]:
data["online_day"] = data['time1'].apply(lambda ts: ts.dayofweek in [0,1,3,4]).astype(np.int8)
data["mon"] = data['time1'].apply(lambda ts: ts.dayofweek in [0]).astype(np.int8) # monday
data["wen"] = data['time1'].apply(lambda ts: ts.dayofweek in [2]).astype(np.int8) # wensday
data["sun"] = data['time1'].apply(lambda ts: ts.dayofweek in [6]).astype(np.int8) # sunday

In [13]:
data['big_site'] = data['len'].apply(lambda x: x > 5).astype(np.int8)
data['typical_site'] = data['len'].apply(lambda x: x == 3).astype(np.int8)

In [14]:
data['typical_domain'] = data['domain'].map(lambda x: x in ('com', 'fr', 'net', 'uk', 'org', 'tv')).astype(np.int)

In [15]:
''' wtf?
data['morning'] = data['time1'].apply(lambda ts: (ts.hour >= 8) & (ts.hour < 12)).astype(np.int8)
data['day'] = data['time1'].apply(lambda ts: (ts.hour >= 12) & (ts.hour < 15)).astype(np.int8)
data['evening'] = data['time1'].apply(lambda ts: (ts.hour >= 15) & (ts.hour < 19)).astype(np.int8)
data['night'] = data['time1'].apply(lambda ts: (ts.hour >= 19) | (ts.hour < 8)).astype(np.int8) # or!
'''

data['morning'] = data['time1'].apply(lambda ts: (ts.hour >= 7) & (ts.hour < 12)).astype(np.int8)
data['day'] = data['time1'].apply(lambda ts: (ts.hour >= 12) & (ts.hour < 18)).astype(np.int8)
data['evening'] = data['time1'].apply(lambda ts: (ts.hour >= 18) & (ts.hour < 23)).astype(np.int8)
data['night'] = data['time1'].apply(lambda ts: (ts.hour >= 23) | (ts.hour < 7)).astype(np.int8) # or!

In [16]:
data.columns

Index(['site1', 'time1', 'site2', 'time2', 'site3', 'time3', 'site4', 'time4',
       'site5', 'time5', 'site6', 'time6', 'site7', 'time7', 'site8', 'time8',
       'site9', 'time9', 'site10', 'time10', 'target', 'seconds', 'minutes',
       'month', 'yyyymm', 'mm', 'yyyy', 'dayofweek', 'weekend', 'hour', 'len',
       'domain', 'short', 'long', 'online_day', 'mon', 'wen', 'sun',
       'big_site', 'typical_site', 'typical_domain', 'morning', 'day',
       'evening', 'night'],
      dtype='object')

In [17]:
data.drop(times + sites + ['target'], inplace=True, axis=1)
#data.to_pickle('dump.pkl')
data.columns

Index(['seconds', 'minutes', 'month', 'yyyymm', 'mm', 'yyyy', 'dayofweek',
       'weekend', 'hour', 'len', 'domain', 'short', 'long', 'online_day',
       'mon', 'wen', 'sun', 'big_site', 'typical_site', 'typical_domain',
       'morning', 'day', 'evening', 'night'],
      dtype='object')

In [20]:
data.sample(5)

Unnamed: 0,seconds,minutes,month,yyyymm,mm,yyyy,dayofweek,weekend,hour,len,...,mon,wen,sun,big_site,typical_site,typical_domain,morning,day,evening,night
95127,280.0,4.67,13,201401,1,2014,2,0,16,4,...,0,1,0,0,0,0,0,1,0,0
10362,9.0,0.15,4,201304,4,2013,4,0,13,3,...,0,0,0,0,1,1,0,1,0,0
127240,9.0,0.15,14,201402,2,2014,0,0,16,3,...,1,0,0,0,1,1,0,1,0,0
198804,4.0,0.07,15,201403,3,2014,0,0,17,3,...,1,0,0,0,1,1,0,1,0,0
32766,333.0,5.55,11,201311,11,2013,1,0,10,3,...,0,0,0,0,1,1,1,0,0,0


In [36]:
#data = pd.get_dummies(data, columns=['domain'])

In [21]:
data = pd.get_dummies(data, columns=[
    'yyyy',
    'mm',
    'dayofweek',
    'hour',
    'len',
    'domain'
])



features_to_scale = [
    'seconds',
    'minutes',
    'month',
    'yyyymm'#,
    #'dayofweek',
    #'hour',
    #'len'
]

data[features_to_scale] = StandardScaler().fit_transform(data[features_to_scale])

In [23]:
data.sample(5)

Unnamed: 0,seconds,minutes,month,yyyymm,weekend,short,long,online_day,mon,wen,...,domain_tw,domain_ua,domain_ug,domain_uk,domain_us,domain_va,domain_vc,domain_vn,domain_ws,domain_za
169118,-0.454969,-0.454288,0.06786,0.501532,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
161761,0.3431,0.34378,-0.169077,0.476232,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
164042,0.680745,0.681425,-0.169077,0.476232,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
199632,-0.069577,-0.069578,0.06786,0.501532,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5087,-0.444737,-0.444057,-2.775387,-2.028476,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
%%time

#model = LogisticRegression(random_state=17, solver='liblinear')
#time_split = TimeSeriesSplit(n_splits=10)

X_train = csr_matrix(hstack([words[:idx], data[:idx]]))
y_train = train.target

params = {
    'C': np.logspace(-2, 2, 10),
    'penalty': ['l1','l2']
}

grid = GridSearchCV(estimator=model, param_grid=params, scoring='roc_auc', cv=time_split, verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

grid.best_estimator_, grid.best_score_, grid.best_params_

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 56.2min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 58.1min finished


Wall time: 58min 27s


(LogisticRegression(C=4.6415888336127775, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=17, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 0.9127606549387459,
 {'C': 4.6415888336127775, 'penalty': 'l2'})

In [26]:
%%time
model = grid.best_estimator_
model.fit(X_train, y_train)

X_test = csr_matrix(hstack([words[idx:], data[idx:]]))
y_test = model.predict_proba(X_test)[:, 1]

submission = pd.DataFrame({"session_id": test.index, "target": y_test})
submission.to_csv('submission.csv', index=False)

Wall time: 21.4 s
