In [1]:
import pandas as pd
import numpy as np
from pathlib2 import Path
import datetime
from dateutil.relativedelta import relativedelta

In [11]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels, index = np.arange(1, predicted_labels.shape[0] + 1), columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [2]:
PATH_TO_DATA = Path('../../data/alice/')

times = ['time%s' % i for i in range(1, 11)]
train_df = pd.read_csv(PATH_TO_DATA / 'train_sessions.csv',
                       index_col='session_id', parse_dates=times)
test_df = pd.read_csv(PATH_TO_DATA / 'test_sessions.csv',
                      index_col='session_id', parse_dates=times)

In [97]:
train_df['time1'].min(), train_df['time1'].max(), test_df['time1'].min(), test_df['time1'].max()

(Timestamp('2013-01-12 08:05:57'),
 Timestamp('2014-04-30 23:39:53'),
 Timestamp('2014-05-01 17:14:03'),
 Timestamp('2014-12-05 23:26:53'))

In [3]:
train_df.shape, test_df.shape

((253561, 21), (82797, 20))

In [4]:
train_df = train_df.sort_values(by='time1')

In [26]:
sites = ['site%s' % i for i in range(1, 11)]
X_val = train_df[sites].fillna(0).astype('int').astype('str')

In [36]:
X_val.values.tolist()

[['56', '55', '0', '0', '0', '0', '0', '0', '0', '0'],
 ['56', '55', '56', '55', '0', '0', '0', '0', '0', '0'],
 ['946', '946', '951', '946', '946', '945', '948', '784', '949', '946'],
 ['945', '948', '949', '948', '945', '946', '947', '945', '946', '946'],
 ['947', '950', '948', '947', '950', '952', '946', '951', '946', '947'],
 ['952', '947', '953', '946', '947', '946', '953', '955', '946', '947'],
 ['953', '947', '946', '953', '955', '947', '953', '946', '953', '1033'],
 ['946', '947', '954', '953', '946', '954', '946', '956', '957', '956'],
 ['946', '956', '946', '946', '955', '954', '946', '946', '946', '948'],
 ['948', '946', '948', '784', '49', '53', '812', '982', '52', '52'],
 ['52', '52', '52', '747', '747', '747', '23', '747', '568', '23'],
 ['513', '1116', '747', '23', '747', '747', '29', '49', '52', '21'],
 ['4222', '3358', '4222', '3356', '4222', '3870', '21', '3870', '3358', '21'],
 ['38667', '181', '23', '181', '38667', '23', '182', '181', '38667', '55'],
 ['56', '55', '

In [34]:
" ".join([site for site in X_val.values[4]])

'947 950 948 947 950 952 946 951 946 947'

In [25]:
train_df.info()
#test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253561 entries, 21669 to 204762
Data columns (total 21 columns):
site1     253561 non-null int64
time1     253561 non-null datetime64[ns]
site2     250098 non-null float64
time2     250098 non-null datetime64[ns]
site3     246919 non-null float64
time3     246919 non-null datetime64[ns]
site4     244321 non-null float64
time4     244321 non-null datetime64[ns]
site5     241829 non-null float64
time5     241829 non-null datetime64[ns]
site6     239495 non-null float64
time6     239495 non-null datetime64[ns]
site7     237297 non-null float64
time7     237297 non-null datetime64[ns]
site8     235224 non-null float64
time8     235224 non-null datetime64[ns]
site9     233084 non-null float64
time9     233084 non-null datetime64[ns]
site10    231052 non-null float64
time10    231052 non-null datetime64[ns]
target    253561 non-null int64
dtypes: datetime64[ns](10), float64(9), int64(2)
memory usage: 42.6 MB


In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler

In [6]:
class DataPreparator(BaseEstimator, TransformerMixin):
    """
    Fill NaN with zero values.
    """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        sites = ['site%s' % i for i in range(1, 11)]
        return X[sites].fillna(0).astype('int')

In [7]:
class ListPreparator(BaseEstimator, TransformerMixin):
    """
    Prepare a CountVectorizer friendly 2D-list from data.
    """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = X.values.tolist()
        # Convert dataframe rows to strings
        return [" ".join([str(site) for site in row]) for row in X]

In [8]:
class AttributesAdder(BaseEstimator, TransformerMixin):
    """
    Add new attributes to training and test set.
    """
    def fit(self, X, y=None):
        return self 
    def transform(self, X, y=None):
        # intraday features
        hour = X['time1'].apply(lambda ts: ts.hour)
        morning = ((hour >= 7) & (hour <= 11)).astype('int')
        day = ((hour >= 12) & (hour <= 18)).astype('int')
        evening = ((hour >= 19) & (hour <= 23)).astype('int')
        
        # season features
        month = X['time1'].apply(lambda ts: ts.month)
        summer = ((month >= 6) & (month <= 8)).astype('int')
        
        #winter = ((month == 12) | ((month <= 2) & (month >= 1))).astype('int')
        #spring = ((month >= 3) & (month <= 5)).astype('int')
        #autumn = ((month >= 9) & (month <= 11)).astype('int')
        
        # day of the week features
        weekday = X['time1'].apply(lambda ts: ts.weekday()).astype('int')
        
        # year features
        year = X['time1'].apply(lambda ts: ts.year).astype('int')
        #year_month = X['time1'].apply(lambda t: 100 * t.year + t.month).astype('int')
        
        #winter.values, spring.values, autumn.values,
        X = np.c_[morning.values, day.values, evening.values, summer.values, \
                  weekday.values, year.values]
        return X

In [9]:
class ScaledAttributesAdder(BaseEstimator, TransformerMixin):
    """
    Add new features, that should be scaled.
    """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # session time features
        times = ['time%s' % i for i in range(1, 11)]
        # session duration: take to the power of 1/5 to normalize the distribution
        session_duration = (X[times].max(axis=1) - X[times].min(axis=1)).astype('timedelta64[ms]').astype(int) ** 0.2
        # number of sites visited in a session
        number_of_sites = X[times].isnull().sum(axis=1).apply(lambda x: 10 - x)
        # average time spent on one site during a session
        time_per_site = (session_duration / number_of_sites) ** 0.2
        
        X = np.c_[session_duration.values]
        return X

In [10]:
vectorizer_pipeline = Pipeline([
    ("preparator", DataPreparator()),
    ("list_preparator", ListPreparator()),
    ("vectorizer", CountVectorizer(ngram_range=(1, 3), max_features=100000))
])

attributes_pipeline = Pipeline([
    ("adder", AttributesAdder())
])

scaled_attributes_pipeline = Pipeline([
    ("adder", ScaledAttributesAdder()),
    ("scaler", StandardScaler())
])

In [11]:
full_pipeline = FeatureUnion(transformer_list=[
('vectorizer_pipeline', vectorizer_pipeline),
('attributes_pipeline', attributes_pipeline),
('scaled_attributes_pipeline', scaled_attributes_pipeline)
])

In [12]:
%%time

X_train = full_pipeline.fit_transform(train_df)
X_test = full_pipeline.transform(test_df)

y_train = train_df["target"].astype('int').values

CPU times: user 17.2 s, sys: 259 ms, total: 17.4 s
Wall time: 17.5 s


In [13]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [14]:
%%time
time_split = TimeSeriesSplit(n_splits=8)

#c_values = np.logspace(-2, 2, 20)
logit = LogisticRegression(C=0.18, random_state=17, solver='liblinear')
#logit = LogisticRegression(random_state=17, solver='lbfgs')

#logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_val},
#                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, 
                        scoring='roc_auc', n_jobs=1)

cv_scores.mean()

#logit_grid_searcher.fit(X_train, y_train)

CPU times: user 1min 15s, sys: 37.8 s, total: 1min 53s
Wall time: 30 s


0.9120484745412575

In [89]:
logit.fit(X_train, y_train)

LogisticRegression(C=0.18, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=17, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [90]:
logit_test_pred = logit.predict_proba(X_test)[:, 1]

write_to_submission_file(logit_test_pred, 'custom_pipeline_1.csv')

___

In [25]:
# !!!!!!!!
try_train_df = train_df.copy()
try_train_df = try_train_df.sort_values(by='time1')

try_train_df = train_df[train_df['time1'] >= '2013-05-01 00:00:00']

In [26]:
## ТЕСТ ОБРЕЗКИ ТРЕЙН_ДФ

X_train = full_pipeline.fit_transform(try_train_df)
X_test = full_pipeline.transform(test_df)

y_train = try_train_df["target"].astype('int').values

In [46]:
%time
time_split = TimeSeriesSplit(n_splits=8)

c_val = np.logspace(-4, 2, 20)
logit_cv = LogisticRegressionCV(Cs=c_val, cv=time_split, scoring='roc_auc', solver='liblinear', n_jobs=-1, random_state=17).fit(X_train, y_train)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs


In [47]:
logit_cv.predict(X_train)

array([0, 0, 0, ..., 0, 0, 0])

In [48]:
logit_cv.score(X_train, y_train)



0.9896078026252935

In [50]:
logit_cv.

{'Cs': array([1.00000000e-04, 2.06913808e-04, 4.28133240e-04, 8.85866790e-04,
        1.83298071e-03, 3.79269019e-03, 7.84759970e-03, 1.62377674e-02,
        3.35981829e-02, 6.95192796e-02, 1.43844989e-01, 2.97635144e-01,
        6.15848211e-01, 1.27427499e+00, 2.63665090e+00, 5.45559478e+00,
        1.12883789e+01, 2.33572147e+01, 4.83293024e+01, 1.00000000e+02]),
 'class_weight': None,
 'cv': TimeSeriesSplit(max_train_size=None, n_splits=8),
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1.0,
 'l1_ratios': None,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': -1,
 'penalty': 'l2',
 'random_state': 17,
 'refit': True,
 'scoring': 'roc_auc',
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0}

In [37]:
logit_cv_pp = logit_cv.predict_proba(X_train)

In [43]:
logit_cv_pp[:, 1]

array([1.51545158e-05, 2.89139224e-04, 2.41186726e-03, ...,
       2.21495996e-03, 1.52838074e-06, 2.13227675e-05])

In [44]:
logit_cv_pp#[:, 1]

array([[9.99984845e-01, 1.51545158e-05],
       [9.99710861e-01, 2.89139224e-04],
       [9.97588133e-01, 2.41186726e-03],
       ...,
       [9.97785040e-01, 2.21495996e-03],
       [9.99998472e-01, 1.52838074e-06],
       [9.99978677e-01, 2.13227675e-05]])

In [49]:
logit_test_pred = logit_cv.predict_proba(X_test)[:, 1]

write_to_submission_file(logit_test_pred, 'pipeline_cut_try_logitcv.csv')

In [35]:
logit_cv.score(X_train, y_train)



0.9895027132458616

In [67]:
%%time
time_split = TimeSeriesSplit(n_splits=8)

logit = LogisticRegression(C=3.56, random_state=17, solver='liblinear')
#logit = LogisticRegression(random_state=17, solver='lbfgs')

cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, 
                        scoring='roc_auc', n_jobs=-1)

cv_scores.mean()
#0.95362 liblinear n_splits=8 C=1
#0.93711 lbfgs
#0.935049 iblinear n_splits=6
#0.93271 iblinear n_splits=5
#0.93211 iblinear n_splits=9
#0.94867 liblinear n_splits=8 C=0.1
#0.95149 liblinear n_splits=8 C=0.2
#0.9535 liblinear n_splits=8 C=0.5
#0.95398 liblinear n_splits=8 C=0.6

Wall time: 15 s


0.9576464237589468

In [58]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.944043051154898, {'C': 4.3428571428571425})

In [22]:
logit_test_pred = logit_grid_searcher.best_estimator_.predict_proba(X_test)[:, 1]

write_to_submission_file(logit_test_pred, 'pipeline_cut_train_gscv.csv')

In [48]:
train_df['time1']

session_id
1        2014-02-20 10:02:45
2        2014-02-22 11:19:50
3        2013-12-16 16:40:17
4        2014-03-28 10:52:12
5        2014-02-28 10:53:05
                 ...        
253557   2013-11-25 10:26:54
253558   2013-03-12 16:01:15
253559   2013-09-12 14:05:03
253560   2013-12-19 15:20:22
253561   2014-04-25 09:56:52
Name: time1, Length: 253561, dtype: datetime64[ns]

In [65]:
time = '2014-02-20 10:02:45'
time = pd.to_datetime(time) + relativedelta(months=3)

In [92]:
train_df['time1'].max()

Timestamp('2014-04-30 23:39:53')

In [21]:
def cut_train_df(train_df, test_df, pipeline, date):
    print('Train data frame starts at ', date)
    
    train_df.sort_values(by='time1')
    train_df = train_df[train_df['time1'] >= date]
    
    X_train = full_pipeline.fit_transform(train_df)
    X_test = full_pipeline.transform(test_df)

    y_train = train_df["target"].astype('int').values
    
    time_split = TimeSeriesSplit(n_splits=8)

    logit = LogisticRegression(random_state=17, solver='liblinear')

    cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, 
                        scoring='roc_auc', n_jobs=1)  
    print('CV mean score: ', cv_scores.mean())
    
    logit.fit(X_train, y_train)
    
    logit_test_pred = logit.predict_proba(X_test)[:, 1]

    write_to_submission_file(logit_test_pred, 'custom_pipeline_cut_{}.csv'.format(date))
    
    return cv_scores.mean()

In [22]:
%%time
results = {}
date = train_df['time1'].min()
while date < pd.to_datetime('2014-04-01 00:00:00'):
    cv_s = cut_train_df(train_df, test_df, full_pipeline, date)
    results[date] = cv_s
    date = date + relativedelta(months=1)

Train data frame starts at  2013-01-12 08:05:57
CV mean score:  0.9677516470573522
Train data frame starts at  2013-02-12 08:05:57
CV mean score:  0.9667072829727531
Train data frame starts at  2013-03-12 08:05:57
CV mean score:  0.9660186820080507
Train data frame starts at  2013-04-12 08:05:57
CV mean score:  0.9677917323116562
Train data frame starts at  2013-05-12 08:05:57
CV mean score:  0.9742345155104627
Train data frame starts at  2013-06-12 08:05:57
CV mean score:  0.9748798329327567
Train data frame starts at  2013-07-12 08:05:57
CV mean score:  0.9751020897868399
Train data frame starts at  2013-08-12 08:05:57
CV mean score:  0.9749307387803395
Train data frame starts at  2013-09-12 08:05:57
CV mean score:  0.9752516749458143
Train data frame starts at  2013-10-12 08:05:57
CV mean score:  0.9726688916353792
Train data frame starts at  2013-11-12 08:05:57
CV mean score:  0.9732411211968601
Train data frame starts at  2013-12-12 08:05:57
CV mean score:  0.9815645171315289
Trai