In [3]:
import pandas as pd
import numpy as np
from pathlib2 import Path

In [83]:
#!pip install eli5
import eli5

In [4]:
# Directory that holds train_sessions.csv and test_sessions.csv.
PATH_TO_DATA = Path('../../data/alice/')

# Names of the ten timestamp columns (time1 .. time10); they are parsed
# as datetimes while both CSV files are read.
times = ['time{}'.format(position) for position in range(1, 11)]

train_df = pd.read_csv(
    PATH_TO_DATA / 'train_sessions.csv',
    parse_dates=times,
    index_col='session_id',
)
test_df = pd.read_csv(
    PATH_TO_DATA / 'test_sessions.csv',
    parse_dates=times,
    index_col='session_id',
)

In [97]:
# Earliest and latest session start (time1) in the train frame, then in the test frame.
train_df['time1'].min(), train_df['time1'].max(), test_df['time1'].min(), test_df['time1'].max()

(Timestamp('2013-01-12 08:05:57'),
 Timestamp('2014-04-30 23:39:53'),
 Timestamp('2014-05-01 17:14:03'),
 Timestamp('2014-12-05 23:26:53'))

In [99]:
# Keep only the training sessions whose first timestamp is on or after
# 2014-01-01 00:00:00.
recent_mask = train_df['time1'] >= '2014-01-01 00:00:00'
try_train_df = train_df.loc[recent_mask]

In [5]:
# (rows, columns) of the train frame and of the test frame.
train_df.shape, test_df.shape

((253561, 21), (82797, 20))

In [102]:
# Order the truncated training frame by session start time (time1, ascending).
# NOTE(review): presumably required so that TimeSeriesSplit folds follow
# chronological order — confirm.
try_train_df = try_train_df.sort_values(by='time1')

In [8]:
# Order the full training frame by session start time (time1, ascending).
# NOTE(review): presumably required so that TimeSeriesSplit folds follow
# chronological order — confirm.
train_df = train_df.sort_values(by='time1')

In [10]:
#train_df.info()
# Column dtypes and non-null counts of the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82797 entries, 1 to 82797
Data columns (total 20 columns):
site1     82797 non-null int64
time1     82797 non-null datetime64[ns]
site2     81308 non-null float64
time2     81308 non-null datetime64[ns]
site3     80075 non-null float64
time3     80075 non-null datetime64[ns]
site4     79182 non-null float64
time4     79182 non-null datetime64[ns]
site5     78341 non-null float64
time5     78341 non-null datetime64[ns]
site6     77566 non-null float64
time6     77566 non-null datetime64[ns]
site7     76840 non-null float64
time7     76840 non-null datetime64[ns]
site8     76151 non-null float64
time8     76151 non-null datetime64[ns]
site9     75484 non-null float64
time9     75484 non-null datetime64[ns]
site10    74806 non-null float64
time10    74806 non-null datetime64[ns]
dtypes: datetime64[ns](10), float64(9), int64(1)
memory usage: 13.3 MB


In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler

In [12]:
class DataPreparator(BaseEstimator, TransformerMixin):
    """
    Select the ten site columns (site1 .. site10), replace missing
    entries with 0 and cast the result to integers.
    """
    def fit(self, X, y=None):
        # Stateless transformer: there is nothing to learn.
        return self

    def transform(self, X, y=None):
        site_columns = ['site{}'.format(n) for n in range(1, 11)]
        site_ids = X[site_columns]
        filled = site_ids.fillna(0)
        return filled.astype('int')

In [13]:
class ListPreparator(BaseEstimator, TransformerMixin):
    """
    Turn every dataframe row into one space-separated string so the
    result can be fed to CountVectorizer.
    """
    def fit(self, X, y=None):
        # Stateless transformer: there is nothing to learn.
        return self

    def transform(self, X, y=None):
        documents = []
        for row in X.values.tolist():
            # One "document" per row: the row's values joined by spaces.
            documents.append(" ".join(map(str, row)))
        return documents

In [86]:
class AttributesAdder(BaseEstimator, TransformerMixin):
    """
    Derive calendar features from the session start time (``time1``).

    ``transform`` returns a 2-D integer array with one row per session and
    six columns, in this order:

    0. morning - 1 if the start hour is in 7..11, else 0
    1. day     - 1 if the start hour is in 12..18, else 0
    2. evening - 1 if the start hour is in 19..23, else 0
    3. summer  - 1 if the start month is June, July or August, else 0
    4. weekday - day of the week, Monday=0 .. Sunday=6
    5. year    - calendar year

    Hours 0..6 set none of the three intraday flags. Winter, spring and
    autumn indicators are not emitted.
    """
    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned from the data."""
        return self

    def transform(self, X, y=None):
        """
        Build the feature matrix described in the class docstring.

        X must provide a datetime64 ``time1`` column; y is ignored.
        """
        # The vectorised ``.dt`` accessor avoids calling a Python lambda
        # once per row for every derived field.
        start = X['time1'].dt

        # intraday features
        hour = start.hour
        morning = ((hour >= 7) & (hour <= 11)).astype('int')
        day = ((hour >= 12) & (hour <= 18)).astype('int')
        evening = ((hour >= 19) & (hour <= 23)).astype('int')

        # season features
        month = start.month
        summer = ((month >= 6) & (month <= 8)).astype('int')

        # day of the week features
        weekday = start.weekday.astype('int')

        # year features
        year = start.year.astype('int')

        return np.c_[morning.values, day.values, evening.values, summer.values,
                     weekday.values, year.values]

In [15]:
class ScaledAttributesAdder(BaseEstimator, TransformerMixin):
    """
    Add new features, that should be scaled.

    A single feature is produced: the session duration in whole
    milliseconds raised to the power 0.2. No site-count or
    time-per-site feature is part of the output.
    """
    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned from the data."""
        return self

    def transform(self, X, y=None):
        """
        Return an (n_sessions, 1) float array with the transformed duration.

        X must provide the datetime columns time1 .. time10; y is ignored.
        """
        times = ['time%s' % i for i in range(1, 11)]
        timestamps = X[times]

        # Duration = latest minus earliest timestamp of the row; pandas
        # max/min skip missing timestamps by default.
        elapsed = timestamps.max(axis=1) - timestamps.min(axis=1)

        # Explicit int64 keeps the millisecond count independent of the
        # platform's default integer width.
        duration_ms = elapsed.astype('timedelta64[ms]').astype('int64')

        # Take to the power of 1/5 to normalize the distribution.
        session_duration = duration_ms ** 0.2

        return np.c_[session_duration.values]

In [106]:
# Site ids -> one text "document" per session -> counts of 1- to 3-grams,
# keeping at most 50000 vocabulary entries.
vectorizer_pipeline = Pipeline(steps=[
    ("preparator", DataPreparator()),
    ("list_preparator", ListPreparator()),
    ("vectorizer", CountVectorizer(max_features=50000, ngram_range=(1, 3))),
])

# Calendar features derived from time1; passed through without scaling.
attributes_pipeline = Pipeline(steps=[
    ("adder", AttributesAdder()),
])

# Numeric features that are standardised after being computed.
scaled_attributes_pipeline = Pipeline(steps=[
    ("adder", ScaledAttributesAdder()),
    ("scaler", StandardScaler()),
])

In [107]:
# Concatenate the outputs of the three sub-pipelines column-wise, in the
# order listed here.
full_pipeline = FeatureUnion(
    transformer_list=[
        ('vectorizer_pipeline', vectorizer_pipeline),
        ('attributes_pipeline', attributes_pipeline),
        ('scaled_attributes_pipeline', scaled_attributes_pipeline),
    ],
)

In [89]:
%%time

# Fit the feature pipeline on the full training frame, then apply the fitted
# pipeline to the test frame.
X_train = full_pipeline.fit_transform(train_df)
X_test = full_pipeline.transform(test_df)

# Target column of the training frame as an integer array.
y_train = train_df["target"].astype('int').values

Wall time: 29.9 s


In [108]:
## TEST: TRUNCATED TRAIN_DF
# Same steps as for the full frame, but fitted on try_train_df. This
# overwrites X_train, X_test and y_train with the truncated-data versions.

X_train = full_pipeline.fit_transform(try_train_df)
X_test = full_pipeline.transform(test_df)

y_train = try_train_df["target"].astype('int').values

In [41]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [116]:
%%time
# Splitter with 8 successive train/validation splits over the row order of X_train.
time_split = TimeSeriesSplit(n_splits=8)

# Baseline logistic regression with C=1 and a fixed seed.
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

# ROC AUC of the baseline on every split, computed on all available cores.
cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, 
                        scoring='roc_auc', n_jobs=-1)

# Mean ROC AUC across the splits (displayed as the cell output).
cv_scores.mean()

Wall time: 16.5 s


0.9520573232181172

In [90]:
%%time

# 20 candidate values of C, log-spaced between 1e-2 and 1e2.
c_values = np.logspace(-2, 2, 20)

# Search over C, scored by ROC AUC on the time_split folds.
# NOTE(review): the recorded output of this cell reports 10 folds, while
# time_split is created with n_splits=8 — the output comes from a different
# kernel state; re-run to refresh it.
logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

logit_grid_searcher.fit(X_train, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 14.0min finished


Wall time: 14min 13s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=0.21544346900318834,
                                          class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=17, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_st...
             param_grid={'C': array([1.00000000e-02, 1.62377674e-02, 2.63665090e-02, 4.28133240e-02,
       6.95192796e-02, 1.12883789e-01, 1.83298071e-01, 2.97635144e-01,
       4.83293024e-01, 7.84759970e-01, 1.27427499e+00, 2.06913808e+00,
       3.35981829e+00, 5

In [91]:
# Best mean cross-validated ROC AUC and the C value that achieved it.
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9190735087900102, {'C': 0.18329807108324356})

In [93]:
# Estimator stored by the grid search as its best candidate; the bare name
# on the last line displays its parameters.
lr = logit_grid_searcher.best_estimator_
lr

LogisticRegression(C=0.18329807108324356, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=17, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [92]:
# NOTE(review): repeats the grid-search fit already performed in an earlier
# cell; the recorded run of this cell ended in KeyboardInterrupt. Consider
# deleting this cell.
logit_grid_searcher.fit(X_train, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [94]:
# Probability of the positive class (column 1 of predict_proba) for every test session.
logit_test_pred = lr.predict_proba(X_test)[:, 1]

# NOTE(review): write_to_submission_file is defined in a cell that appears
# further down in this notebook; on a fresh top-to-bottom run this call fails
# with NameError unless that definition is moved above this cell.
write_to_submission_file(logit_test_pred, 'custom_pipeline_gscv.csv')

In [117]:
# Fit the C=1 baseline on whatever X_train / y_train currently hold.
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=17, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [51]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    """
    Write predictions to a CSV file with a 1-based row id column.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per row, in output order. Lists, NumPy arrays and
        pandas Series are accepted; any pandas index is ignored.
    out_file : str or path-like
        Destination of the CSV file.
    target : str
        Header of the prediction column.
    index_label : str
        Header of the row id column.
    """
    # Converting to a plain array lets lists through (they have no .shape)
    # and prevents a Series from being re-aligned to the 1..n index below,
    # which would shift values and introduce NaN.
    values = np.asarray(predicted_labels)
    row_ids = np.arange(1, values.shape[0] + 1)
    predicted_df = pd.DataFrame(values, index=row_ids, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [118]:
# Probability of the positive class (column 1 of predict_proba) from the C=1 baseline.
logit_test_pred = logit.predict_proba(X_test)[:, 1]

# Save the predictions as a submission CSV.
write_to_submission_file(logit_test_pred, 'pipeline_cut_train.csv')

In [85]:
# Display the largest coefficients of the best grid-search estimator. No
# feature names are passed, so features are listed as x<column index>.
eli5.show_weights(logit_grid_searcher.best_estimator_)

Weight?,Feature
+1.943,x50001
+1.423,x3917
+1.379,x21071
+1.129,x24377
+1.049,x50006
+0.989,x21198
+0.961,x41890
+0.930,x49607
+0.919,x21067
+0.907,x21065
