In [1]:
# Import libraries and set desired options
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [2]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
# Read the training and test data sets
train_df = pd.read_csv('D:/Python projects/mlcourse_ai_solutions/alice_catch_me_if_you_can_competition/train_sessions.csv',
                       index_col='session_id', parse_dates=['time1'])
test_df = pd.read_csv('D:/Python projects/mlcourse_ai_solutions/alice_catch_me_if_you_can_competition/test_sessions.csv',
                      index_col='session_id', parse_dates=['time1'])

# Sort the data by time
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,,,,,,...,,,,,,,,,,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,,...,,,,,,,,,,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [4]:
train_df.describe()

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,target
count,253561.0,250098.0,246919.0,244321.0,241829.0,239495.0,237297.0,235224.0,233084.0,231052.0,253561.0
mean,3243.550144,3272.400883,3303.987757,3325.109422,3343.406035,3352.312637,3395.546771,3430.157765,3438.400088,3460.388861,0.009059
std,7247.006212,7307.480404,7347.809918,7371.245606,7379.75855,7380.413775,7435.727671,7473.4179,7471.062652,7492.447548,0.094747
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,53.0,53.0,53.0,52.0,52.0,52.0,55.0,55.0,55.0,55.0,0.0
50%,677.0,677.0,677.0,677.0,678.0,679.0,679.0,704.0,733.0,733.0,0.0
75%,1980.0,1980.0,1986.5,1995.0,2054.0,2077.0,2197.0,2287.0,2366.0,2403.0,0.0
max,41601.0,41600.0,41599.0,41599.0,41599.0,41600.0,41600.0,41601.0,41601.0,41601.0,1.0


In [5]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites].fillna(0).astype('int').to_csv('train_sessions_text.txt', 
                                               sep=' ', 
                       index=None, header=None)
test_df[sites].fillna(0).astype('int').to_csv('test_sessions_text.txt', 
                                              sep=' ', 
                       index=None, header=None)

In [6]:
%%time
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
X_train.shape, X_test.shape

CPU times: total: 10 s
Wall time: 10.1 s


((253561, 50000), (82797, 50000))

In [7]:
y_train = train_df['target'].astype('int').values

In [8]:
time_split = TimeSeriesSplit(n_splits=10)

In [9]:
[(el[0].shape, el[1].shape) for el in time_split.split(X_train)]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

In [10]:
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [11]:
%%time

cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=3) # hangs with n_jobs > 1, and locally this runs much faster

CPU times: total: 93.8 ms
Wall time: 14.4 s


In [12]:
cv_scores, cv_scores.mean()

(array([0.83141992, 0.64670745, 0.87992157, 0.9631551 , 0.84221498,
        0.87840596, 0.94475732, 0.85322119, 0.92987618, 0.90752852]),
 0.8677208183759575)

In [13]:
logit.fit(X_train, y_train)

LogisticRegression(C=1, random_state=17, solver='liblinear')

In [14]:
logit_test_pred = logit.predict_proba(X_test)[:, 1]

In [15]:
write_to_submission_file(logit_test_pred, 'subm1.csv') # 0.91288

In [16]:
df_prediction1 = pd.read_csv('subm1.csv')

In [17]:
df_prediction1.head(7)

Unnamed: 0,session_id,target
0,1,0.001465269
1,2,2.048972e-08
2,3,4.702726e-08
3,4,1.203842e-07
4,5,1.071117e-05
5,6,0.0005706284
6,7,0.001915689


In [18]:
def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)])
    return X

In [19]:
%%time
X_train_new = add_time_features(train_df.fillna(0), X_train)
X_test_new = add_time_features(test_df.fillna(0), X_test)

CPU times: total: 2.12 s
Wall time: 2.12 s


In [20]:
X_train_new.shape, X_test_new.shape

((253561, 50004), (82797, 50004))

In [21]:
%%time
cv_scores = cross_val_score(logit, X_train_new, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=5) # hangs with n_jobs > 1, and locally this runs much faster

CPU times: total: 188 ms
Wall time: 14.4 s


In [22]:
cv_scores, cv_scores.mean()

(array([0.87652264, 0.75122947, 0.93062102, 0.97864183, 0.90399707,
        0.93831505, 0.96249405, 0.92731291, 0.9488626 , 0.9404352 ]),
 0.9158431838375949)

In [23]:
logit.fit(X_train_new, y_train)

LogisticRegression(C=1, random_state=17, solver='liblinear')

In [24]:
logit_test_pred2 = logit.predict_proba(X_test_new)[:, 1]

In [25]:
write_to_submission_file(logit_test_pred2, 'subm2.csv') # 0.93843

In [26]:
df_prediction2 = pd.read_csv('subm2.csv')

In [27]:
df_prediction2.head(7)

Unnamed: 0,session_id,target
0,1,3.751524e-05
1,2,3.938543e-08
2,3,7.206124e-08
3,4,6.552724e-09
4,5,1.268885e-05
5,6,1.743057e-05
6,7,0.003862438


In [28]:
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=5, cv=time_split, verbose=1)

In [29]:
%%time
logit_grid_searcher.fit(X_train_new, y_train) # WTF? Locally, it's 3min 30s

Fitting 10 folds for each of 10 candidates, totalling 100 fits
CPU times: total: 26.4 s
Wall time: 2min 47s


GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None),
             estimator=LogisticRegression(C=1, random_state=17,
                                          solver='liblinear'),
             n_jobs=5,
             param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])},
             scoring='roc_auc', verbose=1)

In [30]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.917376908118336, {'C': 0.21544346900318834})

In [31]:
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred3, 'subm3.csv') # 0.94242

In [32]:
df_prediction3 = pd.read_csv('subm3.csv')

In [33]:
df_prediction3.head(7)

Unnamed: 0,session_id,target
0,1,0.0002373741
1,2,4.97947e-07
2,3,1.988728e-06
3,4,9.393411e-07
4,5,0.0001782196
5,6,7.416179e-05
6,7,0.005921728


In [34]:
X_train_new

<253561x50004 sparse matrix of type '<class 'numpy.int64'>'
	with 3633114 stored elements in COOrdinate format>

In [35]:
X_train

<253561x50000 sparse matrix of type '<class 'numpy.int64'>'
	with 3379553 stored elements in Compressed Sparse Row format>

In [36]:
df_X_train = pd.DataFrame.sparse.from_spmatrix(X_train)

In [37]:
df_X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49990,49991,49992,49993,49994,49995,49996,49997,49998,49999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
253557,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
253558,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
253559,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
X_train

<253561x50000 sparse matrix of type '<class 'numpy.int64'>'
	with 3379553 stored elements in Compressed Sparse Row format>

In [39]:
# Import libraries and set desired options
import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack
# !pip install eli5
import eli5
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display_html

In [40]:
def prepare_sparse_features(path_to_train, path_to_test, path_to_site_dict,
                            vectorizer_params):
    times = ['time%s' % i for i in range(1, 11)]
    train_df = pd.read_csv(path_to_train, index_col='session_id', parse_dates=times)
    test_df = pd.read_csv(path_to_test, index_col='session_id', parse_dates=times)

    # Sort the data by time
    train_df = train_df.sort_values(by='time1')
    
    # read site -> id mapping provided by competition organizers 
    with open(path_to_site_dict, 'rb') as f:
        site2id = pickle.load(f)
    # create an inverse id _> site mapping
    id2site = {v:k for (k, v) in site2id.items()}
    # we treat site with id 0 as "unknown"
    id2site[0] = 'unknown'
    
    # Transform data into format which can be fed into TfidfVectorizer
    # This time we prefer to represent sessions with site names, not site ids. 
    # It's less efficient but thus it'll be more convenient to interpret model weights.
    sites = ['site%s' % i for i in range(1, 11)]
    train_sessions = train_df[sites].fillna(0).astype('int').apply(lambda row: 
                                                     ' '.join([id2site[i] for i in row]), axis=1).tolist()
    test_sessions = test_df[sites].fillna(0).astype('int').apply(lambda row: 
                                                     ' '.join([id2site[i] for i in row]), axis=1).tolist()
    # we'll tell TfidfVectorizer that we'd like to split data by whitespaces only 
    # so that it doesn't split by dots (we wouldn't like to have 'mail.google.com' 
    # to be split into 'mail', 'google' and 'com')
    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_sessions)
    X_test = vectorizer.transform(test_sessions)
    y_train = train_df['target'].astype('int').values
    
    # we'll need site visit times for further feature engineering
    train_times, test_times = train_df[times], test_df[times]
    
    return X_train, X_test, y_train, vectorizer, train_times, test_times

In [41]:
%%time
X_train_sites, X_test_sites, y_train, vectorizer, train_times, test_times = prepare_sparse_features(
    path_to_train=os.path.join(PATH_TO_DATA, 'train_sessions.csv'),
    path_to_test=os.path.join(PATH_TO_DATA, 'test_sessions.csv'),
    path_to_site_dict=os.path.join(PATH_TO_DATA, 'site_dic.pkl'),
    vectorizer_params={'ngram_range': (1, 5), 
                       'max_features': 50000,
                       'tokenizer': lambda s: s.split()}
)


NameError: name 'PATH_TO_DATA' is not defined