## Alice Freeride

Import libraries and set desired options

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display_html
import warnings

In [2]:
warnings.filterwarnings("ignore")
sns.set()
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
sns.set_context('notebook', font_scale=1.5, 
                rc={
                    "figure.figsize": (10, 6),
                    "image.cmap": 'viridis',
                    "axes.titlesize": 18
                }
               )
from matplotlib import rcParams
rcParams['figure.figsize'] = 10, 6

In [3]:
train_df = pd.read_csv('../data/train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv('../data/test_sessions.csv',
                      index_col='session_id')

In [4]:
train_df.shape, test_df.shape

((253561, 21), (82797, 20))

In [5]:
train_df.head(2)

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,2014-02-20 10:02:45,,,,,,,,,...,,,,,,,,,,0
2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,2014-02-22 11:19:51,...,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0


In [6]:
test_df.head(2)

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,29,2014-10-04 11:19:53,35.0,2014-10-04 11:19:53,22.0,2014-10-04 11:19:54,321.0,2014-10-04 11:19:54,23.0,2014-10-04 11:19:54,2211.0,2014-10-04 11:19:54,6730.0,2014-10-04 11:19:54,21.0,2014-10-04 11:19:54,44582.0,2014-10-04 11:20:00,15336.0,2014-10-04 11:20:00
2,782,2014-07-03 11:00:28,782.0,2014-07-03 11:00:53,782.0,2014-07-03 11:00:58,782.0,2014-07-03 11:01:06,782.0,2014-07-03 11:01:09,782.0,2014-07-03 11:01:10,782.0,2014-07-03 11:01:23,782.0,2014-07-03 11:01:29,782.0,2014-07-03 11:01:30,782.0,2014-07-03 11:01:53


In [7]:
data = pd.concat([train_df, test_df])

Transform data into a better format.

In [8]:
sites = ['site%s' % i for i in range(1, 11)]
times = ['time%s' % i for i in range(1, 11)]
data[sites] = data[sites].fillna(0).astype('int').astype('str');
data[times] = data[times].apply(pd.to_datetime);

In [9]:
data.head(4)

Unnamed: 0_level_0,site1,site10,site2,site3,site4,site5,site6,site7,site8,site9,...,time1,time10,time2,time3,time4,time5,time6,time7,time8,time9
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,0,0,0,0,0,0,0,0,0,...,2014-02-20 10:02:45,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,890,1518,941,3847,941,942,3846,3847,3846,1516,...,2014-02-22 11:19:50,2014-02-22 11:20:16,2014-02-22 11:19:50,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15
3,14769,14768,39,14768,14769,37,39,14768,14768,14768,...,2013-12-16 16:40:17,2013-12-16 16:40:24,2013-12-16 16:40:18,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:20,2013-12-16 16:40:21,2013-12-16 16:40:22
4,782,782,782,782,782,782,782,782,782,782,...,2014-03-28 10:52:12,2014-03-28 10:56:42,2014-03-28 10:52:42,2014-03-28 10:53:12,2014-03-28 10:53:42,2014-03-28 10:54:12,2014-03-28 10:54:42,2014-03-28 10:55:12,2014-03-28 10:55:42,2014-03-28 10:56:12


In [11]:
times_min = data[times].min(axis=1)
times_max = data[times].max(axis=1)

Creating a sparse matrix in the form of a bag of words

In [12]:
def join_str(q):
    w = ' '.join(q)
    return w
    
sessions_text = data[sites].apply(join_str, axis=1)
print("Number of sessions: {}".format(sessions_text.shape[0]))

Number of sessions: 336358


In [13]:
vectorizer = TfidfVectorizer()
vec_sessions_data = vectorizer.fit_transform(sessions_text)

In [14]:
print('shape of matrix: {}'.format(vec_sessions_data.shape))

shape of matrix: (336358, 48362)


Adding new features like:
1. session_len
2. session_start_hour
3. session_end_hour
4. day_of_week
5. week
6. day_of_month
7. month
8. unique_sites

In [15]:
def session_length(q):
    time_stamps = q[times].values
    session_length = time_stamps.max() - time_stamps.min()
    return session_length.total_seconds()
    
data['session_len'] = data.apply(session_length, axis=1)
data.head(2)

Unnamed: 0_level_0,site1,site10,site2,site3,site4,site5,site6,site7,site8,site9,...,time10,time2,time3,time4,time5,time6,time7,time8,time9,session_len
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,0,0,0,0,0,0,0,0,0,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,
2,890,1518,941,3847,941,942,3846,3847,3846,1516,...,2014-02-22 11:20:16,2014-02-22 11:19:50,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15,26.0


In [16]:
def session_start_hour(q):
    time_stamps = q[times].values
    session_start_hour = time_stamps.min().hour
    return session_start_hour

data['session_start_hour'] = data.apply(session_start_hour, axis=1)
data.head(2)

Unnamed: 0_level_0,site1,site10,site2,site3,site4,site5,site6,site7,site8,site9,...,time2,time3,time4,time5,time6,time7,time8,time9,session_len,session_start_hour
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,0,0,0,0,0,0,0,0,0,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,
2,890,1518,941,3847,941,942,3846,3847,3846,1516,...,2014-02-22 11:19:50,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15,26.0,11.0


In [17]:
def session_end_hour(q):
    time_stamps = q[times].values
    session_end_hour = time_stamps.max().hour
    return session_end_hour

data['session_end_hour'] = data.apply(session_end_hour, axis=1)
data.head(2)

Unnamed: 0_level_0,site1,site10,site2,site3,site4,site5,site6,site7,site8,site9,...,time3,time4,time5,time6,time7,time8,time9,session_len,session_start_hour,session_end_hour
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,0,0,0,0,0,0,0,0,0,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,
2,890,1518,941,3847,941,942,3846,3847,3846,1516,...,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15,26.0,11.0,11.0


In [18]:
def day_of_week(q):
    time_stamps = q[times].values
    day_of_week = time_stamps.min().weekday()
    return day_of_week

data['dow'] = data.apply(day_of_week, axis=1)
data.head(2)

Unnamed: 0_level_0,site1,site10,site2,site3,site4,site5,site6,site7,site8,site9,...,time4,time5,time6,time7,time8,time9,session_len,session_start_hour,session_end_hour,dow
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,0,0,0,0,0,0,0,0,0,...,NaT,NaT,NaT,NaT,NaT,NaT,,,,
2,890,1518,941,3847,941,942,3846,3847,3846,1516,...,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15,26.0,11.0,11.0,5.0


In [20]:
def week(q):
    time_stamps = q[times].values
    week = time_stamps.min().week
    return week

data['week'] = data.apply(week, axis=1)
data.head(2)

Unnamed: 0_level_0,site1,site10,site2,site3,site4,site5,site6,site7,site8,site9,...,time5,time6,time7,time8,time9,session_len,session_start_hour,session_end_hour,dow,week
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,0,0,0,0,0,0,0,0,0,...,NaT,NaT,NaT,NaT,NaT,,,,,
2,890,1518,941,3847,941,942,3846,3847,3846,1516,...,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15,26.0,11.0,11.0,5.0,8.0


In [21]:
def day_of_month(q):
    time_stamps = q[times].values
    day_of_month = time_stamps.min().day
    return day_of_month

data['dom'] = data.apply(day_of_month, axis=1)
data.head(2)

Unnamed: 0_level_0,site1,site10,site2,site3,site4,site5,site6,site7,site8,site9,...,time6,time7,time8,time9,session_len,session_start_hour,session_end_hour,dow,week,dom
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,0,0,0,0,0,0,0,0,0,...,NaT,NaT,NaT,NaT,,,,,,
2,890,1518,941,3847,941,942,3846,3847,3846,1516,...,2014-02-22 11:19:51,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15,26.0,11.0,11.0,5.0,8.0,22.0


In [22]:
def month(q):
        time_stamps = q[times].values
        month = time_stamps.min().month
        return month
    
data['month'] = data.apply(month, axis=1)
data.head(2)

Unnamed: 0_level_0,site1,site10,site2,site3,site4,site5,site6,site7,site8,site9,...,time7,time8,time9,session_len,session_start_hour,session_end_hour,dow,week,dom,month
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,0,0,0,0,0,0,0,0,0,...,NaT,NaT,NaT,,,,,,,
2,890,1518,941,3847,941,942,3846,3847,3846,1516,...,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15,26.0,11.0,11.0,5.0,8.0,22.0,2.0


In [23]:
def is_weekend(q):
    day_of_week = q['dow']
    
    if (day_of_week >= 5):
        return 1
    return 0

data['is_weekend'] = data.apply(is_weekend, axis=1)
data.head(2)

Unnamed: 0_level_0,site1,site10,site2,site3,site4,site5,site6,site7,site8,site9,...,time8,time9,session_len,session_start_hour,session_end_hour,dow,week,dom,month,is_weekend
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,0,0,0,0,0,0,0,0,0,...,NaT,NaT,,,,,,,,0
2,890,1518,941,3847,941,942,3846,3847,3846,1516,...,2014-02-22 11:19:52,2014-02-22 11:20:15,26.0,11.0,11.0,5.0,8.0,22.0,2.0,1


In [24]:
def unique_sites(q):
    unique_sites = q[sites].values
    unique_sites = set([i for i in unique_sites if int(i) > 0])
    return len(unique_sites)

data['unique_sites'] = data.apply(unique_sites, axis=1)
data.head(2)

Unnamed: 0_level_0,site1,site10,site2,site3,site4,site5,site6,site7,site8,site9,...,time9,session_len,session_start_hour,session_end_hour,dow,week,dom,month,is_weekend,unique_sites
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,0,0,0,0,0,0,0,0,0,...,NaT,,,,,,,,0,1
2,890,1518,941,3847,941,942,3846,3847,3846,1516,...,2014-02-22 11:20:15,26.0,11.0,11.0,5.0,8.0,22.0,2.0,1,7


Remove empty cells if they exist

In [25]:
data['session_len'] = data['session_len'].fillna(round(data['session_len'].mean())).astype('int')
data['session_start_hour'] = data['session_start_hour'].fillna(
    round(data['session_start_hour'].mean())).astype('int')
data['session_end_hour'] = data['session_end_hour'].fillna(round(data['session_end_hour'].mean())).astype('int')
data['dow'] = data['dow'].fillna(round(data['dow'].mean())).astype('int')
data['week'] = data['week'].fillna(round(data['week'].mean())).astype('int')
data['dom'] = data['dom'].fillna(round(data['dom'].mean())).astype('int')
data['month'] = data['month'].fillna(round(data['month'].mean())).astype('int')
data['is_weekend'] = data['is_weekend'].fillna(round(data['is_weekend'].mean())).astype('int')

In [26]:
data.head()

Unnamed: 0_level_0,site1,site10,site2,site3,site4,site5,site6,site7,site8,site9,...,time9,session_len,session_start_hour,session_end_hour,dow,week,dom,month,is_weekend,unique_sites
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,0,0,0,0,0,0,0,0,0,...,NaT,119,12,12,2,22,15,5,0,1
2,890,1518,941,3847,941,942,3846,3847,3846,1516,...,2014-02-22 11:20:15,26,11,11,5,8,22,2,1,7
3,14769,14768,39,14768,14769,37,39,14768,14768,14768,...,2013-12-16 16:40:22,7,16,16,0,51,16,12,0,4
4,782,782,782,782,782,782,782,782,782,782,...,2014-03-28 10:56:12,270,10,10,4,13,28,3,0,1
5,22,178,177,175,178,177,178,175,177,177,...,2014-02-28 10:57:06,246,10,10,4,9,28,2,0,4


Using pd.get_dummies to convert categorical variable into dummy/indicator variables

In [27]:
data = pd.get_dummies(data, columns=['session_start_hour', 'session_end_hour', 'dow', 'dom', 'month'])

day_of_week = data.filter(like='dow').columns
start_hour = data.filter(like='session_start_hour').columns
end_hour = data.filter(like='session_end_hour').columns
day_of_month = data.filter(like='dom').columns
dum_month = ['month_1','month_2','month_3','month_4','month_5','month_6',
              'month_7','month_8','month_9','month_10','month_11','month_12']
print(day_of_week, '\n')
print(start_hour, '\n')
print(end_hour, '\n')
print(day_of_month, '\n')
print(dum_month)

Index(['dow_0', 'dow_1', 'dow_2', 'dow_3', 'dow_4', 'dow_5', 'dow_6'], dtype='object') 

Index(['session_start_hour_7', 'session_start_hour_8', 'session_start_hour_9',
       'session_start_hour_10', 'session_start_hour_11',
       'session_start_hour_12', 'session_start_hour_13',
       'session_start_hour_14', 'session_start_hour_15',
       'session_start_hour_16', 'session_start_hour_17',
       'session_start_hour_18', 'session_start_hour_19',
       'session_start_hour_20', 'session_start_hour_21',
       'session_start_hour_22', 'session_start_hour_23'],
      dtype='object') 

Index(['session_end_hour_7', 'session_end_hour_8', 'session_end_hour_9',
       'session_end_hour_10', 'session_end_hour_11', 'session_end_hour_12',
       'session_end_hour_13', 'session_end_hour_14', 'session_end_hour_15',
       'session_end_hour_16', 'session_end_hour_17', 'session_end_hour_18',
       'session_end_hour_19', 'session_end_hour_20', 'session_end_hour_21',
       'session_end_hour_22', '

In [28]:
data.head(3).T

session_id,1,2,3
site1,718,890,14769
site10,0,1518,14768
site2,0,941,39
site3,0,3847,14768
site4,0,941,14769
site5,0,942,37
site6,0,3846,39
site7,0,3847,14768
site8,0,3846,14768
site9,0,1516,14768


Convert into an array and stack them horizontally

In [29]:
columns = np.hstack(( ['unique_sites', 'session_len', 'is_weekend'], day_of_week, start_hour, end_hour,
                             day_of_month, dum_month ))

In [30]:
scaler = StandardScaler()

In [31]:
data_scaler = scaler.fit_transform(data[columns])

In [32]:
# using compressed sparse row matrix
columns_data = csr_matrix(data_scaler)

print('shape of new scaled matrix: {}'.format(columns_data.shape))

shape of new scaled matrix: (336358, 81)


In [33]:
print('Recall the shape of the vectorized matrix we calculated earlier? \n')
print('shape of matrix: {}'.format(vec_sessions_data.shape))

Recall the shape of the vectorized matrix we calculated earlier? 

shape of matrix: (336358, 48362)


In [34]:
data2 = hstack((vec_sessions_data, columns_data))

print('shape of new hstack matrix: {}'.format(data2.shape))

shape of new hstack matrix: (336358, 48443)


Convert this matrix to Compressed Sparse Column format using matrix.tocsc()

In [35]:
X_train = data2.tocsc()[:train_df.shape[0]]
y_train = train_df['target']
X_test = data2.tocsc()[train_df.shape[0]:]

A helper function for writing predictions to a file.

In [36]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

Predicting with Stochastic Gradient Descent (SGD)

In [37]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, test_size=0.30, random_state=10)

In [38]:
%%time
sdg = SGDClassifier(loss='log', random_state=10, n_jobs=-1)
sdg.fit(X_train1, y_train1)

CPU times: user 666 ms, sys: 39.9 ms, total: 706 ms
Wall time: 751 ms


In [39]:
sdg_pred = sdg.predict_proba(X_test1)[:, 1]

Using accuracy_score

In [38]:
# accuracy_score(y_test1, sdg_pred)
# Classification metrics can't handle a mix of binary and continuous targets use roc_auc_score instead

Using roc_auc_score

In [40]:
roc1 = roc_auc_score(y_test1, sdg_pred)
roc1

0.9635414842914114

Predicting with LogisticRegression

In [41]:
%%time
logit = LogisticRegression(random_state=10, solver='liblinear', n_jobs=-1)
logit.fit(X_train1, y_train1)

CPU times: user 31.7 s, sys: 437 ms, total: 32.2 s
Wall time: 25.2 s


In [42]:
logit_pred = logit.predict_proba(X_test1)[:, 1]

Using roc_auc_score

In [43]:
roc2 = roc_auc_score(y_test1, logit_pred)
roc2

0.9869251900387004

Getting the best parameters via GridSearchCV

In [60]:
%%time
c_values = [3, 5, 7, 10, 13, 15, 17, 20]

logit_grid = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=4, cv=3, verbose=1)

CPU times: user 389 µs, sys: 941 µs, total: 1.33 ms
Wall time: 1.68 ms


In [51]:
logit_grid.fit(X_train1, y_train1)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  4.8min
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed: 10.7min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=10, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'C': array([1.00000e-02, 1.62378e-02, 2.63665e-02, 4.28133e-02, 6.95193e-02,
       1.12884e-01, 1.83298e-01, 2.97635e-01, 4.83293e-01, 7.84760e-01,
       1.27427e+00, 2.06914e+00, 3.35982e+00, 5.45559e+00, 8.85867e+00,
       1.43845e+01, 2.33572e+01, 3.79269e+01, 6.15848e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [52]:
logit_grid.best_score_, logit_grid.best_estimator_.C

(0.9913559011238979, 23.357214690901213)

In [61]:
%%time
logit_grid.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed:  7.9min finished


CPU times: user 1min 22s, sys: 1.95 s, total: 1min 24s
Wall time: 9min 1s


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=10, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'C': [3, 5, 7, 10, 13, 15, 17, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [62]:
logit_grid.best_score_, logit_grid.best_estimator_.C, logit_grid.best_params_

(0.9923353346297403, 20, {'C': 20})

In [63]:
final_model = logit_grid.best_estimator_
logit_grid_pred = logit_grid.best_estimator_.predict_proba(X_test)[:, 1]

Write it to the submission file

In [64]:
write_to_submission_file(logit_grid_pred, 'final4_baseline.csv')