In [1]:
# Import libraries and set desired options
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

## 1. Data Downloading and Transformation

In [2]:
# Read the training and test data sets, change paths if needed
train_df = pd.read_csv('../data/Catch Me If You Can/train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv('../data/Catch Me If You Can/test_sessions.csv',
                      index_col='session_id')

# Convert time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sort the data by time
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


There are some empty values in the table, it means that some sessions contain less than ten websites. Replace empty values with 0 and change columns types to integer. Also load the websites dictionary and check how it looks like:

In [3]:
# Change site1, ..., site10 columns type to integer and fill NA-values with zeros
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype(np.uint16)
test_df[sites] = test_df[sites].fillna(0).astype(np.uint16)

# Load websites dictionary
with open(r"../data/Catch Me If You Can/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# Create dataframe for the dictionary
sites_dict = pd.DataFrame(list(site_dict.keys()), index=list(site_dict.values()), columns=['site'])
print(u'Websites total:', sites_dict.shape[0])
sites_dict.head()

Websites total: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


### 4.1. What are the dimensions of the training and test sets (in exactly this order)?

In [6]:
train_df.shape, test_df.shape

((253561, 21), (82797, 20))

### Answer(4.1): (253561, 21) and (82797, 20) 

## 2. Brief Exploratory Data Analysis

Before we start training models, we have to perform Exploratory Data Analysis (EDA). Today, we are going to perform a shorter version, but we will use other techniques as we move forward. Let's check which websites in the training data set are the most visited. As you can see, they are Google services and a bioinformatics website (a website with 'zero'-index is our missed values, just ignore it):


In [14]:
train_df[sites].head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [22]:
# Top websites in the training data set
top_sites = pd.Series(train_df[sites].values.flatten()
                     ).value_counts().sort_values(ascending=False).head(5)
print(top_sites)
sites_dict.loc[top_sites.drop(0).index]

21     123776
0      122730
23      87619
782     77055
22      58258
dtype: int64


Unnamed: 0,site
21,www.google.fr
23,www.google.com
782,annotathon.org
22,apis.google.com


### 4.2. What kind of websites does Alice visit the most?

In [71]:
Alice_sessions = train_df[train_df['target']==1]

In [72]:
Alice_sessions[sites].head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
251175,270,270,270,21,21,7832,21,7832,30,7832
196388,29,7832,37,7832,7832,29,7832,29,7832,7832
172448,29,7832,7832,29,37,7832,29,7832,29,270
70129,167,167,1515,167,37,1514,855,1515,855,1514
206254,1520,1522,1522,1515,1515,1524,1514,1515,1520,1521


In [70]:
# Top websites in the training data set
top_sites_Alice = pd.Series(Alice_sessions[sites].values.flatten()
                     ).value_counts().sort_values(ascending=False).head(5)
print(top_sites_Alice)
sites_dict.loc[top_sites_Alice.index]

77    1382
80    1354
76    1307
29     897
21     857
dtype: int64


Unnamed: 0,site
77,i1.ytimg.com
80,s.youtube.com
76,www.youtube.com
29,www.facebook.com
21,www.google.fr


### Answer(4.2): videohostings 

Now let us look at the timestamps and try to characterize sessions as timeframes:

In [69]:
# Create a separate dataframe where we will work with timestamps
time_df = pd.DataFrame(index=train_df.index)
time_df['target'] = train_df['target']

# Find sessions' starting and ending
time_df['min'] = train_df[times].min(axis=1)
time_df['max'] = train_df[times].max(axis=1)

# Calculate sessions' duration in seconds
time_df['seconds'] = (time_df['max'] - time_df['min']) / np.timedelta64(1, 's')

time_df.head()

Unnamed: 0_level_0,target,min,max,seconds
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
21669,0,2013-01-12 08:05:57,2013-01-12 08:05:57,0.0
54843,0,2013-01-12 08:37:23,2013-01-12 09:07:09,1786.0
77292,0,2013-01-12 08:50:13,2013-01-12 08:50:17,4.0
114021,0,2013-01-12 08:50:17,2013-01-12 08:50:20,3.0
146670,0,2013-01-12 08:50:20,2013-01-12 08:50:22,2.0


In order to perform the next task, generate descriptive statistics as you did in the first assignment.

### 4.3. Select all correct statements:

on average, Alice's session is shorter than that of other users?
**-True**

In [66]:
#for Alice
time_df[time_df['target']==1]['seconds'].mean()

52.29647366129734

In [67]:
#for another
time_df[time_df['target']==0]['seconds'].mean()

139.28237232552215

more than 1% of all sessions in the dataset belong to Alice?
**-False**

In [68]:
procents = 100*(time_df[time_df['target']==1]['target'].count()/time_df.shape[0])
print('Alice sessions of all: ',procents, '%')

Alice sessions of all:  0.905896411514389 %


minimum and maximum durations of Alice's and other users' sessions are approximately the same?
**-True**

In [81]:
#for Alice
min_duration = time_df[time_df['target']==1]['seconds'].min()
max_duration = time_df[time_df['target']==1]['seconds'].max()
print('minimum duration for Alice:%ds' % min_duration,'\nmaximum duration for Alice:%ds' % max_duration)

minimum duration for Alice:0s 
maximum duration for Alice:1763s


In [83]:
#for another
min_duration_an = time_df[time_df['target']==0]['seconds'].min()
max_duration_an = time_df[time_df['target']==0]['seconds'].max()
print('minimum duration for another:%ds' % min_duration_an,'\nmaximum duration for another:%ds' % max_duration_an)

minimum duration for another:0s 
maximum duration for another:1800s


variation about the mean session duration for all users (including Alice) is approximately the same?
**-ХЗ**

In [93]:
#time_df[time_df['target']==0].describe()

In [92]:
#time_df.describe()

less than a quarter of Alice's sessions are greater than or equal to 40 seconds?
**-True**

In [84]:
time_df[time_df['target']==1].describe()

Unnamed: 0,target,seconds
count,2297.0,2297.0
mean,1.0,52.296474
std,0.0,153.309014
min,1.0,0.0
25%,1.0,4.0
50%,1.0,11.0
75%,1.0,38.0
max,1.0,1763.0


In [131]:
Alice_time_df = time_df[time_df['target']==1]

In [130]:
Quant = Alice_time_df['seconds'].quantile(.75)
Quant1 = Alice_time_df['seconds'].quantile(1)

Alice_time_df[Alice_time_df['seconds'].between(Quant, Quant1)][['seconds']].min()

seconds    38.0
dtype: float64

In order to train our first model, we need to prepare the data. First of all, exclude the target variable from the training set. Now both training and test sets have the same number of columns, therefore aggregate them into one dataframe. Thus, all transformations will be performed simultaneously on both training and test data sets.

In [133]:
# Our target variable
y_train = train_df['target']

# United dataframe of the initial data 
full_df = pd.concat([train_df.drop('target', axis=1), test_df])

# Index to split the training and test data sets
idx_split = train_df.shape[0]

In [134]:
# Dataframe with indices of visited websites in session
full_sites = full_df[sites]
full_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [168]:
range(0, sites_flatten.shape[0]  + 10, 10)

range(0, 3363590, 10)

In [184]:
sites_flatten

array([  56,   55,    0, ..., 1098, 1098, 1098], dtype=uint16)

In [186]:
# sequence of indices
sites_flatten = full_sites.values.flatten()

# and the matrix we are looking for 
# (make sure you understand which of the `csr_matrix` constructors is used here)
# a further toy example will help you with it
full_sites_sparse = csr_matrix(([1] * sites_flatten.shape[0],
                                sites_flatten,
                                range(0, sites_flatten.shape[0]  + 10, 10)))[:, 1:]

In [172]:
# How much memory does a sparse matrix occupy?
print('{0} elements * {1} bytes = {2} bytes'.format(full_sites_sparse.count_nonzero(), 8, 
                                                    full_sites_sparse.count_nonzero() * 8))
# Or just like this:
print('sparse_matrix_size = {0} bytes'.format(full_sites_sparse.data.nbytes))

1866898 elements * 8 bytes = 14935184 bytes
sparse_matrix_size = 7467592 bytes


In [287]:
# data, create the list of ones, length of which equal to the number of elements in the initial dataframe (9)
# By summing the number of ones in the cell, we get the frequency,
# number of visits to a particular site per session
data = [1] * 9

# To do this, you need to correctly distribute the ones in cells
# Indices - website ids, i.e. columns of a new matrix. We will sum ones up grouping them by sessions (ids)
indices = [1, 0, 0, 1, 3, 1, 2, 3, 4]

# Indices for the division into rows (sessions)
# For example, line 0 is the elements between the indices [0; 3) - the rightmost value is not included
# Line 1 is the elements between the indices [3; 6)
# Line 2 is the elements between the indices [6; 9) 
indptr = [0, 3, 6, 9]

# Aggregate these three variables into a tuple and compose a matrix
# To display this matrix on the screen transform it into the usual "dense" matrix
small_example = csr_matrix((data, indices, indptr))[:, 1:]

In [289]:
small_example.todense()

matrix([[1, 0, 0, 0],
        [2, 0, 1, 0],
        [0, 1, 1, 1]], dtype=int32)

In [288]:
small_example.count_nonzero()

6

In [285]:
small_example.shape

(3, 4)

### 4.4. What is the sparseness of the matrix in our small example?

In [291]:
((small_example.shape[0]*small_example.shape[1])-small_example.count_nonzero())\
            /(small_example.shape[0]*small_example.shape[1])

0.5

### Answer(4.4): 0.5