In [1]:
# Import libraries and set desired options
import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack
# !pip install eli5
import eli5
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display_html

In [2]:
# Read the training and test data sets.
# The data directory is defined once and can be overridden with the
# ALICE_DATA_DIR environment variable; the default is the original local
# path, so an existing setup keeps working unchanged.
DATA_DIR = os.environ.get(
    'ALICE_DATA_DIR',
    'D:/Python projects/mlcourse_ai_solutions/alice_catch_me_if_you_can_competition')

# session_id becomes the row index; only time1 is parsed as a datetime here.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train_sessions.csv'),
                       index_col='session_id', parse_dates=['time1'])
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test_sessions.csv'),
                      index_col='session_id', parse_dates=['time1'])

# Sort the data by time (start of the session, time1)
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,,,,,,...,,,,,,,,,,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,,...,,,,,,,,,,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [32]:
# Split the training sessions by label: target == 1 goes to the Alice frame,
# target == 0 to the other-users frame.
is_alice_session = train_df["target"].eq(1)
is_other_session = train_df["target"].eq(0)

train_df_alice = train_df.loc[is_alice_session]
train_df_other_users = train_df.loc[is_other_session]

## Exploring Alice's dataset

In [33]:
# Summary statistics of the numeric columns for the target == 1 sessions.
train_df_alice.describe()

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,target
count,2297.0,2294.0,2287.0,2286.0,2280.0,2273.0,2269.0,2263.0,2262.0,2258.0,2297.0
mean,3119.592947,2999.513078,2951.274158,3072.664042,2912.150439,3035.263968,3176.235346,2930.43217,3089.833333,2981.44287,1.0
std,7120.997108,6644.107233,6628.797591,6825.23371,6648.011922,6765.622736,6953.305236,6723.534647,6873.117594,6660.374884,0.0
min,1.0,3.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,3.0,1.0
25%,76.0,76.0,75.0,76.0,75.75,76.0,76.0,76.0,75.0,76.0,1.0
50%,170.0,252.0,162.0,261.0,143.5,177.0,240.0,143.0,144.5,162.0,1.0
75%,1519.0,1677.0,1522.0,1797.5,1521.25,1523.0,2077.0,1519.0,1569.0,1520.0,1.0
max,27381.0,27387.0,27381.0,27379.0,27389.0,27389.0,27389.0,27388.0,27389.0,27381.0,1.0


In [34]:
# Preview the first 10 target == 1 sessions (rows are already sorted by time1).
train_df_alice.head(10)

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
251175,270,2013-02-12 16:25:10,270.0,2013-02-12 16:25:11,270.0,2013-02-12 16:32:10,21.0,2013-02-12 16:32:11,21.0,2013-02-12 16:32:24,...,2013-02-12 16:32:25,21.0,2013-02-12 16:32:25,7832.0,2013-02-12 16:32:26,30.0,2013-02-12 16:32:27,7832.0,2013-02-12 16:32:27,1
196388,29,2013-02-12 16:32:27,7832.0,2013-02-12 16:32:28,37.0,2013-02-12 16:32:29,7832.0,2013-02-12 16:32:34,7832.0,2013-02-12 16:32:35,...,2013-02-12 16:32:35,7832.0,2013-02-12 16:32:42,29.0,2013-02-12 16:32:42,7832.0,2013-02-12 16:32:51,7832.0,2013-02-12 16:32:53,1
172448,29,2013-02-12 16:32:53,7832.0,2013-02-12 16:33:11,7832.0,2013-02-12 16:33:12,29.0,2013-02-12 16:33:13,37.0,2013-02-12 16:33:15,...,2013-02-12 16:33:24,29.0,2013-02-12 16:33:24,7832.0,2013-02-12 16:33:33,29.0,2013-02-12 16:33:34,270.0,2013-02-12 16:33:46,1
70129,167,2013-02-12 16:33:50,167.0,2013-02-12 16:33:51,1515.0,2013-02-12 16:33:52,167.0,2013-02-12 16:33:52,37.0,2013-02-12 16:33:52,...,2013-02-12 16:33:52,855.0,2013-02-12 16:33:52,1515.0,2013-02-12 16:33:53,855.0,2013-02-12 16:33:53,1514.0,2013-02-12 16:33:53,1
206254,1520,2013-02-12 16:33:55,1522.0,2013-02-12 16:33:56,1522.0,2013-02-12 16:34:01,1515.0,2013-02-12 16:34:12,1515.0,2013-02-12 16:34:13,...,2013-02-12 16:34:24,1514.0,2013-02-12 16:34:24,1515.0,2013-02-12 16:34:24,1520.0,2013-02-12 16:34:25,1521.0,2013-02-12 16:34:25,1
167235,1516,2013-02-12 16:33:55,1515.0,2013-02-12 16:33:55,1514.0,2013-02-12 16:33:55,1518.0,2013-02-12 16:33:55,1521.0,2013-02-12 16:33:55,...,2013-02-12 16:33:55,1519.0,2013-02-12 16:33:55,1524.0,2013-02-12 16:33:55,1517.0,2013-02-12 16:33:55,855.0,2013-02-12 16:33:55,1
54979,1524,2013-02-12 16:34:25,1519.0,2013-02-12 16:34:25,1518.0,2013-02-12 16:34:25,1516.0,2013-02-12 16:34:25,1523.0,2013-02-12 16:34:25,...,2013-02-12 16:34:25,23.0,2013-02-12 16:34:25,392.0,2013-02-12 16:34:25,855.0,2013-02-12 16:34:25,1514.0,2013-02-12 16:34:40,1
77902,1519,2013-02-12 16:34:40,1524.0,2013-02-12 16:34:40,855.0,2013-02-12 16:34:40,1515.0,2013-02-12 16:34:40,1514.0,2013-02-12 16:34:41,...,2013-02-12 16:34:41,392.0,2013-02-12 16:34:41,1519.0,2013-02-12 16:34:42,1514.0,2013-02-12 16:34:51,855.0,2013-02-12 16:34:51,1
203387,855,2013-02-12 16:34:52,855.0,2013-02-12 16:35:02,1520.0,2013-02-12 16:35:02,1514.0,2013-02-12 16:35:02,1515.0,2013-02-12 16:35:02,...,2013-02-12 16:35:08,855.0,2013-02-12 16:35:09,1515.0,2013-02-12 16:35:27,1514.0,2013-02-12 16:35:27,1521.0,2013-02-12 16:35:27,1
104441,1520,2013-02-12 16:35:27,855.0,2013-02-12 16:35:27,1524.0,2013-02-12 16:35:27,1514.0,2013-02-12 16:35:28,1521.0,2013-02-12 16:35:28,...,2013-02-12 16:35:28,1524.0,2013-02-12 16:35:30,1520.0,2013-02-12 16:35:33,855.0,2013-02-12 16:35:48,1514.0,2013-02-12 16:35:48,1


In [43]:
# Column names site1 .. site10, used to drop the timestamp and target columns.
sites = ["site{}".format(position) for position in range(1, 11)]

# Site-only views of the two label groups.
train_df_alice_only_sites = train_df_alice.loc[:, sites]
train_df_other_users_sites = train_df_other_users.loc[:, sites]

In [44]:
# Preview the site-only columns of the target == 1 sessions (up to 60 rows).
train_df_alice_only_sites.head(60)

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
251175,270,270.0,270.0,21.0,21.0,7832.0,21.0,7832.0,30.0,7832.0
196388,29,7832.0,37.0,7832.0,7832.0,29.0,7832.0,29.0,7832.0,7832.0
172448,29,7832.0,7832.0,29.0,37.0,7832.0,29.0,7832.0,29.0,270.0
70129,167,167.0,1515.0,167.0,37.0,1514.0,855.0,1515.0,855.0,1514.0
206254,1520,1522.0,1522.0,1515.0,1515.0,1524.0,1514.0,1515.0,1520.0,1521.0
167235,1516,1515.0,1514.0,1518.0,1521.0,1523.0,1519.0,1524.0,1517.0,855.0
54979,1524,1519.0,1518.0,1516.0,1523.0,1517.0,23.0,392.0,855.0,1514.0
77902,1519,1524.0,855.0,1515.0,1514.0,1524.0,392.0,1519.0,1514.0,855.0
203387,855,855.0,1520.0,1514.0,1515.0,1514.0,855.0,1515.0,1514.0,1521.0
104441,1520,855.0,1524.0,1514.0,1521.0,855.0,1524.0,1520.0,855.0,1514.0


In [45]:
# Most frequent site id in each of the ten site positions for the target == 1 sessions.
train_df_alice_only_sites.mode()

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
0,80,77.0,80.0,77.0,77.0,80.0,77.0,76.0,77.0,77.0


In [46]:
# Number of target == 1 sessions whose first site is 80 (the mode of site1).
train_df_alice_only_sites["site1"].eq(80).sum()

142

In [47]:
# Number of target == 1 sessions whose second site is 77 (the mode of site2).
train_df_alice_only_sites["site2"].eq(77).sum()

130

In [48]:
# Number of target == 1 sessions whose third site is 80 (the mode of site3).
train_df_alice_only_sites["site3"].eq(80).sum()

153

## Exploring the other users' dataset

In [49]:
# Preview the first 10 target == 0 sessions (rows are already sorted by time1).
train_df_other_users.head(10)

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,,,,,,...,,,,,,,,,,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,,...,,,,,,,,,,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0
242171,952,2013-01-12 08:50:22,947.0,2013-01-12 08:50:23,953.0,2013-01-12 08:50:23,946.0,2013-01-12 08:50:23,947.0,2013-01-12 08:50:24,...,2013-01-12 08:50:24,953.0,2013-01-12 08:50:24,955.0,2013-01-12 08:50:24,946.0,2013-01-12 08:50:25,947.0,2013-01-12 08:50:25,0
57157,953,2013-01-12 08:50:25,947.0,2013-01-12 08:50:26,946.0,2013-01-12 08:50:26,953.0,2013-01-12 08:50:26,955.0,2013-01-12 08:50:26,...,2013-01-12 08:50:27,953.0,2013-01-12 08:50:27,946.0,2013-01-12 08:50:27,953.0,2013-01-12 08:50:28,1033.0,2013-01-12 08:50:28,0
240201,946,2013-01-12 08:50:28,947.0,2013-01-12 08:50:28,954.0,2013-01-12 08:50:28,953.0,2013-01-12 08:50:29,946.0,2013-01-12 08:50:29,...,2013-01-12 08:50:29,946.0,2013-01-12 08:50:30,956.0,2013-01-12 08:50:30,957.0,2013-01-12 08:50:31,956.0,2013-01-12 08:50:31,0
210686,946,2013-01-12 08:50:31,956.0,2013-01-12 08:50:32,946.0,2013-01-12 08:50:32,946.0,2013-01-12 08:50:33,955.0,2013-01-12 08:50:33,...,2013-01-12 08:50:33,946.0,2013-01-12 08:50:34,946.0,2013-01-12 08:50:35,946.0,2013-01-12 08:50:36,948.0,2013-01-12 08:50:36,0
98804,948,2013-01-12 08:50:37,946.0,2013-01-12 08:50:37,948.0,2013-01-12 08:50:38,784.0,2013-01-12 08:50:49,49.0,2013-01-12 08:50:59,...,2013-01-12 08:51:03,812.0,2013-01-12 08:51:03,982.0,2013-01-12 08:51:03,52.0,2013-01-12 08:51:03,52.0,2013-01-12 08:51:04,0


In [50]:
# Preview the site-only columns of the target == 0 sessions (up to 30 rows).
train_df_other_users_sites.head(30)

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55.0,,,,,,,,
54843,56,55.0,56.0,55.0,,,,,,
77292,946,946.0,951.0,946.0,946.0,945.0,948.0,784.0,949.0,946.0
114021,945,948.0,949.0,948.0,945.0,946.0,947.0,945.0,946.0,946.0
146670,947,950.0,948.0,947.0,950.0,952.0,946.0,951.0,946.0,947.0
242171,952,947.0,953.0,946.0,947.0,946.0,953.0,955.0,946.0,947.0
57157,953,947.0,946.0,953.0,955.0,947.0,953.0,946.0,953.0,1033.0
240201,946,947.0,954.0,953.0,946.0,954.0,946.0,956.0,957.0,956.0
210686,946,956.0,946.0,946.0,955.0,954.0,946.0,946.0,946.0,948.0
98804,948,946.0,948.0,784.0,49.0,53.0,812.0,982.0,52.0,52.0


In [51]:
# Number of target == 0 sessions whose first site is 80, for comparison with
# the same count on the target == 1 sessions.
train_df_other_users_sites["site1"].eq(80).sum()

3000

In [52]:
# Number of target == 0 sessions whose second site is 77, for comparison with
# the same count on the target == 1 sessions.
train_df_other_users_sites["site2"].eq(77).sum()

1571

In [53]:
# Number of target == 0 sessions whose third site is 80, for comparison with
# the same count on the target == 1 sessions.
train_df_other_users_sites["site3"].eq(80).sum()

3042