In [128]:
import pickle
from pathlib import Path
import os

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# Apply seaborn's "whitegrid" style as the global plotting theme.
sns.set_theme(style="whitegrid")

In [129]:
# Directory holding the input files, and the training sessions table read from it.
data_dir = "./data/"
train_sessions_path = os.path.join(data_dir, "train_sessions.csv")
train_df = pd.read_csv(train_sessions_path)

In [130]:
# Parse the ten timestamp columns (time1..time10) into datetimes, one column at a time.
for time_column in (f"time{position}" for position in range(1, 11)):
    parsed_times = pd.to_datetime(train_df[time_column])
    train_df[time_column] = parsed_times

# Order the sessions chronologically by their time1 value (in place).
train_df.sort_values("time1", inplace=True)

In [131]:
# Missing site ids become 0 so that site1..site10 can be cast to integers.
for site_column in (f"site{position}" for position in range(1, 11)):
    filled_ids = train_df[site_column].fillna(0)
    train_df[site_column] = filled_ids.astype("int")

In [133]:
# Load the pickled site -> id mapping and turn it into a lookup table indexed by id.
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted site_dic.pkl.
site_dict_path = os.path.join(data_dir, "site_dic.pkl")
with open(site_dict_path, "rb") as input_file:
    site_dict = pickle.load(input_file)

# Index = mapping values (ids), single "site" column = mapping keys (site names).
website_df = pd.DataFrame(
    {"site": list(site_dict.keys())},
    index=list(site_dict.values()),
)
    

In [67]:
# Count how often each site id occurs across the ten site columns, most frequent first.
site_columns = [f"site{position}" for position in range(1, 11)]
all_visited_ids = train_df[site_columns].stack()
visit_counts = all_visited_ids.value_counts().sort_values(ascending=False)

# Id 0 is the fill value used for missing sites, so keep it out of the ranking.
top_visited_website = visit_counts[visit_counts.index != 0]

In [71]:
# Show only the most visited sites: printing one line per distinct site id
# floods the cell output without adding information.
TOP_SITES_TO_SHOW = 20

for site_id, visit_count in top_visited_website.head(TOP_SITES_TO_SHOW).items():
    # Scalar lookup of the site name for this id in the id-indexed lookup table.
    site_name = website_df.at[site_id, "site"]
    print(f"website_id: {site_id}, count: {visit_count}, website: {site_name}")
    

website_id: 21, count: 123776, wegsite: www.google.fr
website_id: 23, count: 87619, wegsite: www.google.com
website_id: 782, count: 77055, wegsite: annotathon.org
website_id: 22, count: 58258, wegsite: apis.google.com
website_id: 29, count: 54094, wegsite: www.facebook.com
website_id: 167, count: 46405, wegsite: www.bing.com
website_id: 780, count: 43841, wegsite: blast.ncbi.nlm.nih.gov
website_id: 778, count: 38194, wegsite: www.ncbi.nlm.nih.gov
website_id: 52, count: 36085, wegsite: clients1.google.com
website_id: 812, count: 35178, wegsite: mail.google.com
website_id: 80, count: 31391, wegsite: s.youtube.com
website_id: 570, count: 30616, wegsite: plus.google.com
website_id: 55, count: 27812, wegsite: safebrowsing-cache.google.com
website_id: 39, count: 25275, wegsite: accounts.google.com
website_id: 37, count: 23726, wegsite: twitter.com
website_id: 30, count: 23495, wegsite: platform.twitter.com
website_id: 786, count: 23026, wegsite: www.phylogeny.fr
website_id: 35, count: 22470,

In [152]:
# Split the training sessions by label: target == 1 versus target == 0.
is_target_one = train_df["target"] == 1
is_target_zero = train_df["target"] == 0
alice_sessions = train_df[is_target_one]
other_sessions = train_df[is_target_zero]

In [153]:
# (rows, columns) of the target == 1 subset.
alice_sessions.shape

(2297, 22)

In [161]:
time_columns = [f"time{position}" for position in range(1, 11)]

# Replace missing timestamps with the smallest representable Timestamp so they
# can never be picked as the latest visit of a session.
# NOTE(review): this sentinel stays in train_df; any later use of time2..time10
# must treat pd.Timestamp.min as "missing". DataFrame.max(axis=1) skips NaT by
# default, so the fill is presumably not needed for the duration itself -- confirm
# before removing it.
for time_column in time_columns:
    train_df[time_column] = train_df[time_column].fillna(pd.Timestamp.min)

# Session duration in whole seconds: latest timestamp in the session minus time1.
# Computed column-wise instead of with a row-wise DataFrame.apply, and with
# total_seconds() rather than Timedelta.seconds, which is only the seconds
# component and silently drops whole days.
last_visit_time = train_df[time_columns].max(axis=1)
train_df["duration"] = (
    (last_visit_time - train_df["time1"]).dt.total_seconds().astype("int64")
)

In [162]:
# Rebuild both subsets from train_df so they carry its "duration" column.
alice_sessions = train_df.loc[train_df["target"].eq(1)]
other_sessions = train_df.loc[train_df["target"].eq(0)]

In [163]:
# Summary statistics of the duration column for the target == 1 sessions.
alice_sessions["duration"].describe()

count    2297.000000
mean       52.296474
std       153.309014
min         0.000000
25%         4.000000
50%        11.000000
75%        38.000000
max      1763.000000
Name: duration, dtype: float64

In [165]:
# Summary statistics of the duration column for the target == 0 sessions.
other_sessions["duration"].describe()

count    251264.000000
mean        139.282372
std         296.653518
min           0.000000
25%           7.000000
50%          28.000000
75%         114.000000
max        1800.000000
Name: duration, dtype: float64

In [166]:
# Report the (rows, columns) shape of each subset; the labels match the variable names.
print("alice_sessions", alice_sessions.shape)
print("other_sessions", other_sessions.shape)

alise_sessions (2297, 23)
other_sessions (251264, 23)
