In [66]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from datetime import date, time, datetime

In [67]:
train_users = pd.read_csv("train_users.csv")
test_users = pd.read_csv("test_users.csv")
sessions = pd.read_csv("sessions.csv")

In [68]:
train_users.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [69]:
train_users.drop(['country_destination'], axis = 1)
#Concatenating train and test data for EDA
df_all = pd.concat((train_users, test_users), axis = 0, ignore_index= True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [70]:
dates_list = []
year_list = []
month_list = []

for i in df_all["date_account_created"]:
    date_time_obj = datetime.strptime(i, '%Y-%m-%d')
    year = date_time_obj.date().year
    month = date_time_obj.date().month
    dates_list.append(date_time_obj)
    year_list.append(year)
    month_list.append(month)

df_all["datetime_account_created"] = dates_list
df_all["year_account_created"] = year_list
df_all["month_account_created"] = month_list

In [71]:
dates_list = []
year_list = []
month_list = []

for i in df_all["timestamp_first_active"]:
    time_stamp_obj = pd.to_datetime(i , format='%Y%m%d%H%M%S')
    year = time_stamp_obj.date().year
    month = time_stamp_obj.date().month
    dates_list.append(time_stamp_obj)
    year_list.append(year)
    month_list.append(month)
    
df_all["datetime_first_active"] = dates_list
df_all["year_first_active"] = year_list
df_all["month_first_active"] = month_list

In [72]:
df_all["created_after_active"] = df_all["datetime_account_created"] - df_all["datetime_first_active"]

In [73]:
df_all['age'].quantile(0.5)

33.0

In [74]:
df_all['age'] = np.where(df_all['age'] > 110, 34, df_all['age'])
df_all['age'].describe()

count    158681.000000
mean         36.838115
std          13.766690
min           1.000000
25%          28.000000
50%          33.000000
75%          42.000000
max         110.000000
Name: age, dtype: float64

In [75]:
by_signup_device = df_all.groupby(['signup_method', 'first_device_type'])

In [76]:
def impute_median(series):
    return series.fillna(series.median())

In [77]:
df_all.age = by_signup_device['age'].transform(impute_median)

In [78]:
df_all['first_affiliate_tracked'].value_counts()

untracked        143181
linked            62064
omg               54859
tracked-other      6655
product            2353
marketing           281
local ops            69
Name: first_affiliate_tracked, dtype: int64

In [79]:
tracked = []

for i in df_all['first_affiliate_tracked']:
    if i == "untracked" or i == "":
        isTracked = 0
    else:
        isTracked = 1
    tracked.append(isTracked)

df_all['is_first_affiliate_tracked'] = tracked

In [80]:
df_all.isnull().sum()

affiliate_channel                  0
affiliate_provider                 0
age                                1
country_destination            62096
date_account_created               0
date_first_booking            186639
first_affiliate_tracked         6085
first_browser                      0
first_device_type                  0
gender                             0
id                                 0
language                           0
signup_app                         0
signup_flow                        0
signup_method                      0
timestamp_first_active             0
datetime_account_created           0
year_account_created               0
month_account_created              0
datetime_first_active              0
year_first_active                  0
month_first_active                 0
created_after_active               0
is_first_affiliate_tracked         0
dtype: int64

In [81]:
df_all['created_after_active'].describe()

count                      275547
mean     -1 days +15:36:42.825191
std       10 days 12:30:22.643182
min             -1 days +00:00:01
25%             -1 days +03:53:56
50%             -1 days +08:24:51
75%             -1 days +19:22:57
max            1455 days 09:53:18
Name: created_after_active, dtype: object

In [82]:
df_all = df_all.rename(columns={"id":"user_id"})

In [83]:
sessions.user_id.value_counts()

vcmr2jh5ix    2085
yu5bdalz2b    1811
s5ez13snz0    1685
bp4mwnlbxs    1526
jastc45uok    1518
tu41qoa28z    1410
wrccc4m1uw    1390
k3w4kskp4o    1382
9zh7tdb4jr    1250
pwgji1tuko    1196
fm89u29kv7    1185
dzhwxl5uie    1184
a30puilbmc    1182
ytplei6boz    1182
fsa2gw8kk4    1179
337zc4etfh    1169
jrqykh9y8x    1169
aqaf70t5o1    1138
g1nf1o6wku    1130
b8meb8ctyi    1081
kquztvwe0z    1075
fhs3afctov    1043
mpzaszcsz1    1010
nq0c3g76js     998
4cgolfx94c     976
66kf2va3x5     974
edkc9jwn5x     968
3jg5g7my08     950
5ilh34e5h9     941
jvo4exq16y     932
              ... 
ms6iyueskd       1
uy379oaywr       1
q6mj8zlbb4       1
zp5l4p84en       1
rdarpa2b1q       1
qf9fvtrtpw       1
po9691e1ro       1
lth8r66qj2       1
5y8afixyb4       1
k19cmjvcj8       1
mx6bwdccw8       1
d2ej924mim       1
i9whrewyzu       1
bfd9g020f1       1
b36nw4lraq       1
jti8fyfhyl       1
3zwj91oehu       1
jxdfl1ivgr       1
h4agxsm49o       1
hgqbictsnb       1
9bjx1squch       1
wdpax4jbsk  

In [84]:
seconds = sessions.groupby('user_id', as_index=False).agg({"secs_elapsed": "sum"})
seconds.head()

Unnamed: 0,user_id,secs_elapsed
0,0035hobuyj,5724670.0
1,007gj7kqdk,370406.0
2,009a40t3dk,2079299.0
3,00allnceb8,0.0
4,00fhpdik5t,221087.0


In [85]:
df_all = pd.merge(df_all, seconds, on="user_id", how="left")

In [86]:
df_all['secs_elapsed'] = df_all['secs_elapsed'].fillna(0)

In [87]:
df_all['day_account_created'] = df_all['datetime_account_created'].dt.day_name()
df_all['day_first_active'] = df_all['datetime_first_active'].dt.day_name()

In [88]:
day_acct_created = pd.get_dummies(df_all['day_account_created'])
day_first_active = pd.get_dummies(df_all['day_first_active'])

In [89]:
day_acct_created = day_acct_created.rename(columns={"Monday": "acct_created_Mon", "Tuesday": "acct_created_Tues", "Wednesday": "acct_created_Wed",
                                "Thursday": "acct_created_Thurs", "Friday": "acct_created_Fri", "Saturday": "acct_created_Sat", 
                                 "Sunday": "acct_created_Sun"})
day_first_active = day_first_active.rename(columns={"Monday": "first_active_Mon", "Tuesday": "first_active_Tues", "Wednesday": "first_active_Wed",
                                "Thursday": "first_active_Thurs", "Friday": "first_active_Fri", "Saturday": "first_active_Sat", 
                                 "Sunday": "first_active_Sun"})

In [90]:
df_all = pd.concat([df_all.reset_index(drop=True),day_acct_created.reset_index(drop=True)], axis=1)
df_all = pd.concat([df_all.reset_index(drop=True),day_first_active.reset_index(drop=True)], axis=1)

In [91]:
df_all.head()

Unnamed: 0,affiliate_channel,affiliate_provider,age,country_destination,date_account_created,date_first_booking,first_affiliate_tracked,first_browser,first_device_type,gender,...,acct_created_Thurs,acct_created_Tues,acct_created_Wed,first_active_Fri,first_active_Mon,first_active_Sat,first_active_Sun,first_active_Thurs,first_active_Tues,first_active_Wed
0,direct,direct,33.0,NDF,2010-06-28,,untracked,Chrome,Mac Desktop,-unknown-,...,0,0,0,0,0,0,0,1,0,0
1,seo,google,38.0,NDF,2011-05-25,,untracked,Chrome,Mac Desktop,MALE,...,0,0,1,0,0,1,0,0,0,0
2,direct,direct,56.0,US,2010-09-28,2010-08-02,untracked,IE,Windows Desktop,FEMALE,...,0,1,0,0,0,0,0,0,1,0
3,direct,direct,42.0,other,2011-12-05,2012-09-08,untracked,Firefox,Mac Desktop,FEMALE,...,0,0,0,0,0,1,0,0,0,0
4,direct,direct,41.0,US,2010-09-14,2010-02-18,untracked,Chrome,Mac Desktop,-unknown-,...,0,1,0,0,0,0,0,0,1,0
