In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the training sessions from CSV and preview the first five rows.
df_train = pd.read_csv(filepath_or_buffer='./dataset/amex/train.csv')
df_train.head(n=5)

Unnamed: 0,session_id,DateTime,user_id,product,campaign_id,webpage_id,product_category_1,product_category_2,user_group_id,gender,age_level,user_depth,city_development_index,var_1,is_click
0,140690,2017-07-02 00:00,858557,C,359520,13787,4,,10.0,Female,4.0,3.0,3.0,0,0
1,333291,2017-07-02 00:00,243253,C,105960,11085,5,,8.0,Female,2.0,2.0,,0,0
2,129781,2017-07-02 00:00,243253,C,359520,13787,4,,8.0,Female,2.0,2.0,,0,0
3,464848,2017-07-02 00:00,1097446,I,359520,13787,3,,3.0,Male,3.0,3.0,2.0,1,0
4,90569,2017-07-02 00:01,663656,C,405490,60305,3,,2.0,Male,2.0,3.0,2.0,1,0


In [3]:
# List the dtype pandas inferred for each training column.
df_train.dtypes

session_id                  int64
DateTime                   object
user_id                     int64
product                    object
campaign_id                 int64
webpage_id                  int64
product_category_1          int64
product_category_2        float64
user_group_id             float64
gender                     object
age_level                 float64
user_depth                float64
city_development_index    float64
var_1                       int64
is_click                    int64
dtype: object

### Preprocessing history dataset

In [4]:
# Load the historical user log from CSV and preview the first five rows.
df_history = pd.read_csv(filepath_or_buffer='./dataset/amex/historical_user_logs.csv')
df_history.head(n=5)

Unnamed: 0,DateTime,user_id,product,action
0,2017-05-28 15:44,704,B,view
1,2017-05-29 07:08,499679,F,view
2,2017-05-29 07:10,499679,G,view
3,2017-05-29 07:10,499679,G,view
4,2017-05-29 07:10,499679,G,view


In [5]:
# (rows, columns) of the raw history log.
df_history.shape

(24287534, 4)

In [6]:
# Flag every history row with viewed = 1 so that summing this column per
# (user_id, product) later gives the number of logged rows for that pair.
# NOTE(review): this counts rows of every action type, not only 'view' —
# confirm that is the intended definition of "viewed".
df_history['viewed'] = 1

In [7]:
# Mark rows whose action is "interest" with interested = 1; every other row
# is left as NaN in the newly created column.
df_history.loc[df_history['action'].eq("interest"), 'interested'] = 1

In [8]:
# Preview the history log with the two new flag columns.
df_history.head()

Unnamed: 0,DateTime,user_id,product,action,viewed,interested
0,2017-05-28 15:44,704,B,view,1,
1,2017-05-29 07:08,499679,F,view,1,
2,2017-05-29 07:10,499679,G,view,1,
3,2017-05-29 07:10,499679,G,view,1,
4,2017-05-29 07:10,499679,G,view,1,


In [9]:
# Count the rows whose 'interested' flag is still missing. The vectorised
# Series.sum avoids pushing ~24M elements one by one through Python's
# built-in sum().
df_history['interested'].isnull().sum()

23854100

In [10]:
# Replace the missing 'interested' flags with 0 and store the result back on
# the frame. fillna returns a new Series, so without the assignment the NaNs
# would remain in df_history.
df_history['interested'] = df_history['interested'].fillna(0)
df_history['interested']

0           0.0
1           0.0
2           0.0
3           0.0
4           0.0
5           0.0
6           0.0
7           0.0
8           0.0
9           0.0
10          0.0
11          0.0
12          0.0
13          0.0
14          0.0
15          0.0
16          0.0
17          0.0
18          0.0
19          0.0
20          0.0
21          0.0
22          0.0
23          0.0
24          0.0
25          0.0
26          0.0
27          0.0
28          0.0
29          0.0
           ... 
24287504    0.0
24287505    0.0
24287506    0.0
24287507    0.0
24287508    0.0
24287509    0.0
24287510    0.0
24287511    0.0
24287512    0.0
24287513    0.0
24287514    0.0
24287515    0.0
24287516    0.0
24287517    0.0
24287518    0.0
24287519    0.0
24287520    0.0
24287521    0.0
24287522    0.0
24287523    0.0
24287524    0.0
24287525    0.0
24287526    0.0
24287527    0.0
24287528    0.0
24287529    0.0
24287530    0.0
24287531    0.0
24287532    0.0
24287533    0.0
Name: interested, Length

In [11]:
# Collapse the history log to one row per (user_id, product) pair, summing
# the 'interested' and 'viewed' flags. The list-valued spec produces
# two-level column labels.
df_his = (
    df_history
    .groupby(by=['user_id', 'product'], as_index=False)
    .agg({'interested': ['sum'], 'viewed': ['sum']})
)

In [12]:
# Inspect the aggregated frame; its column labels are two-level at this point.
df_his.head()

Unnamed: 0_level_0,user_id,product,interested,viewed
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,sum
0,4,A,0.0,3
1,4,B,0.0,1
2,4,H,0.0,1
3,4,I,0.0,2
4,19,A,0.0,14


In [13]:
# Replace the two-level column labels produced by the list-valued agg spec
# with flat names, keeping the same column order.
df_his.columns = ['user_id', 'product', 'interested', 'viewed']

In [14]:
# Confirm the column labels are now flat.
df_his.head()

Unnamed: 0,user_id,product,interested,viewed
0,4,A,0.0,3
1,4,B,0.0,1
2,4,H,0.0,1
3,4,I,0.0,2
4,19,A,0.0,14


### Merge history and training set

In [15]:
# Re-display the training rows for reference before merging.
df_train.head()

Unnamed: 0,session_id,DateTime,user_id,product,campaign_id,webpage_id,product_category_1,product_category_2,user_group_id,gender,age_level,user_depth,city_development_index,var_1,is_click
0,140690,2017-07-02 00:00,858557,C,359520,13787,4,,10.0,Female,4.0,3.0,3.0,0,0
1,333291,2017-07-02 00:00,243253,C,105960,11085,5,,8.0,Female,2.0,2.0,,0,0
2,129781,2017-07-02 00:00,243253,C,359520,13787,4,,8.0,Female,2.0,2.0,,0,0
3,464848,2017-07-02 00:00,1097446,I,359520,13787,3,,3.0,Male,3.0,3.0,2.0,1,0
4,90569,2017-07-02 00:01,663656,C,405490,60305,3,,2.0,Male,2.0,3.0,2.0,1,0


In [16]:
# Re-display the per-(user_id, product) history aggregates that will be
# joined onto the training set.
df_his.head()

Unnamed: 0,user_id,product,interested,viewed
0,4,A,0.0,3
1,4,B,0.0,1
2,4,H,0.0,1
3,4,I,0.0,2
4,19,A,0.0,14


In [17]:
# Spot-check: training rows belonging to user 858557.
df_train[df_train['user_id'] == 858557]

Unnamed: 0,session_id,DateTime,user_id,product,campaign_id,webpage_id,product_category_1,product_category_2,user_group_id,gender,age_level,user_depth,city_development_index,var_1,is_click
0,140690,2017-07-02 00:00,858557,C,359520,13787,4,,10.0,Female,4.0,3.0,3.0,0,0
389894,252642,2017-07-06 22:09,858557,I,396664,51181,1,,10.0,Female,4.0,3.0,3.0,0,0


In [18]:
# Spot-check: aggregated history rows for the same user 858557.
df_his[df_his['user_id'] == 858557]

Unnamed: 0,user_id,product,interested,viewed
801281,858557,B,0.0,80
801282,858557,C,0.0,5
801283,858557,D,0.0,25
801284,858557,E,0.0,436
801285,858557,F,0.0,40
801286,858557,G,0.0,10
801287,858557,H,0.0,25
801288,858557,I,0.0,6


In [19]:
# Left-join the per-(user_id, product) history aggregates onto the training
# sessions; sessions with no matching history get NaN in the joined columns.
df_training_set = df_train.merge(df_his, how='left', on=['user_id', 'product'])

In [20]:
# Preview only the first ten merged rows instead of rendering the whole
# frame, which bloats the notebook output.
df_training_set.head(10)

Unnamed: 0,session_id,DateTime,user_id,product,campaign_id,webpage_id,product_category_1,product_category_2,user_group_id,gender,age_level,user_depth,city_development_index,var_1,is_click,interested,viewed
0,140690,2017-07-02 00:00,858557,C,359520,13787,4,,10.0,Female,4.0,3.0,3.0,0,0,0.0,5.0
1,333291,2017-07-02 00:00,243253,C,105960,11085,5,,8.0,Female,2.0,2.0,,0,0,0.0,11.0
2,129781,2017-07-02 00:00,243253,C,359520,13787,4,,8.0,Female,2.0,2.0,,0,0,0.0,11.0
3,464848,2017-07-02 00:00,1097446,I,359520,13787,3,,3.0,Male,3.0,3.0,2.0,1,0,1.0,9.0
4,90569,2017-07-02 00:01,663656,C,405490,60305,3,,2.0,Male,2.0,3.0,2.0,1,0,0.0,3.0
5,151475,2017-07-02 00:01,509591,I,359520,13787,2,,1.0,Male,1.0,3.0,,0,0,0.0,23.0
6,17583,2017-07-02 00:01,1091463,F,405490,60305,3,,9.0,Female,3.0,3.0,4.0,0,0,1.0,22.0
7,461128,2017-07-02 00:01,469098,C,360936,13787,3,,4.0,Male,4.0,3.0,4.0,0,0,,
8,390699,2017-07-02 00:02,611906,H,105960,11085,5,270915.0,,,,,,0,0,2.0,12.0
9,353607,2017-07-02 00:02,418107,B,360936,13787,2,,4.0,Male,4.0,3.0,4.0,0,0,4.0,13.0


### Merge test set and history set

In [21]:
# Load the test sessions from CSV and preview the first five rows.
df_test = pd.read_csv(filepath_or_buffer='./dataset/amex/test.csv')
df_test.head(n=5)

Unnamed: 0,session_id,DateTime,user_id,product,campaign_id,webpage_id,product_category_1,product_category_2,user_group_id,gender,age_level,user_depth,city_development_index,var_1
0,411705,2017-07-08 00:00,732573,J,404347,53587,1,,5.0,Male,5.0,3.0,,0
1,208263,2017-07-08 00:00,172910,I,118601,28529,3,82527.0,,,,,,1
2,239450,2017-07-08 00:00,172910,I,118601,28529,4,82527.0,,,,,,1
3,547761,2017-07-08 00:00,557318,G,118601,28529,5,82527.0,1.0,Male,1.0,3.0,1.0,0
4,574275,2017-07-08 00:00,923896,H,118601,28529,5,82527.0,9.0,Female,3.0,1.0,,1


In [22]:
# Left-join the per-(user_id, product) history aggregates onto the test
# sessions, then preview the first five merged rows.
df_testing_set = df_test.merge(df_his, how='left', on=['user_id', 'product'])
df_testing_set.head(n=5)

Unnamed: 0,session_id,DateTime,user_id,product,campaign_id,webpage_id,product_category_1,product_category_2,user_group_id,gender,age_level,user_depth,city_development_index,var_1,interested,viewed
0,411705,2017-07-08 00:00,732573,J,404347,53587,1,,5.0,Male,5.0,3.0,,0,,
1,208263,2017-07-08 00:00,172910,I,118601,28529,3,82527.0,,,,,,1,1.0,4.0
2,239450,2017-07-08 00:00,172910,I,118601,28529,4,82527.0,,,,,,1,1.0,4.0
3,547761,2017-07-08 00:00,557318,G,118601,28529,5,82527.0,1.0,Male,1.0,3.0,1.0,0,,
4,574275,2017-07-08 00:00,923896,H,118601,28529,5,82527.0,9.0,Female,3.0,1.0,,1,,


## Gaurav, you need to deal with the NaN values in the dataset before you feed the data into your machine learning model. Also, you need to change the datatype of some columns: the `_id` columns should be strings, not numbers, and gender should be encoded as numbers, e.g. 0 = male, 1 = female, 2 = not specified.