In [50]:
import pandas as pd
import numpy as np
import zipfile
import os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import re
from wordcloud import WordCloud
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from datetime import datetime, date
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix
from scipy.sparse import hstack
from scipy.sparse import vstack
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder
import pickle
import joblib

In [51]:
#train csv

train_df = pd.read_csv('../data/train_users_2.csv')

print(train_df.shape)

print(train_df.columns)

train_df.head()

(213451, 16)
Index(['id', 'date_account_created', 'timestamp_first_active',
       'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser', 'country_destination'],
      dtype='object')


Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [52]:
#Checking null values
train_df.isnull().sum()

id                              0
date_account_created            0
timestamp_first_active          0
date_first_booking         124543
gender                          0
age                         87990
signup_method                   0
signup_flow                     0
language                        0
affiliate_channel               0
affiliate_provider              0
first_affiliate_tracked      6065
signup_app                      0
first_device_type               0
first_browser                   0
country_destination             0
dtype: int64

In [53]:
# sessions csv

session_df = pd.read_csv("../data/sessions.csv")

print(session_df.shape)

print("\nColumns:",session_df.columns)

session_df.head()

(10567737, 6)

Columns: Index(['user_id', 'action', 'action_type', 'action_detail', 'device_type',
       'secs_elapsed'],
      dtype='object')


Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


In [54]:
#Checking null values
session_df.isnull().sum()

user_id            34496
action             79626
action_type      1126204
action_detail    1126204
device_type            0
secs_elapsed      136031
dtype: int64

In [55]:
print(session_df.shape)

# It removes rows where the 'user_id' column has missing (NaN) values.
session_df = session_df.dropna(subset = ['user_id'])

print(session_df.shape)

(10567737, 6)
(10533241, 6)


In [56]:
session_df['user_id'].nunique()

135483

In [57]:
session_df['device_type'].unique()

array(['Windows Desktop', '-unknown-', 'Mac Desktop', 'Android Phone',
       'iPhone', 'iPad Tablet', 'Android App Unknown Phone/Tablet',
       'Linux Desktop', 'Tablet', 'Chromebook', 'Blackberry', 'iPodtouch',
       'Windows Phone', 'Opera Phone'], dtype=object)

To condense multiple rows into 1

In [58]:
#https://stackoverflow.com/questions/34776651/concatenate-rows-of-pandas-dataframe-with-same-id

session_df_concat = session_df.groupby('user_id', as_index=False).agg(lambda x: x.tolist())

print(session_df_concat.shape)

session_df_concat.head()

(135483, 6)


Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,00023iyk9l,"[index, dashboard, header_userpic, dashboard, ...","[view, view, data, view, partner_callback, mes...","[view_search_results, dashboard, header_userpi...","[Mac Desktop, Mac Desktop, Mac Desktop, Mac De...","[20438.0, 787.0, 850.0, 934.0, nan, 129817.0, ..."
1,0010k6l0om,"[search_results, show, personalize, show, sear...","[click, view, data, nan, click, click, nan, da...","[view_search_results, p3, wishlist_content_upd...","[Mac Desktop, Mac Desktop, Mac Desktop, Mac De...","[1708.0, 21260.0, 1223.0, 26.0, 847.0, 1230.0,..."
2,001wyh0pz8,"[search, search, search, show, social_connecti...","[click, click, click, view, data, -unknown-, v...","[view_search_results, view_search_results, vie...","[Android App Unknown Phone/Tablet, Android App...","[622.0, 1813.0, 1507.0, 6327.0, 927.0, 142.0, ..."
3,0028jgx1x1,"[show, reviews, show, search, show, search, re...","[view, data, view, click, view, click, data, s...","[user_profile, listing_reviews, p3, view_searc...","[-unknown-, -unknown-, -unknown-, -unknown-, -...","[6162.0, 75.0, 86.0, 13710.0, 25217.0, 10989.0..."
4,002qnbzfs5,"[social_connections, payment_methods, create, ...","[data, -unknown-, -unknown-, view, data, data,...","[user_social_connections, -unknown-, -unknown-...","[iPhone, iPhone, iPhone, iPhone, iPhone, iPhon...","[17135.0, 711.0, 274.0, 179.0, 483.0, 1.0, 782..."


In [59]:
# Function to convert list into strings

def abcd(action):
    
    """
    Function to convert list into strings
    
    parameters: action 
    
    returns : action  
    
    """
    action = [ str(i) for i in action ]
    
    action = [ re.sub('nan','',i) for i in action ] 
    
    action = ','.join(action)
    
    return action

In [60]:
session_df_concat['action'] = session_df_concat['action'].apply(abcd)

session_df_concat['action'].head()

0    index,dashboard,header_userpic,dashboard,callb...
1    search_results,show,personalize,show,search_re...
2    search,search,search,show,social_connections,i...
3    show,reviews,show,search,show,search,reviews,c...
4    social_connections,payment_methods,create,show...
Name: action, dtype: object

In [61]:
session_df_concat['action_type'] = session_df_concat['action_type'].apply(abcd)

session_df_concat['action_type'].head()

0    view,view,data,view,partner_callback,message_p...
1    click,view,data,,click,click,,data,view,partne...
2    click,click,click,view,data,-unknown-,view,-un...
3    view,data,view,click,view,click,data,submit,-u...
4    data,-unknown-,-unknown-,view,data,data,data,,...
Name: action_type, dtype: object

In [62]:
session_df_concat['action_detail'] = session_df_concat['action_detail'].apply(abcd)

session_df_concat['action_detail'].head()

0    view_search_results,dashboard,header_userpic,d...
1    view_search_results,p3,wishlist_content_update...
2    view_search_results,view_search_results,view_s...
3    user_profile,listing_reviews,p3,view_search_re...
4    user_social_connections,-unknown-,-unknown-,us...
Name: action_detail, dtype: object

In [63]:
# Function to convert list into strings

def efgh(device):
    
    """
    Function to convert list into strings
    
    parameters: device 
    
    returns : device  
    
    """
    
    device = [ str(i) for i in device ]
    
    device = [ re.sub('nan','',i) for i in device ] 
                
    device = ','.join(set(device))
    
    return device

In [64]:
session_df_concat['device_type'] = session_df_concat['device_type'].apply(efgh)

session_df_concat['device_type'].head()

0                  Mac Desktop,iPhone
1                         Mac Desktop
2    Android App Unknown Phone/Tablet
3             -unknown-,Android Phone
4                    -unknown-,iPhone
Name: device_type, dtype: object

In [65]:
# Function to convert list into strings

def ijkl(time):
    
    """
    Function to convert list into strings
    
    parameters: time 
    
    returns : time  
    
    """
    
    float_time = []
    
    time = [ str(i) for i in time ]
    
    time = [ re.sub('nan','',i) for i in time ] 
        
    for i in time:
        
         try:
                
                float_time.append(float(i))
         
         except ValueError:
                
                continue
    
    float_time = sum(float_time)
    
    return float_time

In [66]:
session_df_concat['secs_elapsed'] = session_df_concat['secs_elapsed'].apply(ijkl)

session_df_concat['secs_elapsed'].head()

0     867896.0
1     586543.0
2     282965.0
3     297010.0
4    6487080.0
Name: secs_elapsed, dtype: float64

In [67]:
print(session_df_concat.shape)

session_df_concat.head()

(135483, 6)


Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,00023iyk9l,"index,dashboard,header_userpic,dashboard,callb...","view,view,data,view,partner_callback,message_p...","view_search_results,dashboard,header_userpic,d...","Mac Desktop,iPhone",867896.0
1,0010k6l0om,"search_results,show,personalize,show,search_re...","click,view,data,,click,click,,data,view,partne...","view_search_results,p3,wishlist_content_update...",Mac Desktop,586543.0
2,001wyh0pz8,"search,search,search,show,social_connections,i...","click,click,click,view,data,-unknown-,view,-un...","view_search_results,view_search_results,view_s...",Android App Unknown Phone/Tablet,282965.0
3,0028jgx1x1,"show,reviews,show,search,show,search,reviews,c...","view,data,view,click,view,click,data,submit,-u...","user_profile,listing_reviews,p3,view_search_re...","-unknown-,Android Phone",297010.0
4,002qnbzfs5,"social_connections,payment_methods,create,show...","data,-unknown-,-unknown-,view,data,data,data,,...","user_social_connections,-unknown-,-unknown-,us...","-unknown-,iPhone",6487080.0


In [68]:
# Join train and session df

train_merge = train_df.merge(session_df_concat, left_on='id', right_on='user_id', how='inner')

print("Train  :",train_df.shape)

print("Session:",session_df_concat.shape)

print("Merge  :",train_merge.shape)

print("No of users in Train Data with session info:",train_merge.shape[0])

print("{} / {} = {}".format(train_merge.shape[0],train_df.shape[0],np.round((train_merge.shape[0]/train_df.shape[0]),2)))

Train  : (213451, 16)
Session: (135483, 6)
Merge  : (73815, 22)
No of users in Train Data with session info: 73815
73815 / 213451 = 0.35


1) only 35 % of users in train data have session info

In [69]:
print(train_merge.columns)

train_merge.head()

Index(['id', 'date_account_created', 'timestamp_first_active',
       'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser', 'country_destination', 'user_id', 'action',
       'action_type', 'action_detail', 'device_type', 'secs_elapsed'],
      dtype='object')


Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,...,signup_app,first_device_type,first_browser,country_destination,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,2014-01-01,20140101000936,2014-01-04,MALE,62.0,basic,0,en,sem-non-brand,...,Web,Windows Desktop,Chrome,other,d1mm9tcy42,"lookup,search_results,lookup,search_results,lo...",",click,,click,,click,,data,view,,click,,data,v...",",view_search_results,,view_search_results,,vie...","-unknown-,Windows Desktop",3427529.0
1,yo8nz8bqcq,2014-01-01,20140101001558,,-unknown-,,basic,0,en,direct,...,Web,Mac Desktop,Firefox,NDF,yo8nz8bqcq,"dashboard,create,confirm_email,show,show_perso...","view,submit,click,view,data,view,data,data,","dashboard,create_user,confirm_email_link,p3,us...",Mac Desktop,207842.0
2,4grx6yxeby,2014-01-01,20140101001639,,-unknown-,,basic,0,en,sem-brand,...,Web,Windows Desktop,Firefox,NDF,4grx6yxeby,"verify,create,,pending,requested,header_userpi...","-unknown-,submit,message_post,booking_request,...","-unknown-,create_user,message_post,pending,p5,...","-unknown-,Windows Desktop",1135444.0
3,ncf87guaf0,2014-01-01,20140101002146,,-unknown-,,basic,0,en,direct,...,Web,Windows Desktop,Chrome,NDF,ncf87guaf0,"lookup,show,search_results,search_results,show...",",view,click,click,view,view,click,data,data,da...",",p3,view_search_results,view_search_results,p3...","-unknown-,Android Phone,Windows Desktop",3755100.0
4,4rvqpxoh3h,2014-01-01,20140101002619,2014-01-02,-unknown-,,basic,25,en,direct,...,iOS,iPhone,-unknown-,GB,4rvqpxoh3h,"campaigns,active,create,notifications,listings...","-unknown-,-unknown-,-unknown-,-unknown-,-unkno...","-unknown-,-unknown-,-unknown-,-unknown-,-unkno...",iPhone,2555.0


In [70]:
# saving to memory

train_merge.to_pickle('../data/train_merge_raw.pickle')

In [71]:
#Checking null values
train_merge.isnull().sum()

id                             0
date_account_created           0
timestamp_first_active         0
date_first_booking         45041
gender                         0
age                        32248
signup_method                  0
signup_flow                    0
language                       0
affiliate_channel              0
affiliate_provider             0
first_affiliate_tracked      302
signup_app                     0
first_device_type              0
first_browser                  0
country_destination            0
user_id                        0
action                         0
action_type                    0
action_detail                  0
device_type                    0
secs_elapsed                   0
dtype: int64

In [72]:
#test csv

test_df = pd.read_csv('../data/test_users.csv')

print(test_df.shape)

print(test_df.columns)

test_df.head()

(62096, 15)
Index(['id', 'date_account_created', 'timestamp_first_active',
       'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser'],
      dtype='object')


Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari


In [73]:
#Checking null values
test_df.isnull().sum()

id                             0
date_account_created           0
timestamp_first_active         0
date_first_booking         62096
gender                         0
age                        28876
signup_method                  0
signup_flow                    0
language                       0
affiliate_channel              0
affiliate_provider             0
first_affiliate_tracked       20
signup_app                     0
first_device_type              0
first_browser                  0
dtype: int64

In [74]:
# Join test and session df

test_merge = test_df.merge(session_df_concat, left_on='id', right_on='user_id', how='inner')

print("Test  :",test_df.shape)

print("Session:",session_df_concat.shape)

print("Merge  :",test_merge.shape)

print("No of users in Test Data with session info:",test_merge.shape[0])

print("{} / {} = {}".format(test_merge.shape[0],test_df.shape[0],np.round((test_merge.shape[0]/test_df.shape[0]),2)))

Test  : (62096, 15)
Session: (135483, 6)
Merge  : (61668, 21)
No of users in Test Data with session info: 61668
61668 / 62096 = 0.99


1) Nearly 1 % of users in test data dont have session info.

2) Will use left join.

In [75]:
# Join test and session df

test_merge = test_df.merge(session_df_concat, left_on='id', right_on='user_id', how='left')

print("Test  :",test_df.shape)

print("Session:",session_df_concat.shape)

print("Merge  :",test_merge.shape)

print("No of users in Test Data with session info:",test_merge.shape[0])

print("{} / {} = {}".format(test_merge.shape[0],test_df.shape[0],np.round((test_merge.shape[0]/test_df.shape[0]),2)))

Test  : (62096, 15)
Session: (135483, 6)
Merge  : (62096, 21)
No of users in Test Data with session info: 62096
62096 / 62096 = 1.0


In [76]:
#Checking null values
test_merge.isnull().sum()

id                             0
date_account_created           0
timestamp_first_active         0
date_first_booking         62096
gender                         0
age                        28876
signup_method                  0
signup_flow                    0
language                       0
affiliate_channel              0
affiliate_provider             0
first_affiliate_tracked       20
signup_app                     0
first_device_type              0
first_browser                  0
user_id                      428
action                       428
action_type                  428
action_detail                428
device_type                  428
secs_elapsed                 428
dtype: int64

In [77]:
test_merge['user_id'].fillna('na' , inplace=True)

test_merge['action'].fillna('na' , inplace=True)

test_merge['action_type'].fillna('na' , inplace=True)

test_merge['action_detail'].fillna('na' , inplace=True)

test_merge['device_type'].fillna('na' , inplace=True)

test_merge['secs_elapsed'].fillna(0, inplace=True)

In [78]:
#Checking null values
test_merge.isnull().sum()

id                             0
date_account_created           0
timestamp_first_active         0
date_first_booking         62096
gender                         0
age                        28876
signup_method                  0
signup_flow                    0
language                       0
affiliate_channel              0
affiliate_provider             0
first_affiliate_tracked       20
signup_app                     0
first_device_type              0
first_browser                  0
user_id                        0
action                         0
action_type                    0
action_detail                  0
device_type                    0
secs_elapsed                   0
dtype: int64

In [79]:
print(test_merge.columns)

test_merge.head()

Index(['id', 'date_account_created', 'timestamp_first_active',
       'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser', 'user_id', 'action', 'action_type', 'action_detail',
       'device_type', 'secs_elapsed'],
      dtype='object')


Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,...,first_affiliate_tracked,signup_app,first_device_type,first_browser,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,...,untracked,Moweb,iPhone,Mobile Safari,5uwns89zht,"show,search,search,show,authenticate,reviews,a...","view,click,click,view,submit,data,-unknown-,click","user_profile,view_search_results,view_search_r...","-unknown-,iPhone",119187.0
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,...,untracked,Moweb,iPhone,Mobile Safari,jtl0dijy2j,"dashboard,login,search,create,search,signup_lo...","view,view,click,submit,click,view,-unknown-,vi...","dashboard,login_page,view_search_results,creat...","-unknown-,iPhone",250119.0
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,...,linked,Web,Windows Desktop,Chrome,xx0ulgorjt,"index,index,index,search_results,search_result...","view,view,view,click,click,click,click,data,vi...","view_search_results,view_search_results,view_s...",Windows Desktop,975575.0
3,6c6puo6ix0,2014-07-01,20140701000215,,-unknown-,,basic,0,en,direct,...,linked,Web,Windows Desktop,IE,6c6puo6ix0,"personalize,header_userpic,header_userpic,show...","data,data,data,view,view,,data,click,submit,,data","wishlist_content_update,header_userpic,header_...",Windows Desktop,123001.0
4,czqhjk3yfe,2014-07-01,20140701000305,,-unknown-,,basic,0,en,direct,...,untracked,Web,Mac Desktop,Safari,czqhjk3yfe,"message_to_host_change,agree_terms_check,pendi...","click,-unknown-,booking_request,message_post,v...","message_to_host_change,-unknown-,pending,messa...",Mac Desktop,454023.0


In [80]:
# daily,monthly and yearly data for date_account_created

train_merge['date_account_created'] = pd.to_datetime(train_merge['date_account_created'])

train_merge['date_account_created_day'] = train_merge.date_account_created.dt.weekday

train_merge['date_account_created_month'] = train_merge.date_account_created.dt.month

train_merge['date_account_created_year'] = train_merge.date_account_created.dt.year

test_merge['date_account_created'] = pd.to_datetime(test_merge['date_account_created'])

test_merge['date_account_created_day'] = test_merge.date_account_created.dt.weekday

test_merge['date_account_created_month'] = test_merge.date_account_created.dt.month

test_merge['date_account_created_year'] = test_merge.date_account_created.dt.year

In [81]:
def func_timestamp_first_active(timestamp):
        
        """
        Function takes timestamp_first_active and converts it into datetime.
        
        Input type:  Integer
        
        return type: Datetime
        
        """
        
        timestamp = str(timestamp)
        
        timestamp = datetime(year=int(timestamp[0:4]), month=int(timestamp[4:6]), day=int(timestamp[6:8]),\
                             
                             hour=int(timestamp[8:10]), minute=int(timestamp[10:12]), second=int(timestamp[12:]))
        
        return timestamp

In [82]:
# daily,monthly,yearly and horly data for timestamp_first_active

train_merge['timestamp_first_active'] = pd.to_datetime(train_merge.timestamp_first_active.apply(func_timestamp_first_active))

train_merge['timestamp_first_active_day'] = train_merge.timestamp_first_active.dt.weekday

train_merge['timestamp_first_active_month'] = train_merge.timestamp_first_active.dt.month

train_merge['timestamp_first_active_year'] = train_merge.timestamp_first_active.dt.year

train_merge['timestamp_first_active_hour'] = train_merge.timestamp_first_active.dt.hour 

test_merge['timestamp_first_active'] = pd.to_datetime(test_merge.timestamp_first_active.apply(func_timestamp_first_active))

test_merge['timestamp_first_active_day'] = test_merge.timestamp_first_active.dt.weekday

test_merge['timestamp_first_active_month'] = test_merge.timestamp_first_active.dt.month

test_merge['timestamp_first_active_year'] = test_merge.timestamp_first_active.dt.year

test_merge['timestamp_first_active_hour'] = test_merge.timestamp_first_active.dt.hour 

In [83]:
# median replacement with median age = 34.0

def age_median(age):
    
    """
    Function to replace age outliers with median age
    
    parameters: age 
    
    returns : age  
    
    """
    
    if age < 15.0 or age > 100.0: 
        
        return 34.0
  
    else: 
        
        return age

In [84]:
train_merge['age'] = train_merge['age'].apply(age_median)

train_merge['age'].fillna(34.0 , inplace=True)

test_merge['age'] = test_merge['age'].apply(age_median)

test_merge['age'].fillna(34.0 , inplace=True)

In [85]:
#creating age buckets

bins = [i for i in range(15,106,5)]

def age_interv(age):
        
    """
    
    Function takes age and returns interval.
    
    parameters:  age
    
    returns: age interval
    
    """
        
    for i in range(len(bins)):
        
        if age < bins[i]:
            
            return i

In [86]:
train_merge['age_interv'] = train_merge['age'].apply(lambda x: age_interv(x))

test_merge['age_interv'] = test_merge['age'].apply(lambda x: age_interv(x))

In [87]:
# mode replacement for first_affiliate_tracked

train_merge.first_affiliate_tracked.fillna('untracked',inplace = True)

test_merge.first_affiliate_tracked.fillna('untracked',inplace = True)

In [88]:
#getting labels

y = train_merge['country_destination']

In [89]:
# drop colmns

train_merge.drop(['id','date_account_created','timestamp_first_active',\
                  'date_first_booking','country_destination','user_id'],axis=1,inplace = True)

test_merge.drop(['id','date_account_created','timestamp_first_active',\
                  'date_first_booking','user_id'],axis=1,inplace = True)

In [91]:
# OHE

# https://stackoverflow.com/questions/41335718/keep-same-dummy-variable-in-training-and-testing-data

lst_ohe_train = []

ohe = ['gender', 'signup_method', 'language', 'affiliate_channel',\
            'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']

for feat in ohe:
    
    train_ohe = pd.get_dummies(train_merge[feat], prefix=feat)
    
    train_merge.drop([feat], axis=1, inplace = True)
    
    test_ohe = pd.get_dummies(test_merge[feat], prefix=feat)
    
    test_merge.drop([feat], axis=1, inplace = True)
    
    lst_ohe_train.append(train_ohe.columns)
        
    # Get missing columns in the training test
    
    missing_cols = set( train_ohe.columns ) - set( test_ohe.columns )
    
    # Add a missing column in test set with default value equal to 0
    
    for c in missing_cols:
        
        test_ohe[c] = 0
        
    # Ensure the order of column in the test set is in the same order than in train set
    
    test_ohe = test_ohe[train_ohe.columns]
    
    train_merge = pd.concat((train_merge, train_ohe), axis=1)    
        
    test_merge = pd.concat((test_merge, test_ohe), axis=1)

In [94]:
# OHE device_type

# https://stackoverflow.com/questions/57469676/python-one-hot-encoding-for-comma-separated-values

train_ohe = train_merge['device_type'].str.get_dummies(sep=",")
    
train_merge.drop(['device_type'], axis=1, inplace = True)
    
test_ohe = test_merge['device_type'].str.get_dummies(sep=",")
    
test_merge.drop(['device_type'], axis=1, inplace = True)
        
# Get missing columns in the training test
    
missing_cols = set( train_ohe.columns ) - set( test_ohe.columns )

lst_ohe_train.append(train_ohe.columns)
    
# Add a missing column in test set with default value equal to 0
    
for c in missing_cols:
        
    test_ohe[c] = 0
        
# Ensure the order of column in the test set is in the same order than in train set
    
test_ohe = test_ohe[train_ohe.columns]
    
train_merge = pd.concat((train_merge, train_ohe), axis=1)    
        
test_merge = pd.concat((test_merge, test_ohe), axis=1)

In [95]:
print(train_merge.columns)

train_merge.head()

Index(['age', 'signup_flow', 'action', 'action_type', 'action_detail',
       'secs_elapsed', 'date_account_created_day',
       'date_account_created_month', 'date_account_created_year',
       'timestamp_first_active_day',
       ...
       'Chromebook', 'Linux Desktop', 'Mac Desktop', 'Opera Phone', 'Tablet',
       'Windows Desktop', 'Windows Phone', 'iPad Tablet', 'iPhone',
       'iPodtouch'],
      dtype='object', length=139)


Unnamed: 0,age,signup_flow,action,action_type,action_detail,secs_elapsed,date_account_created_day,date_account_created_month,date_account_created_year,timestamp_first_active_day,...,Chromebook,Linux Desktop,Mac Desktop,Opera Phone,Tablet,Windows Desktop,Windows Phone,iPad Tablet,iPhone,iPodtouch
0,62.0,0,"lookup,search_results,lookup,search_results,lo...",",click,,click,,click,,data,view,,click,,data,v...",",view_search_results,,view_search_results,,vie...",3427529.0,2,1,2014,2,...,0,0,0,0,0,1,0,0,0,0
1,34.0,0,"dashboard,create,confirm_email,show,show_perso...","view,submit,click,view,data,view,data,data,","dashboard,create_user,confirm_email_link,p3,us...",207842.0,2,1,2014,2,...,0,0,1,0,0,0,0,0,0,0
2,34.0,0,"verify,create,,pending,requested,header_userpi...","-unknown-,submit,message_post,booking_request,...","-unknown-,create_user,message_post,pending,p5,...",1135444.0,2,1,2014,2,...,0,0,0,0,0,1,0,0,0,0
3,34.0,0,"lookup,show,search_results,search_results,show...",",view,click,click,view,view,click,data,data,da...",",p3,view_search_results,view_search_results,p3...",3755100.0,2,1,2014,2,...,0,0,0,0,0,1,0,0,0,0
4,34.0,25,"campaigns,active,create,notifications,listings...","-unknown-,-unknown-,-unknown-,-unknown-,-unkno...","-unknown-,-unknown-,-unknown-,-unknown-,-unkno...",2555.0,2,1,2014,2,...,0,0,0,0,0,0,0,0,1,0


In [96]:
#Checking null values
train_merge.isnull().sum()

age                0
signup_flow        0
action             0
action_type        0
action_detail      0
                  ..
Windows Desktop    0
Windows Phone      0
iPad Tablet        0
iPhone             0
iPodtouch          0
Length: 139, dtype: int64

In [97]:
print(test_merge.columns)

test_merge.head()

Index(['age', 'signup_flow', 'action', 'action_type', 'action_detail',
       'secs_elapsed', 'date_account_created_day',
       'date_account_created_month', 'date_account_created_year',
       'timestamp_first_active_day',
       ...
       'Chromebook', 'Linux Desktop', 'Mac Desktop', 'Opera Phone', 'Tablet',
       'Windows Desktop', 'Windows Phone', 'iPad Tablet', 'iPhone',
       'iPodtouch'],
      dtype='object', length=139)


Unnamed: 0,age,signup_flow,action,action_type,action_detail,secs_elapsed,date_account_created_day,date_account_created_month,date_account_created_year,timestamp_first_active_day,...,Chromebook,Linux Desktop,Mac Desktop,Opera Phone,Tablet,Windows Desktop,Windows Phone,iPad Tablet,iPhone,iPodtouch
0,35.0,0,"show,search,search,show,authenticate,reviews,a...","view,click,click,view,submit,data,-unknown-,click","user_profile,view_search_results,view_search_r...",119187.0,1,7,2014,1,...,0,0,0,0,0,0,0,0,1,0
1,34.0,0,"dashboard,login,search,create,search,signup_lo...","view,view,click,submit,click,view,-unknown-,vi...","dashboard,login_page,view_search_results,creat...",250119.0,1,7,2014,1,...,0,0,0,0,0,0,0,0,1,0
2,34.0,0,"index,index,index,search_results,search_result...","view,view,view,click,click,click,click,data,vi...","view_search_results,view_search_results,view_s...",975575.0,1,7,2014,1,...,0,0,0,0,0,1,0,0,0,0
3,34.0,0,"personalize,header_userpic,header_userpic,show...","data,data,data,view,view,,data,click,submit,,data","wishlist_content_update,header_userpic,header_...",123001.0,1,7,2014,1,...,0,0,0,0,0,1,0,0,0,0
4,34.0,0,"message_to_host_change,agree_terms_check,pendi...","click,-unknown-,booking_request,message_post,v...","message_to_host_change,-unknown-,pending,messa...",454023.0,1,7,2014,1,...,0,0,1,0,0,0,0,0,0,0


In [98]:
#Checking null values
test_merge.isnull().sum()

age                0
signup_flow        0
action             0
action_type        0
action_detail      0
                  ..
Windows Desktop    0
Windows Phone      0
iPad Tablet        0
iPhone             0
iPodtouch          0
Length: 139, dtype: int64

In [99]:
print(train_merge.shape,y.shape)

(73815, 139) (73815,)


In [100]:
print(test_merge.shape)

(62096, 139)


In [101]:
# tokenisation

def tokens(x):
        
    """
    
    Function takes strings and tokenises using comma.
    
    parameters:  string
    
    returns: comma separeted tokens
    
    """
        
    return x.split(',')

In [102]:
# TFIDF action

# https://stackoverflow.com/questions/28103992/tfidf-vectorizer-giving-error

vectorizer_action = TfidfVectorizer(min_df=10,max_features=5000,tokenizer=tokens)

vectorizer_action.fit(train_merge['action'].values)

train_merge_action_tfidf = vectorizer_action.transform(train_merge['action'].values)

test_merge_action_tfidf = vectorizer_action.transform(test_merge['action'].values)

print("After vectorizations")
print(train_merge_action_tfidf.shape)
print(test_merge_action_tfidf.shape)
print("="*100)

After vectorizations
(73815, 256)
(62096, 256)


In [104]:
action_feat = vectorizer_action.get_feature_names_out() 

action_feat

array(['', '10', '11', '12', '15', 'about_us', 'account', 'active',
       'add_guests', 'add_note', 'agree_terms_check',
       'agree_terms_uncheck', 'airbnb_picks', 'ajax_check_dates',
       'ajax_get_referrals_amt', 'ajax_get_results',
       'ajax_google_translate', 'ajax_google_translate_description',
       'ajax_google_translate_reviews', 'ajax_image_upload', 'ajax_ldp',
       'ajax_lwlb_contact', 'ajax_payout_edit',
       'ajax_payout_options_by_country', 'ajax_photo_widget_form_iframe',
       'ajax_price_and_availability',
       'ajax_referral_banner_experiment_type',
       'ajax_referral_banner_type', 'ajax_refresh_subtotal',
       'ajax_send_message', 'ajax_statsd', 'ajax_worth', 'apply_code',
       'apply_coupon_click', 'apply_coupon_click_success',
       'apply_coupon_error', 'apply_coupon_error_type',
       'apply_reservation', 'ask_question', 'at_checkpoint',
       'authenticate', 'authorize', 'available', 'become_user',
       'calendar_tab_inner2', 'callbac

In [105]:
# TFIDF action_type

# https://stackoverflow.com/questions/28103992/tfidf-vectorizer-giving-error

vectorizer_action_type = TfidfVectorizer(min_df=10,max_features=5000,tokenizer=tokens)

vectorizer_action_type.fit(train_merge['action_type'].values)

train_merge_action_type_tfidf = vectorizer_action_type.transform(train_merge['action_type'].values)

test_merge_action_type_tfidf = vectorizer_action_type.transform(test_merge['action_type'].values)

print("After vectorizations")
print(train_merge_action_type_tfidf.shape)
print(test_merge_action_type_tfidf.shape)
print("="*100)

After vectorizations
(73815, 9)
(62096, 9)


In [107]:
action_type_feat = vectorizer_action_type.get_feature_names_out()

action_type_feat

array(['', '-unknown-', 'booking_request', 'click', 'data',
       'message_post', 'partner_callback', 'submit', 'view'], dtype=object)

In [108]:
# TFIDF action_detail

# https://stackoverflow.com/questions/28103992/tfidf-vectorizer-giving-error

vectorizer_action_detail = TfidfVectorizer(min_df=10,max_features=5000,tokenizer=tokens)

vectorizer_action_detail.fit(train_merge['action_detail'].values)

train_merge_action_detail_tfidf = vectorizer_action_detail.transform(train_merge['action_detail'].values)

test_merge_action_detail_tfidf = vectorizer_action_detail.transform(test_merge['action_detail'].values)

print("After vectorizations")
print(train_merge_action_detail_tfidf.shape)
print(test_merge_action_detail_tfidf.shape)
print("="*100)

After vectorizations
(73815, 122)
(62096, 122)


In [110]:
action_detail_feat = vectorizer_action_detail.get_feature_names_out()

action_detail_feat

array(['', '-unknown-', 'account_notification_settings',
       'account_payout_preferences', 'account_privacy_settings',
       'account_transaction_history', 'admin_templates',
       'airbnb_picks_wishlists', 'alteration_field', 'alteration_request',
       'apply_coupon', 'apply_coupon_click', 'apply_coupon_click_success',
       'apply_coupon_error', 'at_checkpoint', 'book_it',
       'calculate_worth', 'cancellation_policies',
       'cancellation_policy_click', 'change_availability',
       'change_contact_host_dates', 'change_or_alter', 'change_password',
       'change_trip_characteristics', 'complete_booking', 'confirm_email',
       'confirm_email_link', 'contact_host', 'coupon_code_click',
       'coupon_field_focus', 'create_alteration_request',
       'create_listing', 'create_phone_numbers', 'create_user',
       'dashboard', 'delete_listing', 'delete_phone_numbers',
       'edit_profile', 'email_wishlist', 'email_wishlist_button',
       'forgot_password', 'friends_wish

In [111]:
# drop colmns

train_merge.drop(['action','action_type','action_detail'],axis=1,inplace = True)

test_merge.drop(['action','action_type','action_detail'],axis=1,inplace = True)

In [112]:
col_lst = [i for i in train_merge.columns]

col_lst

['age',
 'signup_flow',
 'secs_elapsed',
 'date_account_created_day',
 'date_account_created_month',
 'date_account_created_year',
 'timestamp_first_active_day',
 'timestamp_first_active_month',
 'timestamp_first_active_year',
 'timestamp_first_active_hour',
 'age_interv',
 'gender_-unknown-',
 'gender_FEMALE',
 'gender_MALE',
 'gender_OTHER',
 'signup_method_basic',
 'signup_method_facebook',
 'signup_method_google',
 'language_ca',
 'language_cs',
 'language_da',
 'language_de',
 'language_el',
 'language_en',
 'language_es',
 'language_fi',
 'language_fr',
 'language_hu',
 'language_id',
 'language_is',
 'language_it',
 'language_ja',
 'language_ko',
 'language_nl',
 'language_no',
 'language_pl',
 'language_pt',
 'language_ru',
 'language_sv',
 'language_th',
 'language_tr',
 'language_zh',
 'affiliate_channel_api',
 'affiliate_channel_content',
 'affiliate_channel_direct',
 'affiliate_channel_other',
 'affiliate_channel_remarketing',
 'affiliate_channel_sem-brand',
 'affiliate_cha

In [113]:
# data stacking

train_merge_tfidf = hstack((train_merge,train_merge_action_tfidf,train_merge_action_type_tfidf,train_merge_action_detail_tfidf)).tocsr()

test_merge_tfidf = hstack((test_merge,test_merge_action_tfidf,test_merge_action_type_tfidf,test_merge_action_detail_tfidf)).tocsr()

print("Final Data matrix")
print(train_merge_tfidf.shape)
print(test_merge_tfidf.shape)
print("="*100)

Final Data matrix
(73815, 523)
(62096, 523)


In [114]:
col_lst.extend(action_feat)

col_lst.extend(action_type_feat)

col_lst.extend(action_detail_feat)


len(col_lst)

523

In [115]:
def save_sparse_csr(filename, array):
    
    """
    
    Function takes csr matrix and stores it in single file.
    
    parameters:  filename, csr matrix
        
    """
    
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)

def load_sparse_csr(filename):
    
    """
    
    Function takes filename and returns csr matrix .
    
    parameters:  filename
    
    returns: csr matrix
    
    """
    
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])

In [121]:
# https://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format

# https://stackoverflow.com/questions/32764991/how-do-i-store-a-tfidfvectorizer-for-future-use-in-scikit-learn

# saving to memory

save_sparse_csr('../data/train_merge_tfidf',train_merge_tfidf)

save_sparse_csr('../data/test_merge_tfidf',test_merge_tfidf)

np.save('../data/y',y)

with open("../data/col_lst.txt", "wb") as fp:
    
    pickle.dump(col_lst, fp)
    
fp.close()

with open("../data/lst_ohe_train.txt", "wb") as fp:
    
    pickle.dump(lst_ohe_train, fp)
    
fp.close()

pickle.dump(vectorizer_action, open("../data/vectorizer_action.pickle", "wb"))

pickle.dump(vectorizer_action_type, open("../data/vectorizer_action_type.pickle", "wb"))

pickle.dump(vectorizer_action_detail, open("../data/vectorizer_action_detail.pickle", "wb"))

session_df_concat.to_pickle('../data/session_df_concat.pickle')

In [127]:
# loading from memory

train_merge_tfidf = load_sparse_csr('../data/train_merge_tfidf.npz')

test_merge_tfidf = load_sparse_csr('../data/test_merge_tfidf.npz')

y = np.load('../data/y.npy', allow_pickle=True)

with open("../data/col_lst.txt", "rb") as fp:
    
    col_lst = pickle.load(fp)
    
fp.close()

with open("../data/lst_ohe_train.txt", "rb") as fp:
    
    lst_ohe_train = pickle.load(fp)
    
fp.close()

vectorizer_action = pickle.load(open("../data/vectorizer_action.pickle", "rb"))

vectorizer_action_type = pickle.load(open("../data/vectorizer_action_type.pickle", "rb"))

vectorizer_action_detail = pickle.load(open("../data/vectorizer_action_detail.pickle", "rb"))

session_df_concat = pd.read_pickle('../data/session_df_concat.pickle')

train_merge_raw = pd.read_pickle('../data/train_merge_raw.pickle')

In [129]:
print("Final Data matrix")
print(train_merge_tfidf.shape)
print(test_merge_tfidf.shape)
print(y.shape)
print(len(vectorizer_action.get_feature_names_out()))
print(len(vectorizer_action_type.get_feature_names_out()))
print(len(vectorizer_action_detail.get_feature_names_out()))
print(session_df_concat.shape)
print(train_merge_raw.shape)
print(len(col_lst))
print(len(lst_ohe_train))
print("="*100)

Final Data matrix
(73815, 523)
(62096, 523)
(73815,)
256
9
122
(135483, 6)
(73815, 22)
523
10


In [131]:
y

<73815x523 sparse matrix of type '<class 'numpy.float64'>'
	with 3709028 stored elements in Compressed Sparse Row format>

In [143]:
from scipy import sparse

# Convert it to a dense DataFrame

train_merge_tfidf = pd.DataFrame.sparse.from_spmatrix(train_merge_tfidf)
y_df = pd.DataFrame({'target': y})  

# Save the DataFrame to a CSV file

train_merge_tfidf.to_csv('../data/train_merge_tfidf.csv', index=False)
test_merge_tfidf.to_csv('../data/test_merge_tfidf.csv', index=False)
y_df.to_csv('../data/y.csv', index=False)