# AIRBNB DATA CLEANING PROJECT


In [1]:
import pandas as pd
import numpy as np

# Import data
df_train = pd.read_csv("train_users_2.csv", header=0, index_col=None)
df_test = pd.read_csv("test_users.csv", header=0, index_col=None)

# Combine into one dataset
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

In [2]:
df_all.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [3]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275547 entries, 0 to 275546
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       275547 non-null  object 
 1   date_account_created     275547 non-null  object 
 2   timestamp_first_active   275547 non-null  int64  
 3   date_first_booking       88908 non-null   object 
 4   gender                   275547 non-null  object 
 5   age                      158681 non-null  float64
 6   signup_method            275547 non-null  object 
 7   signup_flow              275547 non-null  int64  
 8   language                 275547 non-null  object 
 9   affiliate_channel        275547 non-null  object 
 10  affiliate_provider       275547 non-null  object 
 11  first_affiliate_tracked  269462 non-null  object 
 12  signup_app               275547 non-null  object 
 13  first_device_type        275547 non-null  object 
 14  firs

In [4]:
# Change Dates to consistent format
df_all['date_account_created'] = pd.to_datetime(df_all['date_account_created'], format='%Y-%m-%d')
df_all['timestamp_first_active'] = pd.to_datetime(df_all['timestamp_first_active'], format='%Y%m%d%H%M%S')
df_all['date_account_created'].fillna(df_all.timestamp_first_active, inplace=True)

In [5]:
# Remove date_first_booking column
df_all.drop('date_first_booking', axis=1, inplace=True)

In [6]:
# Remove outliers function
def remove_outliers(df, column, min_val, max_val):
    col_values = df[column].values
    df[column] = np.where(np.logical_or(col_values <= min_val, col_values >= max_val), np.NaN, col_values)
    return df

# Fixing age column
df_all = remove_outliers(df=df_all, column='age', min_val=15, max_val=90)
df_all['age'].fillna(-1, inplace=True)

In [7]:
# Fill first_affiliate_tracked column
df_all['first_affiliate_tracked'].fillna(-1, inplace=True)

In [8]:
df_all.head(10)

Unnamed: 0,id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,2009-03-19 04:32:55,-unknown-,-1.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,2009-05-23 17:48:09,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,2009-06-09 23:12:47,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,2009-10-31 06:01:29,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,2009-12-08 06:11:05,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US
5,osr2jwljor,2010-01-01,2010-01-01 21:56:19,-unknown-,-1.0,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,US
6,lsw9q7uk0j,2010-01-02,2010-01-02 01:25:58,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
7,0d01nltbrs,2010-01-03,2010-01-03 19:19:05,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,US
8,a1vcnhxeij,2010-01-04,2010-01-04 00:42:11,FEMALE,50.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
9,6uh8zyj2gn,2010-01-04,2010-01-04 02:37:58,-unknown-,46.0,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox,US


In [9]:
# Performing One Hot Encoding function
def convert_to_binary(df, column_to_convert):
    categories = list(df[column_to_convert].drop_duplicates())

    for category in categories:
        cat_name = str(category).replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_").replace("-", "").lower()
        col_name = column_to_convert[:5] + '_' + cat_name[:10]
        df[col_name] = 0
        df.loc[(df[column_to_convert] == category), col_name] = 1

    return df

# One Hot Encoding
columns_to_convert = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']

for column in columns_to_convert:
    df_all = convert_to_binary(df=df_all, column_to_convert=column)
    df_all.drop(column, axis=1, inplace=True)

In [10]:
# Add new date related fields
print("Adding new fields...")
df_all['day_account_created'] = df_all['date_account_created'].dt.weekday
df_all['month_account_created'] = df_all['date_account_created'].dt.month
df_all['quarter_account_created'] = df_all['date_account_created'].dt.quarter
df_all['year_account_created'] = df_all['date_account_created'].dt.year
df_all['hour_first_active'] = df_all['timestamp_first_active'].dt.hour
df_all['day_first_active'] = df_all['timestamp_first_active'].dt.weekday
df_all['month_first_active'] = df_all['timestamp_first_active'].dt.month
df_all['quarter_first_active'] = df_all['timestamp_first_active'].dt.quarter
df_all['year_first_active'] = df_all['timestamp_first_active'].dt.year
df_all['created_less_active'] = (df_all['date_account_created'] - df_all['timestamp_first_active']).dt.days

# Drop unnecessary columns
columns_to_drop = ['date_account_created', 'timestamp_first_active', 'date_first_booking', 'country_destination']
for column in columns_to_drop:
    if column in df_all.columns:
        df_all.drop(column, axis=1, inplace=True)

Adding new fields...


In [11]:
df_all.head()

Unnamed: 0,id,age,gende_unknown,gende_male,gende_female,gende_other,signu_facebook,signu_basic,signu_google,signu_weibo,...,day_account_created,month_account_created,quarter_account_created,year_account_created,hour_first_active,day_first_active,month_first_active,quarter_first_active,year_first_active,created_less_active
0,gxn3p5htnn,-1.0,1,0,0,0,1,0,0,0,...,0,6,2,2010,4,3,3,1,2009,465
1,820tgsjxq7,38.0,0,1,0,0,1,0,0,0,...,2,5,2,2011,17,5,5,2,2009,731
2,4ft3gnwmtx,56.0,0,0,1,0,0,1,0,0,...,1,9,3,2010,23,1,6,2,2009,475
3,bjjt8pjhuk,42.0,0,0,1,0,1,0,0,0,...,0,12,4,2011,6,5,10,4,2009,764
4,87mebub9p4,41.0,1,0,0,0,0,1,0,0,...,1,9,3,2010,6,1,12,4,2009,279


## Cleaning Sessions Data

In [12]:
# Import sessions data
sessions = pd.read_csv("sessions.csv", header=0, index_col=False)

In [13]:
sessions.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


In [14]:
# Determine primary device
sessions_device = sessions[['user_id', 'device_type', 'secs_elapsed']]
aggregated_lvl1 = sessions_device.groupby(['user_id', 'device_type'], as_index=False, sort=False).sum()
idx = aggregated_lvl1.groupby(['user_id'], sort=False)['secs_elapsed'].transform(max) == aggregated_lvl1['secs_elapsed']
df_primary = aggregated_lvl1.loc[idx, ['user_id', 'device_type', 'secs_elapsed']].copy()
df_primary.rename(columns = {'device_type':'primary_device', 'secs_elapsed':'primary_secs'}, inplace=True)
df_primary = convert_to_binary(df=df_primary, column_to_convert='primary_device')
df_primary.drop('primary_device', axis=1, inplace=True)

# Determine Secondary device
remaining = aggregated_lvl1.drop(aggregated_lvl1.index[idx])
idx = remaining.groupby(['user_id'], sort=False)['secs_elapsed'].transform(max) == remaining['secs_elapsed']
df_secondary = remaining.loc[idx, ['user_id', 'device_type', 'secs_elapsed']].copy()
df_secondary.rename(columns = {'device_type':'secondary_device', 'secs_elapsed':'secondary_secs'}, inplace=True)
df_secondary = convert_to_binary(df=df_secondary, column_to_convert='secondary_device')
df_secondary.drop('secondary_device', axis=1, inplace=True)

In [15]:
df_primary

Unnamed: 0,user_id,primary_secs,prima_windows_de,prima_mac_deskto,prima_iphone,prima_ipad_table,prima_unknown,prima_android_ap,prima_linux_desk,prima_tablet,prima_chromebook,prima_android_ph,prima_ipodtouch,prima_blackberry,prima_windows_ph,prima_opera_phon
0,d1mm9tcy42,3315820.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,yo8nz8bqcq,207842.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,4grx6yxeby,1059362.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
5,ncf87guaf0,3361226.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,4rvqpxoh3h,2555.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178228,cv0na2lf5a,1572098.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
178230,zp8xfonng8,51618.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
178231,fa6260ziny,823297.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
178233,87k0fy4ugm,283468.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [16]:
df_secondary

Unnamed: 0,user_id,secondary_secs,secon_unknown,secon_android_ph,secon_ipad_table,secon_android_ap,secon_mac_deskto,secon_iphone,secon_windows_de,secon_linux_desk,secon_tablet,secon_blackberry,secon_windows_ph,secon_chromebook,secon_opera_phon,secon_ipodtouch
1,d1mm9tcy42,111709.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4grx6yxeby,76082.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
7,ncf87guaf0,274002.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
11,xwxei6hdk4,810.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
15,awiurksqr3,488.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178221,jg618z94wo,42404.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
178223,o6ofmozucx,254.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
178229,cv0na2lf5a,962134.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
178232,87k0fy4ugm,68600.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [17]:
# Count occurrences of value in a column
def convert_to_counts(df, id_col, column_to_convert):
    id_list = df[id_col].drop_duplicates()

    df_counts = df[[id_col, column_to_convert]]
    df_counts['count'] = 1
    df_counts = df_counts.groupby(by=[id_col, column_to_convert], as_index=False, sort=False).sum()

    new_df = df_counts.pivot(index=id_col, columns=column_to_convert, values='count')
    new_df = new_df.fillna(0)

    # Rename Columns
    categories = list(df[column_to_convert].drop_duplicates())
    for category in categories:
       cat_name = str(category).replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_").replace("-", "").lower()
       col_name = column_to_convert + '_' + cat_name
       new_df.rename(columns = {category:col_name}, inplace=True)

    return new_df

# Aggregate and combine actions taken columns
print("Aggregating actions taken...")
session_actions = sessions[['user_id', 'action', 'action_type', 'action_detail']].copy()
columns_to_convert = ['action', 'action_type', 'action_detail']
session_actions = session_actions.fillna('not provided')
first = True

for column in columns_to_convert:
    print("Converting " + column + " column...")
    current_data = convert_to_counts(df=session_actions, id_col='user_id', column_to_convert=column)

    # If first loop, current data becomes existing data, otherwise merge existing and current
    if first:
        first = False
        actions_data = current_data
    else:
        actions_data = pd.concat([actions_data, current_data], axis=1, join='inner')

Aggregating actions taken...
Converting action column...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_counts['count'] = 1


Converting action_type column...
Converting action_detail column...


In [18]:
# Merge device datasets
print("Combining results...")
df_primary.set_index('user_id', inplace=True)
df_secondary.set_index('user_id', inplace=True)
device_data = pd.concat([df_primary, df_secondary], axis=1, join="outer")

# Merge device and actions datasets
combined_results = pd.concat([device_data, actions_data], axis=1, join='outer')
df_sessions = combined_results.fillna(0)

# Merge user and session datasets
df_all.set_index('id', inplace=True)
df_all = pd.concat([df_all, df_sessions], axis=1, join='inner')

Combining results...


In [19]:
df_all

Unnamed: 0,age,gende_unknown,gende_male,gende_female,gende_other,signu_facebook,signu_basic,signu_google,signu_weibo,signu_0,...,action_detail_view_resolutions,action_detail_view_search_results,action_detail_view_security_checks,action_detail_view_user_real_names,action_detail_wishlist,action_detail_wishlist_content_update,action_detail_wishlist_note,action_detail_your_listings,action_detail_your_reservations,action_detail_your_trips
d1mm9tcy42,62.0,0,1,0,0,0,1,0,0,1,...,0.0,23.0,0.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0
yo8nz8bqcq,-1.0,1,0,0,0,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4grx6yxeby,-1.0,1,0,0,0,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
ncf87guaf0,-1.0,1,0,0,0,0,1,0,0,1,...,0.0,32.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
4rvqpxoh3h,-1.0,1,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cv0na2lf5a,31.0,1,0,0,0,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.0,2.0,0.0
zp8xfonng8,-1.0,1,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fa6260ziny,-1.0,1,0,0,0,0,1,0,0,1,...,0.0,21.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,1.0
87k0fy4ugm,-1.0,1,0,0,0,0,1,0,0,1,...,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0


## Training Our Model

In [20]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time


In [21]:
pip install sklearn

You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install xgboost

You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [23]:
conda install -c conda-forge xgboost

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [32]:
import numpy as np

from scipy.stats import uniform, randint
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import  decomposition, grid_search

import xgboost as xgb

ImportError: cannot import name 'grid_search' from 'sklearn' (/opt/anaconda3/lib/python3.8/site-packages/sklearn/__init__.py)

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder


In [25]:
df_train

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213446,zxodksqpep,2014-06-30,20140630235636,,MALE,32.0,basic,0,en,sem-brand,google,omg,Web,Mac Desktop,Safari,NDF
213447,mhewnxesx9,2014-06-30,20140630235719,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome,NDF
213448,6o3arsjbb4,2014-06-30,20140630235754,,-unknown-,32.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,NDF
213449,jh95kwisub,2014-06-30,20140630235822,,-unknown-,,basic,25,en,other,other,tracked-other,iOS,iPhone,Mobile Safari,NDF


In [26]:
# Prepare training data for modelling
df_train.set_index('id', inplace=True)
df_train = pd.concat([df_train['country_destination'], df_all], axis=1, join='inner')

id_train = df_train.index.values
labels = df_train['country_destination']
le = LabelEncoder()
y = le.fit_transform(labels)
X = df_train.drop('country_destination', axis=1, inplace=False)

In [36]:
# Grid Search - Used to find best combination of parameters
XGB_model = xgb.XGBClassifier(objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)

param_grid = {'max_depth': [3, 4, 5], 'learning_rate': [0.1, 0.3], 'n_estimators': [25, 50]}
model = GridSearchCV(estimator=XGB_model, param_grid=param_grid, \
                                 scoring='accuracy', verbose=10, n_jobs=1, refit=True, cv=3)

model.fit(X, y)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3; 1/12] START learning_rate=0.1, max_depth=3, n_estimators=25............




[CV 1/3; 1/12] END learning_rate=0.1, max_depth=3, n_estimators=25; total time=  22.6s
[CV 2/3; 1/12] START learning_rate=0.1, max_depth=3, n_estimators=25............




[CV 2/3; 1/12] END learning_rate=0.1, max_depth=3, n_estimators=25; total time=  22.0s
[CV 3/3; 1/12] START learning_rate=0.1, max_depth=3, n_estimators=25............




[CV 3/3; 1/12] END learning_rate=0.1, max_depth=3, n_estimators=25; total time=  21.7s
[CV 1/3; 2/12] START learning_rate=0.1, max_depth=3, n_estimators=50............




[CV 1/3; 2/12] END learning_rate=0.1, max_depth=3, n_estimators=50; total time=  43.2s
[CV 2/3; 2/12] START learning_rate=0.1, max_depth=3, n_estimators=50............




[CV 2/3; 2/12] END learning_rate=0.1, max_depth=3, n_estimators=50; total time=  43.0s
[CV 3/3; 2/12] START learning_rate=0.1, max_depth=3, n_estimators=50............




[CV 3/3; 2/12] END learning_rate=0.1, max_depth=3, n_estimators=50; total time=  45.0s
[CV 1/3; 3/12] START learning_rate=0.1, max_depth=4, n_estimators=25............




[CV 1/3; 3/12] END learning_rate=0.1, max_depth=4, n_estimators=25; total time=  29.0s
[CV 2/3; 3/12] START learning_rate=0.1, max_depth=4, n_estimators=25............




[CV 2/3; 3/12] END learning_rate=0.1, max_depth=4, n_estimators=25; total time=  29.7s
[CV 3/3; 3/12] START learning_rate=0.1, max_depth=4, n_estimators=25............




[CV 3/3; 3/12] END learning_rate=0.1, max_depth=4, n_estimators=25; total time=  31.5s
[CV 1/3; 4/12] START learning_rate=0.1, max_depth=4, n_estimators=50............




[CV 1/3; 4/12] END learning_rate=0.1, max_depth=4, n_estimators=50; total time= 1.0min
[CV 2/3; 4/12] START learning_rate=0.1, max_depth=4, n_estimators=50............




[CV 2/3; 4/12] END learning_rate=0.1, max_depth=4, n_estimators=50; total time= 1.0min
[CV 3/3; 4/12] START learning_rate=0.1, max_depth=4, n_estimators=50............




[CV 3/3; 4/12] END learning_rate=0.1, max_depth=4, n_estimators=50; total time=  59.6s
[CV 1/3; 5/12] START learning_rate=0.1, max_depth=5, n_estimators=25............




[CV 1/3; 5/12] END learning_rate=0.1, max_depth=5, n_estimators=25; total time=  38.1s
[CV 2/3; 5/12] START learning_rate=0.1, max_depth=5, n_estimators=25............




[CV 2/3; 5/12] END learning_rate=0.1, max_depth=5, n_estimators=25; total time=  39.1s
[CV 3/3; 5/12] START learning_rate=0.1, max_depth=5, n_estimators=25............




[CV 3/3; 5/12] END learning_rate=0.1, max_depth=5, n_estimators=25; total time=  39.6s
[CV 1/3; 6/12] START learning_rate=0.1, max_depth=5, n_estimators=50............




[CV 1/3; 6/12] END learning_rate=0.1, max_depth=5, n_estimators=50; total time= 1.3min
[CV 2/3; 6/12] START learning_rate=0.1, max_depth=5, n_estimators=50............




[CV 2/3; 6/12] END learning_rate=0.1, max_depth=5, n_estimators=50; total time= 1.2min
[CV 3/3; 6/12] START learning_rate=0.1, max_depth=5, n_estimators=50............




[CV 3/3; 6/12] END learning_rate=0.1, max_depth=5, n_estimators=50; total time= 1.3min
[CV 1/3; 7/12] START learning_rate=0.3, max_depth=3, n_estimators=25............




[CV 1/3; 7/12] END learning_rate=0.3, max_depth=3, n_estimators=25; total time=  22.9s
[CV 2/3; 7/12] START learning_rate=0.3, max_depth=3, n_estimators=25............




[CV 2/3; 7/12] END learning_rate=0.3, max_depth=3, n_estimators=25; total time=  22.7s
[CV 3/3; 7/12] START learning_rate=0.3, max_depth=3, n_estimators=25............




[CV 3/3; 7/12] END learning_rate=0.3, max_depth=3, n_estimators=25; total time=  22.0s
[CV 1/3; 8/12] START learning_rate=0.3, max_depth=3, n_estimators=50............




[CV 1/3; 8/12] END learning_rate=0.3, max_depth=3, n_estimators=50; total time=  43.2s
[CV 2/3; 8/12] START learning_rate=0.3, max_depth=3, n_estimators=50............




[CV 2/3; 8/12] END learning_rate=0.3, max_depth=3, n_estimators=50; total time=  43.0s
[CV 3/3; 8/12] START learning_rate=0.3, max_depth=3, n_estimators=50............




[CV 3/3; 8/12] END learning_rate=0.3, max_depth=3, n_estimators=50; total time= 2.2min
[CV 1/3; 9/12] START learning_rate=0.3, max_depth=4, n_estimators=25............




[CV 1/3; 9/12] END learning_rate=0.3, max_depth=4, n_estimators=25; total time=  31.5s
[CV 2/3; 9/12] START learning_rate=0.3, max_depth=4, n_estimators=25............




[CV 2/3; 9/12] END learning_rate=0.3, max_depth=4, n_estimators=25; total time=  31.1s
[CV 3/3; 9/12] START learning_rate=0.3, max_depth=4, n_estimators=25............




[CV 3/3; 9/12] END learning_rate=0.3, max_depth=4, n_estimators=25; total time=  30.6s
[CV 1/3; 10/12] START learning_rate=0.3, max_depth=4, n_estimators=50...........




[CV 1/3; 10/12] END learning_rate=0.3, max_depth=4, n_estimators=50; total time=  58.4s
[CV 2/3; 10/12] START learning_rate=0.3, max_depth=4, n_estimators=50...........




[CV 2/3; 10/12] END learning_rate=0.3, max_depth=4, n_estimators=50; total time=  57.9s
[CV 3/3; 10/12] START learning_rate=0.3, max_depth=4, n_estimators=50...........




[CV 3/3; 10/12] END learning_rate=0.3, max_depth=4, n_estimators=50; total time=  59.7s
[CV 1/3; 11/12] START learning_rate=0.3, max_depth=5, n_estimators=25...........




[CV 1/3; 11/12] END learning_rate=0.3, max_depth=5, n_estimators=25; total time=  36.7s
[CV 2/3; 11/12] START learning_rate=0.3, max_depth=5, n_estimators=25...........




[CV 2/3; 11/12] END learning_rate=0.3, max_depth=5, n_estimators=25; total time=  38.9s
[CV 3/3; 11/12] START learning_rate=0.3, max_depth=5, n_estimators=25...........




[CV 3/3; 11/12] END learning_rate=0.3, max_depth=5, n_estimators=25; total time=  36.4s
[CV 1/3; 12/12] START learning_rate=0.3, max_depth=5, n_estimators=50...........




[CV 1/3; 12/12] END learning_rate=0.3, max_depth=5, n_estimators=50; total time= 1.2min
[CV 2/3; 12/12] START learning_rate=0.3, max_depth=5, n_estimators=50...........




[CV 2/3; 12/12] END learning_rate=0.3, max_depth=5, n_estimators=50; total time= 1.2min
[CV 3/3; 12/12] START learning_rate=0.3, max_depth=5, n_estimators=50...........




[CV 3/3; 12/12] END learning_rate=0.3, max_depth=5, n_estimators=50; total time= 1.2min




Best score: 0.700
Best parameters set:
	learning_rate: 0.1
	max_depth: 4
	n_estimators: 50


In [37]:
# Prepare test data for prediction
df_test.set_index('id', inplace=True)
df_test = pd.merge(df_test[['date_first_booking']], df_all, how='left', left_index=True, right_index=True, sort=False)
X_test = df_test.drop('date_first_booking', axis=1, inplace=False)
X_test = X_test.fillna(-1)
id_test = df_test.index.values

# Make predictions
y_pred = model.predict_proba(X_test)

In [38]:
#Taking the 5 classes with highest probabilities
ids = [] #list of ids
cts = [] #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
print("Outputting final results...")
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('./submission.csv', index=False)

Outputting final results...


In [39]:
y_pred

array([[0.00580819, 0.00699432, 0.00578922, ..., 0.0046221 , 0.183283  ,
        0.02850945],
       [0.00319688, 0.00431316, 0.00357585, ..., 0.0028966 , 0.0463637 ,
        0.01320926],
       [0.00336524, 0.00463014, 0.00389265, ..., 0.00312182, 0.04791488,
        0.0143653 ],
       ...,
       [0.00362362, 0.00510601, 0.01051749, ..., 0.00336392, 0.04803659,
        0.02079882],
       [0.00433561, 0.00602367, 0.00494611, ..., 0.00394021, 0.07217266,
        0.02126788],
       [0.00847019, 0.01295397, 0.00962862, ..., 0.00698511, 0.3029596 ,
        0.05298099]], dtype=float32)