In [1]:
# Loading packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

## Loading files

In [2]:
# age_gender_bkts
# countries
# sessions
# test_users: Used to create predictions for submission
# train_users_2: Used to train and evaluate model
# sample_submission_NDF: Columns: id & country

In [22]:
# Read the raw CSV files from the local data/ directory into DataFrames.
# `train` holds train_users_2.csv (features plus the country_destination target);
# `test_users` holds the users to predict for.
# NOTE(review): age_gender_bkts, countries and sessions are loaded here but are
# not referenced by any other cell visible in this notebook.
age_gender_bkts = pd.read_csv("data/age_gender_bkts.csv")
countries = pd.read_csv("data/countries.csv")
sessions = pd.read_csv("data/sessions.csv")
test_users = pd.read_csv("data/test_users.csv")
train = pd.read_csv("data/train_users_2.csv")

In [5]:
# Example submission file; shows the expected output layout (id, country).
sample = pd.read_csv("data/sample_submission_NDF.csv")

In [43]:
# Display the sample submission (one row per test user id with a country code).
sample

Unnamed: 0,id,country
0,5uwns89zht,NDF
1,jtl0dijy2j,NDF
2,xx0ulgorjt,NDF
3,6c6puo6ix0,NDF
4,czqhjk3yfe,NDF
...,...,...
62091,cv0na2lf5a,NDF
62092,zp8xfonng8,NDF
62093,fa6260ziny,NDF
62094,87k0fy4ugm,NDF


## Data analysis

In [16]:
# Display the test users to inspect the available feature columns.
test_users

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62091,cv0na2lf5a,2014-09-30,20140930235232,,-unknown-,31.0,basic,0,en,direct,direct,untracked,Web,Windows Desktop,IE
62092,zp8xfonng8,2014-09-30,20140930235306,,-unknown-,,basic,23,ko,direct,direct,untracked,Android,Android Phone,-unknown-
62093,fa6260ziny,2014-09-30,20140930235408,,-unknown-,,basic,0,de,direct,direct,linked,Web,Windows Desktop,Firefox
62094,87k0fy4ugm,2014-09-30,20140930235430,,-unknown-,,basic,0,en,sem-brand,google,omg,Web,Mac Desktop,Safari


In [35]:
# Summary of the id column: count equal to unique means every id occurs once.
train.id.describe()

count         213451
unique        213451
top       oe4sphq448
freq               1
Name: id, dtype: object

In [33]:
# Peek at the raw age column (float values, with NaN where age is missing).
train.age

0          NaN
1         38.0
2         56.0
3         42.0
4         41.0
          ... 
213446    32.0
213447     NaN
213448    32.0
213449     NaN
213450     NaN
Name: age, Length: 213451, dtype: float64

In [42]:
# Frequency of each gender category; missing gender is stored as the
# string '-unknown-' rather than NaN, so it shows up as its own category.
train.gender.value_counts()

-unknown-    95688
FEMALE       63041
MALE         54440
OTHER          282
Name: gender, dtype: int64

### Finding NaN-values in the data set

In [32]:
# Percentage of missing (NaN) entries in each column of the training set.
percent_missing = train.isna().sum().mul(100).div(len(train))

# Same numbers as a two-column frame (column name alongside its missing share).
missing_value_df = pd.DataFrame(
    {
        'column_name': train.columns,
        'percent_missing': percent_missing,
    }
)

percent_missing

id                          0.000000
date_account_created        0.000000
timestamp_first_active      0.000000
date_first_booking         58.347349
gender                      0.000000
age                        41.222576
signup_method               0.000000
signup_flow                 0.000000
language                    0.000000
affiliate_channel           0.000000
affiliate_provider          0.000000
first_affiliate_tracked     2.841402
signup_app                  0.000000
first_device_type           0.000000
first_browser               0.000000
country_destination         0.000000
dtype: float64

It seems that *date_first_booking* and *age* have lots of missing values:
- date_first_booking: Dropping column
- age: replacing NaN-values with the mean of age in the data set


In [46]:
# Summary statistics of age among the test users; check min/max for outliers.
test_users.age.describe()

count    33220.000000
mean        37.616677
std         74.440647
min          1.000000
25%         26.000000
50%         31.000000
75%         40.000000
max       2002.000000
Name: age, dtype: float64

Some infeasible values: the reported ages range from 1 to 2002.

In [67]:
## Merging data before one-hot encoding features

# Separate the target from the training features. The guard makes the cell
# safe to re-run: once the column has been removed, `labels` from the first
# run is kept instead of raising a KeyError on the missing column.
if 'country_destination' in train.columns:
    labels = train['country_destination']
    train = train.drop(columns='country_destination')

# Stack train and test so that both get exactly the same encoded columns;
# ignore_index gives a fresh 0..n-1 index with the training rows first.
data = pd.concat((train, test_users), axis=0, ignore_index=True)

# `id` is a unique key and `date_first_booking` is mostly missing, so neither
# is used as a feature.
data = data.drop(columns=['id', 'date_first_booking'])

In [68]:
# Clamp ages into the range [18, 122]: values above 122 become 122 and values
# below 18 become 18. Series.clip does this in one vectorised pass instead of
# two row-by-row lambdas, and leaves NaN untouched so missing ages stay
# missing for the imputation step that follows.
# NOTE(review): the bounds 18 and 122 are presumably a minimum user age and a
# maximum plausible human age — confirm.
data['age'] = data['age'].clip(lower=18, upper=122)

In [69]:
# Replace missing ages with the mean age of the combined train + test frame.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on `data.age`: an in-place call on a column
# obtained by attribute access may act on a temporary object (pandas warns
# about this, and with Copy-on-Write it does not modify `data` at all).
data['age'] = data['age'].fillna(data['age'].mean())

In [70]:
# One-hot-encoding features
cat_features = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
]

# Encode every listed column in one call: each source column is removed and
# its indicator columns (named "<column>_<value>") are appended after the
# remaining columns, in the order given by cat_features.
data = pd.get_dummies(data, columns=cat_features)

In [71]:
# Display the combined, one-hot-encoded feature frame (train rows first, then test rows).
data

Unnamed: 0,date_account_created,timestamp_first_active,age,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER,signup_method_basic,signup_method_facebook,signup_method_google,...,first_browser_Silk,first_browser_SiteKiosk,first_browser_SlimBrowser,first_browser_Sogou Explorer,first_browser_Stainless,first_browser_TenFourFox,first_browser_TheWorld Browser,first_browser_UC Browser,first_browser_Yandex.Browser,first_browser_wOSBrowser
0,2010-06-28,20090319043255,37.314757,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2011-05-25,20090523174809,38.000000,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2010-09-28,20090609231247,56.000000,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2011-12-05,20091031060129,42.000000,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2010-09-14,20091208061105,41.000000,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275542,2014-09-30,20140930235232,31.000000,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
275543,2014-09-30,20140930235306,37.314757,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
275544,2014-09-30,20140930235408,37.314757,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
275545,2014-09-30,20140930235430,37.314757,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Splitting the data again:

In [72]:
# `data` was built by stacking train on top of test_users, so the first
# len(train) rows are the training features and the remainder the test features.
X, X_test = data.iloc[:len(train)], data.iloc[len(train):]

In [79]:
# Map the destination country codes to integer class ids for the classifier.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(labels)
y = le.transform(labels)
# To get back to the country code: le.inverse_transform([2])

In [77]:
#from sklearn import svm
import xgboost as xgb
#from sklearn.model_selection import train_test_split

XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ['dlopen(/Users/bjorelind/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/bjorelind/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


In [None]:
# Predict top 5 countries
# Training model
#
# XGBClassifier has to be imported explicitly: the notebook only does
# `import xgboost as xgb`, so the bare name would raise a NameError.
from xgboost import XGBClassifier

# NOTE(review): X still contains the raw `date_account_created` column, which
# is read from CSV as text; XGBoost is expected to reject object-dtype
# columns, so that column likely needs converting or dropping before this
# fit succeeds — confirm.
# The fitted model is bound to the name `xgb` because the following cell calls
# `xgb.predict_proba`; note that this shadows the `xgb` module alias.
xgb = XGBClassifier(use_label_encoder=False)
xgb.fit(X, y)

In [None]:
# Class-membership probabilities for every test user.
# presumably one column per encoded country, in the order of le.classes_ — confirm
y_pred = xgb.predict_proba(X_test) # Results in a probability distribution