In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, preprocessing, feature_extraction

%matplotlib inline

plt.style.use('seaborn')

In [49]:
test_users = pd.read_csv("test_users.csv")
train_users = pd.read_csv("train_users_2.csv")
print(train_users.shape)
print(test_users.shape)

(213451, 16)
(62096, 15)


In [50]:
all_users = pd.concat((train_users, test_users), axis = 0, ignore_index = True)
all_users.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,affiliate_channel,affiliate_provider,age,country_destination,date_account_created,date_first_booking,first_affiliate_tracked,first_browser,first_device_type,gender,id,language,signup_app,signup_flow,signup_method,timestamp_first_active
0,direct,direct,,NDF,2010-06-28,,untracked,Chrome,Mac Desktop,-unknown-,gxn3p5htnn,en,Web,0,facebook,20090319043255
1,seo,google,38.0,NDF,2011-05-25,,untracked,Chrome,Mac Desktop,MALE,820tgsjxq7,en,Web,0,facebook,20090523174809
2,direct,direct,56.0,US,2010-09-28,2010-08-02,untracked,IE,Windows Desktop,FEMALE,4ft3gnwmtx,en,Web,3,basic,20090609231247
3,direct,direct,42.0,other,2011-12-05,2012-09-08,untracked,Firefox,Mac Desktop,FEMALE,bjjt8pjhuk,en,Web,0,facebook,20091031060129
4,direct,direct,41.0,US,2010-09-14,2010-02-18,untracked,Chrome,Mac Desktop,-unknown-,87mebub9p4,en,Web,0,basic,20091208061105


In [51]:
sessions = pd.read_csv('sessions.csv')
sessions.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


In [52]:
df_sess = sessions.groupby(['user_id']).user_id.count().reset_index(name = 'session_count')
df_sess.head()

Unnamed: 0,user_id,session_count
0,00023iyk9l,40
1,0010k6l0om,63
2,001wyh0pz8,90
3,0028jgx1x1,31
4,002qnbzfs5,789


In [53]:
secs = sessions.groupby(['user_id']).secs_elapsed.sum().reset_index()
secs.columns = ['user_id', 'secs_elapsed']
secs.describe()

Unnamed: 0,secs_elapsed
count,135483.0
mean,1489732.0
std,1892923.0
min,0.0
25%,260598.0
50%,850271.0
75%,2000422.0
max,38221360.0


In [54]:
train_users_labels = train_users.loc[:, 'country_destination']
print(train_users_labels.head())

0      NDF
1      NDF
2       US
3    other
4       US
Name: country_destination, dtype: object


In [55]:
train_users = train_users.drop(['date_first_booking'], axis = 1)
test_users = test_users.drop(['date_first_booking'], axis = 1)

In [56]:
#  Split the feature "data_account_created" to "year", "month", "day"
date_acc_created_train = np.vstack(train_users.date_account_created.astype(str).apply(
    lambda x : list(map(int, x.split('-')))).values)

train_users['create_year'] = date_acc_created_train[:, 0]
train_users['create_month'] = date_acc_created_train[:, 1]
train_users['create_day'] = date_acc_created_train[:, 2]
train_users = train_users.drop(['date_account_created'], axis = 1)

date_acc_created_test = np.vstack(test_users.date_account_created.astype(str).apply(
    lambda x : list(map(int, x.split('-')))).values)

test_users['create_year'] = date_acc_created_test[:, 0]
test_users['create_month'] = date_acc_created_test[:, 1]
test_users['create_day'] = date_acc_created_test[:, 2]
test_users = test_users.drop(['date_account_created'], axis = 1)

In [57]:
train_users.loc[train_users.gender == '-unknown-', 'gender'] = -1
train_users.loc[train_users.gender.isnull(), 'gender'] = -1
test_users.loc[test_users.gender == '-unknown-', 'gender'] = -1
test_users.loc[test_users.gender.isnull(), 'gender'] = -1

In [58]:
gender_enc = {'FEMALE' : 0,
             'MALE' : 1,
             'OTHER' : 2,
             -1 : -1}
for data in [train_users, test_users]:
    data.gender = data.gender.apply(lambda x : gender_enc[x])

In [59]:
train_users.loc[train_users.age > 90, 'age'] = np.nan
train_users.loc[train_users.age < 16, 'age'] = np.nan
test_users.loc[test_users.age > 90, 'age'] = np.nan
test_users.loc[test_users.age < 16, 'age'] = np.nan

In [60]:
train_users.loc[train_users.age.isnull(), 'age'] = train_users.age.median()
test_users.loc[test_users.age.isnull(), 'age'] = test_users.age.median()

In [61]:
signup_enc = {'facebook' : 0,
             'google' : 1,
             'basic' : 2,
             'weibo' : 3}
for data in [train_users, test_users]:
    data.signup_method = data.signup_method.apply(lambda x : signup_enc[x])

In [62]:
test_users.loc[test_users.language == '-unknown-', 'language'] = test_users.language.mode()[0]

In [63]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_users.language = le.fit_transform(train_users.language)
test_users.language = le.fit_transform(test_users.language)

In [64]:
train_users.affiliate_channel = le.fit_transform(train_users.affiliate_channel)
train_users.affiliate_provider = le.fit_transform(train_users.affiliate_provider)
test_users.affiliate_channel = le.fit_transform(test_users.affiliate_channel)
test_users.affiliate_provider = le.fit_transform(test_users.affiliate_provider)

train_users.loc[train_users.first_affiliate_tracked.isnull(), 'first_affiliate_tracked'] = 'untracked'
train_users.first_affiliate_tracked = le.fit_transform(train_users.first_affiliate_tracked)

test_users.loc[test_users.first_affiliate_tracked.isnull(), 'first_affiliate_tracked'] = 'untracked'
test_users.first_affiliate_tracked = le.fit_transform(test_users.first_affiliate_tracked)

In [65]:
train_users.signup_app = le.fit_transform(train_users.signup_app)
train_users.first_device_type = le.fit_transform(train_users.first_device_type)
train_users.first_browser = le.fit_transform(train_users.first_browser)
test_users.signup_app = le.fit_transform(test_users.signup_app)
test_users.first_device_type = le.fit_transform(test_users.first_device_type)
test_users.first_browser = le.fit_transform(test_users.first_browser)

In [66]:
## session
df = sessions.user_id.value_counts()
print(df.shape)
print(df.head())

(135483,)
mxqbh3ykxl    2722
0hjoc5q8nf    2644
mjbl6rrj52    2476
l5lgm3w5pc    2424
wg9413iaux    2362
Name: user_id, dtype: int64


In [67]:
df = df.to_frame()

In [68]:
df = df.rename(columns = {'user_id' : 'session_count'})
df['id'] = df.index
df.head()

Unnamed: 0,session_count,id
mxqbh3ykxl,2722,mxqbh3ykxl
0hjoc5q8nf,2644,0hjoc5q8nf
mjbl6rrj52,2476,mjbl6rrj52
l5lgm3w5pc,2424,l5lgm3w5pc
wg9413iaux,2362,wg9413iaux


In [69]:
train_users = pd.merge(train_users, df, how = 'left', on = ['id'])
test_users = pd.merge(test_users, df, how = 'left', on = ['id'])

In [70]:
train_users.session_count.fillna(0, inplace = True)
test_users.session_count.fillna(0, inplace = True)

In [71]:
train_users.session_count = train_users.session_count.astype(int)
test_users.session_count = test_users.session_count.astype(int)

In [72]:
label_df = train_users_labels.to_frame()
for data in [label_df]:
    data.country_destination = le.fit_transform(data.country_destination)

label_df.head()

Unnamed: 0,country_destination
0,7
1,7
2,10
3,11
4,10


In [73]:
train_users = train_users.drop(['id'], axis = 1)
train_users = train_users.drop([ 'timestamp_first_active'], axis = 1)
train_users = train_users.drop(['country_destination'], axis = 1)
train_users.head()

Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,create_year,create_month,create_day,session_count
0,-1,33.0,0,0,5,2,4,6,2,3,8,2010,6,28,0
1,1,38.0,0,0,5,7,8,6,2,3,8,2011,5,25,0
2,0,56.0,2,3,5,2,4,6,2,6,21,2010,9,28,0
3,0,42.0,0,0,5,2,4,6,2,3,17,2011,12,5,0
4,-1,41.0,2,0,5,2,4,6,2,3,8,2010,9,14,0


In [74]:
## 把维度改为一致
test_users1= test_users.drop(['id','timestamp_first_active'],axis=1)
test_users1.head()

Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,create_year,create_month,create_day,session_count
0,0,35.0,0,0,5,1,4,6,1,8,17,2014,7,1,8
1,-1,31.0,2,0,5,1,4,6,1,8,17,2014,7,1,19
2,-1,31.0,2,0,5,1,4,0,2,6,5,2014,7,1,58
3,-1,31.0,2,0,5,1,4,0,2,6,11,2014,7,1,11
4,-1,31.0,2,0,5,1,4,6,2,3,23,2014,7,1,19


In [75]:
train_users = train_users.drop(['create_day'], axis = 1)
test_users1 = test_users1.drop(['create_day'], axis = 1)
test_users1.shape

(62096, 14)

In [76]:
from sklearn import preprocessing
ss = preprocessing.StandardScaler()
train_users_scaled = pd.DataFrame(ss.fit_transform(train_users))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [30]:
# naive bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [31]:
gnb = GaussianNB()
gnb.fit(train_users_scaled, label_df.values.ravel())

GaussianNB(priors=None, var_smoothing=1e-09)

In [32]:
test_users_scaled = pd.DataFrame(ss.fit_transform(test_users1))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [34]:
country = gnb.predict(test_users_scaled)

In [38]:
submission = pd.DataFrame({
    "id" : test_users['id'],
    "country" : le.inverse_transform(country).tolist()
})

In [39]:
submission.to_csv('submission_bayes_only1.csv', index = False)

In [95]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(max_depth = 4, learning_rate = 0.15, n_estimators = 200, random_state = 817)
gb.fit(train_users_scaled, label_df.values.ravel())

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.15, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              n_iter_no_change=None, presort='auto', random_state=817,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [96]:
prediction_proba = gb.predict_proba(test_users_scaled)
ids_test = test_users['id']

ids = []
countries = []

for i in range(len(ids_test)):
    idx = ids_test[i]
    ids += [idx] * 5
    countries += le.inverse_transform(np.argsort(prediction_proba[2])[::-1][:5]).tolist()
    
submission = pd.DataFrame({
    "id" : ids,
    "country" : countries
})
submission.to_csv('submission_gbc.csv', index = False)

In [97]:
submission.to_csv('submission_gbc_1.csv', index = False)