In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import csv
import string
from datetime import datetime

from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn import metrics




In [3]:
# Read the raw train and test user tables from the working directory.
df_train = pd.read_csv('./train_users_2.csv')
df_test = pd.read_csv('./test_users.csv')

# Keep the test ids, and record how many rows belong to the training table
# (dropping a column below does not change the row count).
id_test = df_test['id']
piv_train = len(df_train)

# Pull the target column out as an array, then remove it from the features.
labels = df_train['country_destination'].values
df_train = df_train.drop('country_destination', axis=1)

In [8]:
# Concatenate the train and test user files so that both go through exactly
# the same cleaning and end up with the same set of feature columns.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

# Drop columns unnecessary for prediction.
df_all = df_all.drop(['id', 'date_first_booking'], axis=1)

# Fill NA values with -1.
df_all = df_all.fillna(-1)

# Split the account-creation date into 3 columns (year, month, day) to avoid
# having a feature for every possible date. '/' is normalised to '-' first so
# that both separators parse the same way.
# This is done with vectorised string methods on a derived Series rather than
# by assigning element-by-element through `.values`, which was slow and only
# worked while `.values` happened to expose a writable view of the column.
# NOTE(review): assumes the three fields are ordered year, month, day -- confirm
# against the raw CSV.
dac = (
    df_all['date_account_created']
    .astype(str)
    .str.replace('/', '-')
    .str.split('-', expand=True)
    .astype(int)
    .values
)
df_all['dac_year'] = dac[:, 0]
df_all['dac_month'] = dac[:, 1]
df_all['dac_day'] = dac[:, 2]
df_all = df_all.drop(['date_account_created'], axis=1)

# Encode categorical features with dummy (one-hot) columns. A single
# get_dummies call removes each listed column and appends its dummies,
# prefixed with the column name, in the order given here.
ohe_feats = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
]
df_all = pd.get_dummies(df_all, columns=ohe_feats)

In [5]:
# Undo the earlier concatenation now that cleaning is done: the first
# `piv_train` rows of the feature matrix are the training users and the
# remaining rows are the test users.
vals = df_all.values
train_vals, test_vals = np.split(vals, [piv_train])

In [6]:
# Split the labelled rows into train and dev sets.
# The seed is fixed so the random mask -- and therefore the split -- is the
# same on every run. Each row lands in train when its uniform draw is < 0.75.
np.random.seed(0)
msk = np.random.rand(len(train_vals)) < 0.75

# Mask selects the train rows; its complement selects the dev rows.
train, dev = train_vals[msk], train_vals[~msk]
train_labs, dev_labs = labels[msk], labels[~msk]

In [7]:
# Grid-search the `C` parameter of a logistic regression on the train split,
# scoring each candidate value with micro-averaged F1.
strengths = {'C': [0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 1.0]}
clf_lr = GridSearchCV(LogisticRegression(), strengths, scoring='f1_micro')
clf_lr.fit(train, train_labs)

# Evaluate the fitted search object on the held-out dev split.
preds = clf_lr.predict(dev)

# Each message is a single pre-formatted string in parentheses, so the line
# parses and prints the same text whether `print` is the Python 2 statement
# or the Python 3 function.
print("Optimal Regularization Strength: %s" % (clf_lr.best_params_,))
print("LogReg F1: %s" % metrics.f1_score(dev_labs, preds, average='micro'))

Optimal Regularization Strength: {'C': 0.0001}
LogReg F1: 0.584372789217


In [96]:
# Generate Kaggle Submission
#
# Predict on the *test* users here. The `preds` array produced when the model
# was evaluated holds dev-split predictions, so it does not line up with the
# ids in `id_test` (different rows, different length) and must not be written
# to the submission file.
test_preds = clf_lr.predict(test_vals)

# One row per test user: the user id and the predicted destination country.
sub = pd.DataFrame(
    {'id': id_test.values, 'country': test_preds},
    columns=['id', 'country'],
)
sub.to_csv('sub.csv', index=False)