# Logistic Regression + GridSearchCV

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import csv
import string
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder




In [4]:
# Load the raw Kaggle user tables from the working directory.
df_train = pd.read_csv('./train_users_2.csv')
df_test = pd.read_csv('./test_users.csv')

# Keep the test user ids aside; they are needed to build the submission file.
id_test = df_test.id

# Pull the target column out of the training frame so that train and test
# share the same set of feature columns.
labels = df_train.country_destination.values
df_train = df_train.drop('country_destination', axis=1)

# Number of training rows: used later to split the combined frame back apart.
piv_train = len(df_train.index)

# Clean up and then encode each categorical variable

In [7]:
# Split the account-creation date into three integer columns (month, day, year)
# to avoid having one feature for every possible date.
# The training file is parsed as '/'-separated month/day/year.
dac = np.vstack(df_train.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('/')))).values)
df_train['acct_create_month'] = dac[:,0]
df_train['acct_create_day'] = dac[:,1]
df_train['acct_create_year'] = dac[:,2]

# The test file uses a different format ('-'-separated year-month-day), so the
# column positions differ, but the resulting three columns mean the same thing.
dac = np.vstack(df_test.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
df_test['acct_create_month'] = dac[:,1]
df_test['acct_create_day'] = dac[:,2]
df_test['acct_create_year'] = dac[:,0]

# Concatenate the train and test user frames so that every categorical level is
# encoded consistently across both; training rows come first.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

# Combine account-creation month and year into a single "month/year" string.
df_all['acct_create_month_year'] = df_all.acct_create_month.astype(str) +'/'+ df_all.acct_create_year.astype(str)

# Drop columns unnecessary for prediction (including the intermediate
# day/month/year columns, now summarised by acct_create_month_year).
df_all = df_all.drop(['id', 'date_first_booking','date_account_created','timestamp_first_active','acct_create_day','acct_create_month','acct_create_year'], axis=1)

# Set unknown gender values to NA so they are handled like other missing values.
df_all.gender = df_all.gender.replace('-unknown-',np.nan)

# Fill every NA value with the sentinel -1.
df_all = df_all.fillna(-1)

# The age field is populated with some outlying values and some year values
# (e.g., 2014). Keep only ages between 14 and 100; everything else (including
# the -1 sentinel written above) becomes -1.
av = df_all.age.values
df_all['age'] = np.where(np.logical_or(av<14, av>100), -1, av)

# One-hot encode every remaining feature. Each column is replaced by its dummy
# columns, which are appended on the right, prefixed with the original name.
# Note that 'age' is treated as categorical here too (one column per age value).
categorical = ['acct_create_month_year', 'age', 'gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
for f in categorical:
    df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
    df_all = df_all.drop([f], axis=1)
    df_all = pd.concat((df_all, df_all_dummy), axis=1)

# Call form with a single argument prints the same text on Python 2 and 3.
print("Done!")

Unnamed: 0,acct_create_month_year_1/2010,acct_create_month_year_1/2011,acct_create_month_year_1/2012,acct_create_month_year_1/2013,acct_create_month_year_1/2014,acct_create_month_year_10/2010,acct_create_month_year_10/2011,acct_create_month_year_10/2012,acct_create_month_year_10/2013,acct_create_month_year_11/2010,...,first_browser_Silk,first_browser_SiteKiosk,first_browser_SlimBrowser,first_browser_Sogou Explorer,first_browser_Stainless,first_browser_TenFourFox,first_browser_TheWorld Browser,first_browser_UC Browser,first_browser_Yandex.Browser,first_browser_wOSBrowser
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Split training data into a training and development set

In [12]:
# After cleansing, split the combined matrix back up between the train and
# test users. The first `piv_train` rows of `df_all` are the training users.
vals = df_all.values
full_train = vals[:piv_train]
full_test = vals[piv_train:]

# Split the training rows between train & dev sets: each row is kept for
# training when its uniform random draw is below 0.75 (so roughly 75% / 25%).
# The seed is fixed so the split is reproducible.
np.random.seed(0)
msk = np.random.rand(len(full_train)) < 0.75

# Use `full_train` here: it is the array defined above. The name previously
# used in this cell was never assigned anywhere in the notebook.
split_train_data = full_train[msk]
split_train_labels = labels[msk]

split_dev_data = full_train[~msk]
split_dev_labels = labels[~msk]

# Using GridSearchCV, identify the optimal regularization strength and predict the label of the dev data to understand the model's performance. Then, run the model again to predict for the test data.

In [14]:
# Candidate values for LogisticRegression's C parameter.
strengths = {'C': [0.0001,0.001,0.01,0.1,0.3,0.5,1.0]}

# GridSearch for optimal regularization strength, scored by micro-averaged F1.
# NOTE(review): GridSearchCV is imported from `sklearn.grid_search` at the top
# of the notebook; newer scikit-learn releases expose it from
# `sklearn.model_selection` instead - confirm against the installed version.
clf_lr = GridSearchCV(LogisticRegression(), strengths, scoring='f1_micro')

################################
# Fitting 75% of the training set and then predicting the remaining 25%
################################

clf_lr.fit(split_train_data, split_train_labels)

# development predictions
dev_preds = clf_lr.predict(split_dev_data)

# str.format with a single print argument produces the same text on Python 2
# and Python 3 (a multi-argument print(...) would print a tuple on Python 2).
print("Optimal Regularization Strength: {}".format(clf_lr.best_params_))
print("LogReg F1: {}".format(metrics.f1_score(split_dev_labels, dev_preds, average='micro')))

################################
# Fitting the full training set and predicting on the test set
################################
# Re-running fit repeats the grid search on all training rows; the class
# probabilities for the test users are kept for building the submission.
clf_lr.fit(full_train, labels)
test_preds = clf_lr.predict_proba(full_test)

Optimal Regularization Strength: {'C': 0.001}
LogReg F1: 0.623654913058


In [10]:
# Fit the encoder on the training labels (`labels`, defined when the data was
# loaded) so that integer class indices can be mapped back to country codes.
# NOTE(review): this assumes the column order of `test_preds` matches
# `le.classes_` - confirm against the fitted classifier's `classes_`.
le = LabelEncoder().fit(labels)

# Taking the 5 classes with highest probabilities for every test user.
user_ids = []  # each test id repeated 5 times
countries = []  # the 5 predicted countries per id, most probable first
for i in range(len(id_test)):
    idx = id_test[i]
    user_ids += [idx] * 5
    # argsort is ascending, so reverse it to rank by descending probability,
    # map the class indices back to labels, then keep the first five.
    countries += le.inverse_transform(np.argsort(test_preds[i])[::-1])[:5].tolist()

In [31]:
# Generate Kaggle Submission
# Pair each (repeated) user id with its predicted country, one pair per row,
# and write the two-column table to disk without the index column.
submission_rows = np.column_stack((user_ids, countries))
sub = pd.DataFrame(submission_rows, columns=['id', 'country'])
sub.to_csv('sub_LogReg.csv', index=False)