#### Baseline model notebook for CP1: Logistic Regression with hyper-parameter tuning of the regularization strength, using L1 and L2 regularization.  

In [55]:
%matplotlib inline
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

In [68]:
# Load the raw training users table.
df_train = pd.read_csv('train_users_2.csv')

# Ages above this threshold are treated as data-entry errors and blanked out
# so that they are handled together with the genuinely missing ages.
MAX_PLAUSIBLE_AGE = 120

# Vectorized equivalent of the row-wise lambda: values where the condition is
# True become NaN; existing NaNs compare False and therefore stay NaN.
df_train['age'] = df_train['age'].mask(df_train['age'] > MAX_PLAUSIBLE_AGE)

In [69]:
# Number of rows whose age is missing (including the implausible ages blanked out above).
int(df_train['age'].isnull().sum())

88771

In [70]:
# Impute missing ages with the mean of the observed ages.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: that chained in-place form
# emits a FutureWarning on pandas 2.x and leaves df_train unchanged once
# Copy-on-Write is enabled.
# NOTE(review): the mean is computed over the whole table, i.e. before the
# train/test split performed later in this notebook, so the test rows
# contribute to the imputed value.
age_mean = df_train["age"].mean()
df_train["age"] = df_train["age"].fillna(age_mean)

In [71]:
# Preview the first five rows after the age cleanup.
df_train.head(n=5)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,37.41187,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


### Replace country categories with numeric codes

In [72]:
# Destination labels listed in the order of their integer codes:
# the position in this list is the code assigned to the label.
_destination_labels = [
    'NDF', 'US', 'other', 'FR', 'CA', 'GB',
    'ES', 'IT', 'PT', 'NL', 'DE', 'AU',
]

# Nested mapping in the {column: {old_value: new_value}} form accepted by
# DataFrame.replace.
class_dict = {
    "country_destination": {
        label: code for code, label in enumerate(_destination_labels)
    }
}

In [73]:
# Encode the destination labels as the integer codes defined in class_dict.
df_train = df_train.replace(class_dict)

# Cast explicitly instead of relying on replace() to downcast the object
# column to integers (that implicit downcast is deprecated in recent pandas).
# The cast also fails loudly if any label was left unmapped.
df_train["country_destination"] = df_train["country_destination"].astype("int64")

df_train.head(20)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,37.41187,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,0
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,0
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,1
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,2
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,1
5,osr2jwljor,2010-01-01,20100101215619,2010-01-02,-unknown-,37.41187,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,1
6,lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,1
7,0d01nltbrs,2010-01-03,20100103191905,2010-01-13,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,1
8,a1vcnhxeij,2010-01-04,20100104004211,2010-07-29,FEMALE,50.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,1
9,6uh8zyj2gn,2010-01-04,20100104023758,2010-01-04,-unknown-,46.0,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox,1


In [74]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Feature matrix: one row per user, columns are age and the encoded destination.
# NOTE(review): the destination codes are arbitrary integers, yet a linear model
# treats them as an ordered numeric quantity. Consider one-hot encoding.
feature_columns = ['age', 'country_destination']
X = df_train.loc[:, feature_columns].values
print("X: ", type(X), X.shape)

X:  <class 'numpy.ndarray'> (213451, 2)


In [75]:
# Shape of the two-column frame that X was built from, for comparison with X.shape.
df_train.loc[:, ['age', 'country_destination']].shape

(213451, 2)

In [76]:
# Target vector: True where the "gender" column equals "MALE", False otherwise.
# NOTE(review): every other value, including the "-unknown-" entries visible in
# the preview above, ends up in the False class. Confirm this is intended.
y = df_train["gender"].eq("MALE").values

# Inspect the target's type and shape, and compare the shape with that of the
# column it was derived from.
print("y: ", type(y), y.shape)
print("df_train.Gender shape: ", df_train["gender"].shape)


y:  <class 'numpy.ndarray'> (213451,)
df_train.Gender shape:  (213451,)


In [77]:
# AJS: Hold out a test set. test_size=0.25 spells out the library default, and
# the fixed random_state makes the split reproducible from run to run.
Xlr, Xtestlr, ylr, ytestlr = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=5,
)

In [78]:
# AJS: Show each piece of the split with its contents, type, shape and length.
# Order: training features, test features, training labels, test labels.
_split_parts = (
    ("Xlr:", Xlr),          # TrainX
    ("Xtestlr", Xtestlr),   # TestX
    ("ylr", ylr),           # Trainy
    ("ytestlr", ytestlr),   # Testy
)
for _label, _part in _split_parts:
    print("\n")
    print(_label, _part, type(_part), _part.shape, len(_part))

# AJS:
# By default train_test_split splits into 75% train and 25% test.
# Setting random_state to a fixed number guarantees that the output of
# Run 1 equals the output of Run 2, i.e. the split is always the same.



Xlr: [[ 37.41187039   0.        ]
 [ 51.           1.        ]
 [ 29.           0.        ]
 ..., 
 [ 34.           0.        ]
 [ 49.           0.        ]
 [ 24.           0.        ]] <class 'numpy.ndarray'> (160088, 2) 160088


Xtestlr [[ 34.           0.        ]
 [ 33.           0.        ]
 [ 37.41187039   0.        ]
 ..., 
 [ 37.41187039   0.        ]
 [ 37.41187039   0.        ]
 [ 37.41187039   0.        ]] <class 'numpy.ndarray'> (53363, 2) 53363


ylr [False False False ..., False False  True] <class 'numpy.ndarray'> (160088,) 160088


ytestlr [False False False ..., False False False] <class 'numpy.ndarray'> (53363,) 53363


### Splitting data into training and testing
1. ```Xlr``` (the training set) is a matrix with 160088 rows and 2 columns--i.e., 75% of the dataset
2. ```ylr``` is the corresponding label vector for the training dataset
3. ```Xtestlr``` (the testing set) is a matrix with 53363 rows and 2 columns--i.e., 25% of the dataset
4. ```ytestlr``` is the corresponding label vector for the testing dataset

In [79]:
# Build the logistic-regression classifier and fit it on the training split
# (fit returns the estimator itself, so both steps can be chained).
clf = LogisticRegression().fit(Xlr, ylr)

# Predictions on both splits; kept in named variables so they can be reused later.
y_predict_test = clf.predict(Xtestlr)
y_predict_training = clf.predict(Xlr)

# Accuracy on the test split, computed with both argument orders.
# The documented order is (y_true, y_pred); accuracy is an element-wise match
# rate, so swapping the arguments gives the same number.
# ref: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score
test_accuracy_swapped = accuracy_score(y_predict_test, ytestlr)
test_accuracy = accuracy_score(ytestlr, y_predict_test)

# Accuracy on the training split, for comparison with the test score.
training_accuracy = accuracy_score(ylr, y_predict_training)

print("\n")
print("[Test] Accuracy score (y_predict_test, ytestlr):", test_accuracy_swapped)

print("\n")
print("[Test] Accuracy score: (ytestlr, y_predict_test)", test_accuracy)

print("\n")
print("[Training] Accuracy score: (ylr, y_predict_training)", training_accuracy)



[Test] Accuracy score (y_predict_test, ytestlr): 0.743792515413


[Test] Accuracy score: (ytestlr, y_predict_test) 0.743792515413


[Training] Accuracy score: (ylr, y_predict_training) 0.745340062965


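#### The model's training accuracy (0.745340062965) is not very good, which suggests that the model has high "bias" (it underfits). Note that accuracy should be compared against the majority-class baseline: if roughly three quarters of the users are not labelled MALE, always predicting "not MALE" would reach about the same score.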

#### The model's test accuracy (0.743792515413) is very close to the training accuracy, so the model shows little "variance": the gap between training and test performance is negligible. This is an indication that the model will "generalize well", i.e. perform about the same on unseen data as on the training data.