# CMF Data Analysis Training

## Initialisation

In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.grid_search import GridSearchCV
%matplotlib inline

## Data preparation

### Load data

In [2]:
# Load the pickled train/test frames, then the CSV sample file.
# NOTE: unpickling can execute arbitrary code - only load pickles you created yourself.
train, test = (
    pd.read_pickle(path)
    for path in ('../data/train_prep.pkl', '../data/test_prep.pkl')
)
sample = pd.read_csv('../data/sample.csv')

In [3]:
# Read only the target column and pull it out as a Series.
# Explicit column selection replaces `squeeze=True`, which pandas deprecated
# in 1.4 and removed in 2.0; the resulting Series is the same on all versions.
coverType = pd.read_csv('../data/train.csv', usecols=['Cover_Type'])['Cover_Type']

### Prepare X and y matrices

In [4]:
# Feature matrix: the loaded training frame, used as-is.
X = train

# Target vector: cover-type labels re-encoded as consecutive integers
# (LabelEncoder maps the sorted distinct labels to 0..n_classes-1).
le = LabelEncoder()
y = le.fit_transform(coverType)

## Logistic regression

In [5]:
# Hold out 15% of the rows for validation; the fixed seed makes the split repeatable.
Xtr, Xval, ytr, yval = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=777,
)

In [6]:
# (rows, columns) of the training split and of the validation split.
(Xtr.shape, Xval.shape)

((468360, 54), (82652, 54))

In [7]:
# Baseline model: logistic regression with the library's default hyperparameters.
lr = LogisticRegression()
# Fit on the training split only; %time prints the CPU and wall-clock cost of the fit.
%time lr.fit(Xtr, ytr)

CPU times: user 1min 21s, sys: 201 ms, total: 1min 21s
Wall time: 1min 21s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
# Report log loss on the data the model was fitted on and on the held-out split,
# in that order, so the two numbers can be compared for over/under-fitting.
for _label, _features, _target in (
    ('Train logloss', Xtr, ytr),
    ('Validation logloss', Xval, yval),
):
    print(_label, log_loss(_target, lr.predict_proba(_features)))

Train logloss 0.673832068044
Validation logloss 0.677545242971


### Hyperparameter optimization

In [9]:
# 15 candidate values for C, evenly spaced on a log10 scale from 1e-4 to 1e4.
Cs = np.logspace(-4, 4, num=15)
Cs

array([  1.00000000e-04,   3.72759372e-04,   1.38949549e-03,
         5.17947468e-03,   1.93069773e-02,   7.19685673e-02,
         2.68269580e-01,   1.00000000e+00,   3.72759372e+00,
         1.38949549e+01,   5.17947468e+01,   1.93069773e+02,
         7.19685673e+02,   2.68269580e+03,   1.00000000e+04])

In [13]:
# Search space: only the inverse regularisation strength C is tuned.
grid = {'C': Cs}

# 10-fold cross-validated search scored by log loss.
# NOTE(review): the scorer name 'log_loss' is tied to the old scikit-learn API this
# notebook imports from; scikit-learn 0.18+ calls it 'neg_log_loss' - confirm the
# installed version before re-running.
gridsearch = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=grid,
    scoring='log_loss',
    cv=10,
)

In [14]:
# Run the cross-validated search over every value in Cs on the full X, y.
# This is the expensive cell: the recorded run below took roughly 3h40m wall time.
%time gridsearch.fit(X, y)

CPU times: user 3h 41min 21s, sys: 40.7 s, total: 3h 42min 2s
Wall time: 3h 39min 45s


GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-04,   3.72759e-04,   1.38950e-03,   5.17947e-03,
         1.93070e-02,   7.19686e-02,   2.68270e-01,   1.00000e+00,
         3.72759e+00,   1.38950e+01,   5.17947e+01,   1.93070e+02,
         7.19686e+02,   2.68270e+03,   1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, scoring='log_loss', verbose=0)