# Logistic Regression Training

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import h5py
import re
import pandas as pd

### Load code for this project

In [None]:
import logistic.train
import logistic.eval
%load_ext autoreload
%autoreload 1
%aimport logistic.train
%aimport logistic.eval

In [None]:
import pickle

import numpy as np
import os
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

from utils.data import CLASS_NAMES, load_discretized_data

# Positions of the feature array and the target array inside each data split
# (the notebook indexes splits as `split[FEATURES]` / `split[TARGETS]`).
FEATURES, TARGETS = 0, 1

## Train a logistic regression classifier.

In [None]:
# Directory passed to `load_discretized_data`.
data_dir = "./data_processing/voxels/"

# Seed NumPy's global random number generator.
seed = 71
np.random.seed(seed)

# Only the first element of the returned pair is kept here; it is used as the
# training split in the cells below.
train, _ = load_discretized_data(data_dir, prefix='', binary=True)

# Number of leading examples to train on. The sentinel -1 is replaced by the
# total number of targets so that the `[:examples_limit]` slices used later
# select every example.
examples_limit = 1000
examples_limit = (
    train[TARGETS].shape[0] if examples_limit == -1 else examples_limit
)

## Build model

Use standard `LogisticRegression` with a fixed L2 regularization strength (`C=1.0`):

In [None]:
# Plain logistic regression with a fixed L2 penalty.
model = LogisticRegression(
    solver='saga',
    n_jobs=-1,
    # Must be the None object (no class re-weighting), not the string 'None':
    # scikit-learn only accepts a dict, 'balanced', or None here.
    # Switch to 'balanced' to re-weight classes.
    class_weight=None,
    penalty='l2',  # regularization (penalization)
    C=1.0,  # regularization parameter
    max_iter=4000,
    tol=1e-4,
)

Use `LogisticRegressionCV`, which picks the regularization strength by a cross-validated grid search:

In [None]:
# Logistic regression whose regularization strength is chosen by
# cross-validation over a grid of candidate values.
model_cv = LogisticRegressionCV(
    solver='saga',
    n_jobs=-1,
    # Must be the None object (no class re-weighting), not the string 'None':
    # scikit-learn only accepts a dict, 'balanced', or None here.
    # Switch to 'balanced' to re-weight classes.
    class_weight=None,
    penalty='l2',  # regularization (penalization)
    Cs=10,  # inverse regularization strength (if int, set in a scale)
    cv=5,  # cross-validation fold
    max_iter=4000,
    tol=1e-4,
)

## Train the model

In [None]:
%%time
model.fit(train[FEATURES][:examples_limit],
          train[TARGETS][:examples_limit])

model_cv.fit(train[FEATURES][:examples_limit],
          train[TARGETS][:examples_limit])


log_dir = "logs/"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

model_filename = os.path.join(log_dir, 'logistic_model.pkl')
pickle.dump(model, open(model_filename, 'wb'))
model_filename = os.path.join(log_dir, 'logistic_cv_model.pkl')
pickle.dump(model_cv, open(model_filename, 'wb'))

## Evaluate the logistic regression classifier

Accuracy and classification metrics are printed to the console.

In [None]:
model_file = 'logs/logistic_model.pkl'
assert model_file.endswith('.pkl'), 'model_file must point to a pickle file'

# Set random seeds
np.random.seed(seed)

# Load data; only the second element of the returned pair is kept here and
# used as the evaluation split.
_, test = load_discretized_data(data_dir, prefix='', binary=True)

# -1 is a sentinel meaning "use every example": it is replaced by the number
# of targets so the slices below keep the whole split.
examples_limit = -1
if examples_limit == -1:
    examples_limit = test[TARGETS].shape[0]

X_test = test[FEATURES][:examples_limit]
y_test = test[TARGETS][:examples_limit]

# Load the model. Unpickling can execute arbitrary code, so only point
# `model_file` at pickles produced by this notebook.
# NOTE: `model_cv` is not reloaded from disk; the in-memory estimator fitted
# in the training cell is evaluated.
with open(model_file, 'rb') as model_fh:
    model = pickle.load(model_fh)

# Evaluate the models
acc = model.score(X_test, y_test)
acc_cv = model_cv.score(X_test, y_test)

# Make predictions. Each model gets its own variable so that each report
# below describes the model named in its heading.
preds = model.predict(X_test)
preds_cv = model_cv.predict(X_test)

# Get classification metrics
report = classification_report(y_test, preds,
                               target_names=CLASS_NAMES,
                               digits=2)

report_cv = classification_report(y_test, preds_cv,
                                  target_names=CLASS_NAMES,
                                  digits=2)

# Print the results
print('\nClassification Report, LogisticRegression:\n')
print(report)
print('Accuracy: {}\n'.format(acc))

print('\nClassification Report, LogisticRegressionCV:\n')
print(report_cv)
print('Accuracy: {}\n'.format(acc_cv))