In [35]:
%matplotlib inline

In [36]:
from nose.tools import *
# Write your imports in the cell below

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

In [38]:
# Seed NumPy's global random generator so that repeated runs of the notebook draw the same random numbers.
np.random.seed(1234)

# Model Training and Improvement Lab
## Comparing and selecting models

### 1. Read the data (1 point)
As in the previous lab, you need to read the Portuguese bank dataset, available [here](https://archive.ics.uci.edu/ml/machine-learning-databases/00222/). It has been provided for you in the `data` folder.

Read the dataset using `pandas` (you can use the library with the alias `pd`). Save it in the `bank_data` variable.

In [39]:
# The file uses ';' as its field delimiter, so the separator has to be passed explicitly.
bank_data = pd.read_csv("data/bank.csv", sep=';')

In [40]:
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [41]:
bank_data.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [42]:
bank_data.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


In [43]:
# Percentage of rows in each target class ("no" / "yes"); shows how imbalanced the labels are.
bank_data.groupby("y").size()/len(bank_data.y)*100

y
no     88.476001
yes    11.523999
dtype: float64

In [44]:
# From now on, all test cells might contain hidden tests. If you follow the instructions correctly, 
# your solution will be graded with maximum points
assert_is_not_none(bank_data)

### 2. Preprocess the data (1 point)
Separate explanatory features from labels. Save all features (16 columns total) in the variable `bank_features`. Save the labels (corresponding to the `y` column) in the `bank_labels` variable. Rewrite the labels to be `0` and `1` instead of `no` and `yes`: `bank_labels` should be a numeric column.

In [45]:
# The target column "y" is removed; all remaining columns are explanatory features.
bank_labels = None  # placeholder; the labels are extracted in a later cell
bank_features = bank_data.drop(columns=["y"])

In [46]:
bank_features.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown


In [47]:
bank_features.shape

(4521, 16)

In [48]:
# The task asks for a numeric label column: encode "yes" as 1 and "no" as 0.
bank_labels = (bank_data.y == "yes").astype(int)

In [49]:
bank_labels.shape

(4521,)

In [50]:
assert_is_not_none(bank_features)
assert_is_not_none(bank_labels)

### 3. Get indicator variables (1 point)
Get indicator (dummy) variables for all categorical columns in `bank_features`. Overwrite the `bank_features` variable to store the new data.

In [51]:
# One-hot encode the categorical (object-typed) columns; the numeric columns are kept as they are.
bank_features = pd.get_dummies(bank_features)

In [57]:
bank_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 51 columns):
age                    4521 non-null int64
balance                4521 non-null int64
day                    4521 non-null int64
duration               4521 non-null int64
campaign               4521 non-null int64
pdays                  4521 non-null int64
previous               4521 non-null int64
job_admin.             4521 non-null uint8
job_blue-collar        4521 non-null uint8
job_entrepreneur       4521 non-null uint8
job_housemaid          4521 non-null uint8
job_management         4521 non-null uint8
job_retired            4521 non-null uint8
job_self-employed      4521 non-null uint8
job_services           4521 non-null uint8
job_student            4521 non-null uint8
job_technician         4521 non-null uint8
job_unemployed         4521 non-null uint8
job_unknown            4521 non-null uint8
marital_divorced       4521 non-null uint8
marital_married        4521 non-n

In [52]:
assert_equal(bank_features.shape, (4521, 51))

In [60]:
# Toy input: two identical columns holding 0, 0, 1, 1, used below to
# inspect what StandardScaler computes (StandardScaler is imported in the
# import cell at the top of the notebook).
data = [[0, 0], [0, 0], [1, 1], [1, 1]]
scaler = StandardScaler()

In [62]:
scaler.fit(data)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [63]:
scaler.mean_

array([0.5, 0.5])

In [65]:
scaler.var_

array([0.25, 0.25])

In [66]:
scaler.transform(data)

array([[-1., -1.],
       [-1., -1.],
       [ 1.,  1.],
       [ 1.,  1.]])

### 4. Split the data (1 point)
Split the data into training and testing sets, with 70% of the data for training. Because the output labels are not equally distributed, use stratification based on the `bank_labels`.

In [None]:
# 70/30 split. Stratifying on the labels keeps the share of positive
# examples (about 11.5% of the rows) the same in both parts.
# No random_state is passed, so the shuffle draws from NumPy's global
# random generator, which is seeded near the top of the notebook.
(
    bank_features_train,
    bank_features_test,
    bank_labels_train,
    bank_labels_test,
) = train_test_split(
    bank_features,
    bank_labels,
    train_size=0.7,
    stratify=bank_labels,
)

In [None]:
assert_is_not_none(bank_features_train)
assert_is_not_none(bank_labels_train)
assert_is_not_none(bank_features_test)
assert_is_not_none(bank_labels_test)

### 5. Train a baseline algorithm (1 point)
Train a logistic regression using the training data. Use 1 000 000 (`1e6`) as the value of C. Score it using the testing data. Save the score in the `baseline_score` variable. You should see a fairly high score.

In [None]:
# C is the inverse regularization strength, so C=1e6 leaves the model
# practically unregularized.
model = LogisticRegression(C=1e6)
model.fit(bank_features_train, bank_labels_train)

# score() reports plain accuracy; it is computed on the held-out test set.
baseline_score = model.score(bank_features_test, bank_labels_test)
print(baseline_score)

In [None]:
assert_is_not_none(model)
assert_greater(baseline_score, 0.7)

### 6. Select a better score (2 points)
As you already saw, the positive examples are very few. If you aren't convinced, just check the counts.

We know that the default scoring (accuracy) isn't correct in this case. Better measures would be precision and recall. However, we only want one number. Evaluate the algorithm once again, using a standard scoring method which combines precision and recall. Overwrite the `baseline_score` variable.

Don't forget to score the model on the testing data only.

In [None]:
# F1 is the harmonic mean of precision and recall. Its default positive
# label is 1, which relies on the 0/1 label encoding required in section 2.
# Only the testing data is used for scoring.
baseline_score = f1_score(bank_labels_test, model.predict(bank_features_test))
print(baseline_score)

In [None]:
assert_less(baseline_score, 0.7)

### 7. Tune your model (2 points)
Fine-tune the `C`, `max_iter` and `fit_intercept` parameters.

Use full grid search with the following values:
* `C`: 0.0001, 0.01, 0.1, 1, 10, 100, 10000
* `max_iter`: 50, 100, 300, 1000
* `fit_intercept`: True, False

Save the grid search result in the `grid_search` variable. Don't forget to use the better scoring model that you obtained in the previous task.

In [None]:
# Candidate values requested by the task; the full grid (every combination)
# is evaluated with cross-validation.
param_grid = {
    "C": [0.0001, 0.01, 0.1, 1, 10, 100, 10000],
    "max_iter": [50, 100, 300, 1000],
    "fit_intercept": [True, False],
}

# scoring="f1" makes the search optimize the same metric as problem 6
# instead of accuracy. Only the training data is passed to the search.
grid_search = GridSearchCV(LogisticRegression(), param_grid, scoring="f1")
grid_search.fit(bank_features_train, bank_labels_train)

In [None]:
assert_is_not_none(grid_search)
assert_is_not_none(grid_search.best_estimator_)

### 8. Compare scores (1 point)
Use the best estimator from your grid search. Score it using the function from problem 6. Save your answer in `tuned_score`.

In [None]:
# With GridSearchCV's default refit=True, best_estimator_ is retrained on
# the whole training set using the best parameter combination. Score it on
# the test set with the same F1 metric that was used for the baseline.
tuned_predictions = grid_search.best_estimator_.predict(bank_features_test)
tuned_score = f1_score(bank_labels_test, tuned_predictions)

In [None]:
print(tuned_score)

In [None]:
print(baseline_score - tuned_score)

Hmmmm, it seems we have not obtained a better algorithm, even the opposite (the difference is marginal and depends on the random initialization of the cross-validation datasets).

We can, of course, do a lot more things to improve our model's performance, such as normalizing the data, feature selection and feature engineering, trying out different aspects, e.g. polynomial terms, RANSAC; even boosting (we'll talk about this later). However, we'll stop at this point.

What can we conclude? It seems that this is close to the best performance we can get out of this algorithm, given these data points.

We can try improving (cleaning) our dataset, selecting features, etc. but we most likely need a better algorithm. In the next labs, we're going to explore that.