In [1]:
from utils import css_from_file
# Presumably applies the notebook's custom CSS from style/style.css --
# NOTE(review): confirm against utils.css_from_file, which is not visible here.
css_from_file('style/style.css')

In [2]:
import pandas as pd
import numpy as np

# sklearn.cross_validation was removed in scikit-learn 0.20; KFold now lives in
# sklearn.model_selection (its constructor takes n_splits, and the data is
# passed to .split() rather than to the constructor).
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss



Read the data from the files __data/boehringer/train.csv__ and __data/boehringer/test.csv__.

The first column is the binary variable that you want to predict (`Activity` in the code below). The remaining columns are numerical features.

In [3]:
def load(path):
    """Read a CSV file and split it into features and the ``Activity`` target.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` is the frame without the
        ``Activity`` column and ``target`` is the ``Activity`` column itself.
        If the file has no ``Activity`` column, ``target`` is all NaN.
    """
    frame = pd.read_csv(path)

    # Files without the target get a NaN placeholder column so that both
    # kinds of file come back in the same (features, target) shape.
    target_missing = "Activity" not in frame.columns
    if target_missing:
        frame["Activity"] = np.nan

    features = frame.drop(columns=["Activity"])
    target = frame["Activity"]
    return features, target
    
# Load both splits; load() returns (features, Activity target). For a file
# without an Activity column the target comes back as an all-NaN placeholder.
X_tr, y_tr = load("data/boehringer/train.csv")
X_te, y_te = load("data/boehringer/test.csv")

# Sanity check: print the (rows, feature columns) shape of each split.
print("training data shape", X_tr.shape)
print("testing data shape", X_te.shape)

training data shape (3751, 1776)
testing data shape (2501, 1776)


In [4]:
# Peek at the first rows of the training features.
X_tr.head()

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,0.243144,...,0,0,0,0,0,0,0,0,0,0
1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,0.10648,...,1,1,1,1,0,1,0,0,1,0
2,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,0.352308,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,0.208989,...,0,0,0,0,0,0,0,0,0,0
4,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,0.125177,...,0,0,0,0,0,0,0,0,0,0


Exercise
---------------------

Using the starter code below, try to improve the solution.

1. What kinds of models can you use?
2. Try changing model parameters to get the best cross-validation error.
3. Use a pipeline to transform features before modeling:
   - use some feature selection mechanism
   - use a dimensionality reduction method (PCA, SVD, etc.)
   
Tip: It is ok to loop over models and datasets like this.

```python
for data in [pipeline_1, pipeline_2, pipeline_3]:
    for model in [model_1, model_2, model_3]:
        # do stuff
```

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression, BayesianRidge, SGDClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomTreesEmbedding
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

# Fixed seed so the randomized estimators produce the same score on every run.
SEED = 42

# Feature stage: PCA components concatenated with a random-trees embedding.
# Model stage: a random forest on the scaled, combined features.
# Alternative final estimators to swap in: BaggingClassifier(), SGDClassifier,
# MLPClassifier, or a GridSearchCV over RandomForestClassifier's max_depth.
clf = make_pipeline(
    make_union(
        PCA(),
        RandomTreesEmbedding(n_estimators=200, max_depth=70, n_jobs=-1, random_state=SEED),
    ),
    # with_mean=False: the tree embedding is a sparse encoding, and centering
    # sparse input is not supported by StandardScaler.
    StandardScaler(with_mean=False),
    RandomForestClassifier(n_estimators=200, max_depth=100, random_state=SEED),
)

# Out-of-fold class probabilities: every training row is predicted by a model
# that did not see it during fitting.
oof_predictions = cross_val_predict(clf, X_tr, y_tr, method="predict_proba", n_jobs=-1, verbose=1)

# log_loss is imported in the notebook's first import cell.
err = log_loss(y_tr, oof_predictions)
print("Your error is", err)
if err > 0.5:
    print("You can still improve :)")

KeyboardInterrupt: 

Examples of classifiers 

<div class='spoiler'>

clf1 = RandomForestClassifier(n_estimators=10,n_jobs=-1)
clf2 = make_pipeline(PCA(), LogisticRegression())

clf3 = make_pipeline(
   make_union(
       RandomTreesEmbedding(n_estimators=10), 
       LazyTransformer()
   ), 
   LogisticRegression()
)

for clf in [clf1,clf2,clf3]:
    clf.fit(x,y)
    
clf = make_pipeline(make_union(make_pipeline(RandomTreesEmbedding(n_estimators=20), StandardScaler(with_mean=False)), 
                               make_pipeline(StandardScaler(with_mean=False), VarianceThreshold(0.1))),
                    MLPClassifier((15,), alpha=15.0, verbose=True))

clf = BaggingClassifier(make_pipeline(
                        make_union(RandomTreesEmbedding(n_estimators=10), 
                                   LazyTransformer()),
                        StandardScaler(with_mean=False), 
                        VarianceThreshold(0.001),
                        MLPClassifier((25,), alpha=10.0, verbose=False)), 
                        max_samples=0.75,
                        max_features=0.75,
                        n_estimators=10)
</div>