# KAGGLE MEETUP #10: BNP

## 4 steps:
    - 1: Load original dataset and create new variant
    - 2: Train classifiers on each datasets and create oof predictions
    - 3: Train classifiers on oof predictions from step 2
    - 4: Average the test predictions of the step 3 classifier

In [1]:
__author__ = 'Ardalan'
import zipfile, copy, pickle
import numpy as np
import pandas as pd

from sklearn import ensemble, linear_model, svm
from sklearn.metrics import log_loss
from sklearn.cross_validation import StratifiedKFold

DATA_FOLDER = "/home/ardalan/Documents/kaggle/bnp/data/"

In [2]:
def loadFileinZipFile(zip_filename, dtypes=None, parsedate = None, password=None, **kvargs):
    """
    Load zipfile to dataframe.
    """
    with zipfile.ZipFile(zip_filename, 'r') as myzip:
        if password:
            myzip.setpassword(password)

        inside_zip_filename = myzip.filelist[0].filename

        if parsedate:
            pd_data = pd.read_csv(myzip.open(inside_zip_filename), sep=',', parse_dates=parsedate, dtype=dtypes, **kvargs)
        else:
            pd_data = pd.read_csv(myzip.open(inside_zip_filename), sep=',', dtype=dtypes, **kvargs)
        return pd_data, inside_zip_filename
    
def create_dataset1(pd_data):
    """
    - Label Encoding Categorical variables
    - Filling NaNs with -999
    """
    
    df = copy.copy(pd_data)
    df['target'] = df['target'].fillna(-1)
    df = df.fillna(-999)
    df = df.drop('v107',1)

    #Label encoding categorical variable
    for col in df.select_dtypes(['object']):
        df[col] = pd.factorize(df[col])[0]

    #Extracting train and test data
    pd_train = df[df.target >= 0]
    pd_test = df[df.target == -1]

    Y = pd_train['target'].values.astype(int)
    test_idx = pd_test['ID'].values

    X = np.array(pd_train.drop(['target','ID'],1))
    X_test = np.array(pd_test.drop(['target','ID'],1))
    
    return X, Y, X_test, test_idx

def create_dataset2(pd_data):
    """
    - One Hot Encoding Categorical variables
    - Filling NaNs with -999
    """

    df = copy.copy(pd_data)
    df['target'] = df['target'].fillna(-1)

    df = df.drop('v107',1)

    cat_vars_selected = ['v110', 'v112', 'v113', 'v125','v24', 'v3',
                         'v30','v31', 'v47', 'v52', 'v56', 'v66', 'v71', 'v74',
                         'v75', 'v79', 'v91']

    df = pd.get_dummies(df, columns=cat_vars_selected, dummy_na=True)
    df['v22'] = pd.factorize(df['v22'])[0]

    df = df.fillna(-999)

    #Extracting train and test data
    pd_train = df[df.target >= 0]
    pd_test = df[df.target == -1]

    Y = pd_train['target'].values.astype(int)
    test_idx = pd_test['ID'].values

    X = np.array(pd_train.drop(['target','ID'],1))
    X_test = np.array(pd_test.drop(['target','ID'],1))

    return X, Y, X_test, test_idx


def LoadParseBlendData(DATA_FOLDER):
    import fnmatch, glob
    folder = DATA_FOLDER + '*'
    pattern = "*.p"
    l_filenames = [path for path in glob.iglob(folder) if fnmatch.fnmatch(path, pattern)]
    print(len(l_filenames), l_filenames)

    dic_log = pickle.load(open(l_filenames[0], 'rb'))

    test_idx = dic_log['test_idx']
    Y = dic_log['blend_Y']

    X = np.zeros((len(dic_log['blend_X']), len(l_filenames)))
    X_test = np.zeros((len(dic_log['blend_X_test']), len(l_filenames)))

    for i, filename in enumerate(l_filenames):
        print(filename)

        dic_log = pickle.load(open(filename, 'rb'))

        X[:, i] = dic_log['blend_X'][:, 0]
        X_test[:, i] = dic_log['blend_X_test']
    return X, Y, X_test, test_idx

## Step 1: Load dataset and create variant

In [3]:
#Loading datasets
pd_train, _ = loadFileinZipFile(DATA_FOLDER + "train.csv.zip")
pd_test, _ = loadFileinZipFile(DATA_FOLDER + "test.csv.zip")
pd_data = pd_train.append(pd_test)

#Creating DATASET1 (D1)
X, Y, X_test, test_idx = create_dataset1(pd_data)
D1 = (X, Y, X_test, test_idx)

#Creating DATASET2 (D2)
X, Y, X_test, test_idx = create_dataset2(pd_data)
D2 = (X, Y, X_test, test_idx)

## Step 2: Train classifiers and create OOF predictions

In [17]:
def models():
    """
    Create a list of [DATASET, Classifier] to train on
    """
    
    
    ET_params = {'n_estimators':50,'max_features': 50,'criterion': 'entropy',
                 'min_samples_split': 4,'max_depth': 35, 'min_samples_leaf': 2}
    
    #ET_params = {'n_estimators':50}
    
    clfs = [
        #[D2, linear_model.LogisticRegression(penalty='l2')]
        [D1, ensemble.RandomForestClassifier(n_jobs=8, **ET_params)],
        [D2, ensemble.RandomForestClassifier(n_jobs=8, **ET_params)],
         
        [D1, ensemble.ExtraTreesClassifier(n_jobs=8, **ET_params)],
        [D2, ensemble.ExtraTreesClassifier(n_jobs=8, **ET_params)],
    ]
    return clfs

In [18]:
clfs = models()
skf = StratifiedKFold(Y, n_folds=5, shuffle=True , random_state=123)

#Cross validation from a list of models
for clf_indice, data_clf in enumerate(clfs):
    
    #Selecting a model from the list
    print("Classifier [%i]" % clf_indice)
    
    X = data_clf[0][0]
    Y = data_clf[0][1]
    X_test = data_clf[0][2]
    test_idx = data_clf[0][3]
    
    clf = data_clf[1]
    clf_name = clf.__class__.__name__
    print(clf)
    
    blend_X = np.zeros((len(X), 1))
    blend_Y = Y
    blend_X_test = np.zeros((len(X_test), 1))
    blend_X_test_fold = np.zeros((len(X_test), len(skf)))
    
    l_train_error = []
    l_val_error = []
    for fold_indice, (train_indices, val_indices) in enumerate(skf):
        
        print("Fold [%i]" % fold_indice)
        xtrain = X[train_indices]
        ytrain = Y[train_indices]
        xval = X[val_indices]
        yval = Y[val_indices]
        
        clf.fit(xtrain, ytrain)
        
        ytrain_pred = clf.predict_proba(xtrain)[:,1]
        yval_pred = clf.predict_proba(xval)[:,1]
        ytest_pred = clf.predict_proba(X_test)[:,1]
        
        # filling blend data sets
        blend_X[val_indices, 0] = yval_pred
        blend_X_test_fold[:, fold_indice] = ytest_pred
        
        # evaluating model
        train_error = log_loss(ytrain, ytrain_pred)
        val_error = log_loss(yval, yval_pred)
        l_train_error.append(train_error)
        l_val_error.append(val_error)
        print("train/val error: [{0:.4f}|{1:.4f}]".format(train_error, val_error))
        
        
    blend_X_test = np.mean(blend_X_test_fold, axis=1)
    
    diclogs = {'blend_X': blend_X,
               'blend_Y': Y,
               'blend_X_test': blend_X_test,
               'test_idx': test_idx,
               'clf_name': clf_name}
        
    #saving relevant information for blending later
    filename = "{}_tr-val_{:.4f}-{:.4f}".format(clf_name, np.mean(l_train_error), np.mean(l_val_error))
    pickle.dump(diclogs, open(DATA_FOLDER+filename + ".p", 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
    
    
    #Saving final predictions
    output_filename = DATA_FOLDER + filename + '.csv'
    np.savetxt(output_filename, np.vstack((test_idx, blend_X_test)).T,
               delimiter=',', fmt='%i,%.10f', header='ID,PredictedProb', comments="")

Classifier [0]
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=35, max_features=50, max_leaf_nodes=None,
            min_samples_leaf=2, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold [0]
train/val error: [0.1644|0.4761]
Fold [1]
train/val error: [0.1637|0.4818]
Fold [2]
train/val error: [0.1623|0.4803]
Fold [3]
train/val error: [0.1623|0.4919]
Fold [4]
train/val error: [0.1634|0.4897]
Classifier [1]
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=35, max_features=50, max_leaf_nodes=None,
            min_samples_leaf=2, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold [0]
train/val error: [0.1958|0.4676]
Fold [1]
t

## Step 3: Train classifiers on prediction from previous step

#### Gathering prediction from previous layer

In [19]:
X, Y, X_test, test_idx = LoadParseBlendData(DATA_FOLDER)
D_BLEND = (X, Y, X_test, test_idx)

5 ['/home/ardalan/Documents/kaggle/bnp/data/LogisticRegression_tr-val_0.5202-0.5212.p', '/home/ardalan/Documents/kaggle/bnp/data/RandomForestClassifier_tr-val_0.1632-0.4840.p', '/home/ardalan/Documents/kaggle/bnp/data/RandomForestClassifier_tr-val_0.1948-0.4721.p', '/home/ardalan/Documents/kaggle/bnp/data/ExtraTreesClassifier_tr-val_0.0812-0.4695.p', '/home/ardalan/Documents/kaggle/bnp/data/ExtraTreesClassifier_tr-val_0.1986-0.4722.p']
/home/ardalan/Documents/kaggle/bnp/data/LogisticRegression_tr-val_0.5202-0.5212.p
/home/ardalan/Documents/kaggle/bnp/data/RandomForestClassifier_tr-val_0.1632-0.4840.p
/home/ardalan/Documents/kaggle/bnp/data/RandomForestClassifier_tr-val_0.1948-0.4721.p
/home/ardalan/Documents/kaggle/bnp/data/ExtraTreesClassifier_tr-val_0.0812-0.4695.p
/home/ardalan/Documents/kaggle/bnp/data/ExtraTreesClassifier_tr-val_0.1986-0.4722.p


In [25]:
def models():
    """
    Create a list of [DATASET, Classifier] to train on
    """
    clfs = [
        [D_BLEND, ensemble.GradientBoostingClassifier(learning_rate=0.03, n_estimators=200,max_depth=5 ) ],  
    ]
    return clfs

In [26]:
clfs = models()
skf = StratifiedKFold(Y, n_folds=5, shuffle=True , random_state=123)

#Cross validation from a list of models
for clf_indice, data_clf in enumerate(clfs):
    
    #Selecting a model from the list
    print("Classifier [%i]" % clf_indice)
    
    X = data_clf[0][0]
    Y = data_clf[0][1]
    X_test = data_clf[0][2]
    test_idx = data_clf[0][3]
    
    clf = data_clf[1]
    clf_name = clf.__class__.__name__
    print(clf)
    
    blend_X = np.zeros((len(X), 1))
    blend_Y = Y
    blend_X_test = np.zeros((len(X_test), 1))
    blend_X_test_fold = np.zeros((len(X_test), len(skf)))
    
    l_train_error = []
    l_val_error = []
    for fold_indice, (train_indices, val_indices) in enumerate(skf):
        
        print("Fold [%i]" % fold_indice)
        xtrain = X[train_indices]
        ytrain = Y[train_indices]
        xval = X[val_indices]
        yval = Y[val_indices]
        
        clf.fit(xtrain, ytrain)
        
        ytrain_pred = clf.predict_proba(xtrain)[:,1]
        yval_pred = clf.predict_proba(xval)[:,1]
        ytest_pred = clf.predict_proba(X_test)[:,1]
        
        # filling blend data sets
        blend_X[val_indices, 0] = yval_pred
        blend_X_test_fold[:, fold_indice] = ytest_pred
        
        # evaluating model
        train_error = log_loss(ytrain, ytrain_pred)
        val_error = log_loss(yval, yval_pred)
        l_train_error.append(train_error)
        l_val_error.append(val_error)
        print("train/val error: [{0:.4f}|{1:.4f}]".format(train_error, val_error))
        
        
    blend_X_test = np.mean(blend_X_test_fold, axis=1)
    
    diclogs = {'blend_X': blend_X,
               'blend_Y': Y,
               'blend_X_test': blend_X_test,
               'test_idx': test_idx,
               'clf_name': clf_name}
        
    #saving relevant information for blending later
    filename = "BLEND_{}_tr-val_{:.4f}-{:.4f}".format(clf_name, np.mean(l_train_error), np.mean(l_val_error))
    pickle.dump(diclogs, open(DATA_FOLDER+filename + ".p" , 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
    
    #Saving final predictions
    output_filename = DATA_FOLDER + filename + '.csv'
    np.savetxt(output_filename, np.vstack((test_idx, blend_X_test)).T,
               delimiter=',', fmt='%i,%.10f', header='ID,PredictedProb', comments="")

Classifier [0]
GradientBoostingClassifier(init=None, learning_rate=0.03, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
Fold [0]
train/val error: [0.4538|0.4515]
Fold [1]
train/val error: [0.4527|0.4557]
Fold [2]
train/val error: [0.4528|0.4554]
Fold [3]
train/val error: [0.4518|0.4591]
Fold [4]
train/val error: [0.4524|0.4571]


## Step 4: Average predictions