# BNP

In [12]:
import numpy as np
import pandas as pd
import zipfile

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import log_loss
from sklearn.cross_validation import StratifiedShuffleSplit

In [2]:
# Scratch cell: the result is only displayed, it is not assigned to any name.
[x for x in range(20) if x == 2]

[2]

## Load dataset

In [2]:
def loadFileinZipFile(zip_filename, dtypes=None, parsedate = None, password=None, **kvargs):
    """
    Load the first member of a zip archive into a DataFrame.

    Parameters
    ----------
    zip_filename : path or file-like object
        Passed to ``zipfile.ZipFile``.
    dtypes : optional
        Forwarded to ``pd.read_csv`` as ``dtype``.
    parsedate : optional
        Forwarded to ``pd.read_csv`` as ``parse_dates`` when truthy.
    password : str or bytes, optional
        Archive password. Text passwords are UTF-8 encoded because
        ``ZipFile.setpassword`` only accepts bytes.
    **kvargs
        Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns
    -------
    (pd_data, inside_zip_filename) : tuple
        The parsed DataFrame and the name of the archive member that was read.

    Raises
    ------
    ValueError
        If the archive contains no members.
    """
    with zipfile.ZipFile(zip_filename, 'r') as myzip:
        if password:
            if not isinstance(password, bytes):
                password = password.encode('utf-8')
            myzip.setpassword(password)

        members = myzip.filelist
        if not members:
            raise ValueError("Zip archive %r contains no files" % (zip_filename,))

        # Only the first member is read; any other members are ignored.
        inside_zip_filename = members[0].filename

        # parse_dates is only passed when requested, so pandas keeps its own default otherwise.
        read_options = dict(kvargs)
        if parsedate:
            read_options['parse_dates'] = parsedate

        # Close the member handle deterministically once the CSV has been parsed.
        with myzip.open(inside_zip_filename) as csv_file:
            pd_data = pd.read_csv(csv_file, sep=',', dtype=dtypes, **read_options)
        return pd_data, inside_zip_filename
    
def create_dataset1(pd_data):
    """
    Build label-encoded train/test arrays from the merged train+test frame.

    Rows with ``target >= 0`` are treated as training rows and rows with
    ``target == -1`` as test rows. ``pd_data`` itself is not modified: all
    changes are made on the copy returned by ``fillna``.

    Parameters
    ----------
    pd_data : DataFrame with at least the columns ``ID`` and ``target``.

    Returns
    -------
    X : ndarray, training features (``target`` and ``ID`` removed)
    Y : ndarray, training labels
    X_test : ndarray, test features with the same columns as ``X``
    test_idx : Series, ``ID`` values of the test rows
    """
    # Fill NA with -999, a sentinel chosen so it does not overlap with non-NA values
    pd_data = pd_data.fillna(-999)

    # Label encoding categorical variables (object columns -> integer codes)
    for col in pd_data.select_dtypes(['object']):
        pd_data[col] = pd.factorize(pd_data[col])[0]

    # Extracting train and test data
    pd_train = pd_data[pd_data.target >= 0]
    pd_test = pd_data[pd_data.target == -1]

    # 'target' must be removed from the test features as well: there it only holds
    # the -1 placeholder and would give X_test one more column than X.
    non_feature_cols = ['target', 'ID']

    Y = pd_train['target'].values
    X = np.array(pd_train.drop(non_feature_cols, axis=1))
    X_test = np.array(pd_test.drop(non_feature_cols, axis=1))
    test_idx = pd_test['ID']

    return X, Y, X_test, test_idx

def create_dataset2(pd_data):
    """
    Build train/test arrays from the merged train+test frame.

    NOTE(review): this currently applies the same label encoding as
    ``create_dataset1``. The notebook text describes the second dataset as
    one-hot encoded, so the encoding step here is presumably a placeholder.
    Confirm the intent before relying on D2 as a distinct feature set.

    Rows with ``target >= 0`` are treated as training rows and rows with
    ``target == -1`` as test rows. ``pd_data`` itself is not modified.

    Parameters
    ----------
    pd_data : DataFrame with at least the columns ``ID`` and ``target``.

    Returns
    -------
    X : ndarray, training features (``target`` and ``ID`` removed)
    Y : ndarray, training labels
    X_test : ndarray, test features with the same columns as ``X``
    test_idx : Series, ``ID`` values of the test rows
    """
    # Fill NA with -999, a sentinel chosen so it does not overlap with non-NA values
    filled = pd_data.fillna(-999)

    # Label encoding categorical variables (object columns -> integer codes)
    for col in filled.select_dtypes(['object']):
        filled[col] = pd.factorize(filled[col])[0]

    # Extracting train and test data
    is_test = filled.target == -1
    pd_train = filled[filled.target >= 0]
    pd_test = filled[is_test]

    # Drop 'target' from the test rows too; it only holds the -1 placeholder there
    # and would otherwise make X_test one column wider than X.
    dropped = ['target', 'ID']

    Y = pd_train['target'].values
    X = np.array(pd_train.drop(dropped, axis=1))
    X_test = np.array(pd_test.drop(dropped, axis=1))
    test_idx = pd_test['ID']

    return X, Y, X_test, test_idx

In [3]:
# NOTE(review): absolute, machine-specific path; change it to wherever the data lives locally.
folder = "/home/ardalan/Documents/kaggle/bnp/data/"

#Loading datasets
pd_train, _ = loadFileinZipFile(folder + "train.csv.zip")
pd_test, _ = loadFileinZipFile(folder + "test.csv.zip")

#Merging datasets into one DataFrame
# Test rows are tagged with target = -1 so they can be separated from the training rows again.
pd_test['target'] = -1
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index=True gives the same fresh 0..n-1 index as append + reset_index(drop=True).
pd_data = pd.concat([pd_train, pd_test], ignore_index=True)

#getting dataset1
X, Y, X_test, test_idx = create_dataset1(pd_data)
D1 = (X, Y, X_test, test_idx)

#getting dataset2
X, Y, X_test, test_idx = create_dataset2(pd_data)
D2 = (X, Y, X_test, test_idx)

In [21]:
def models():
    """
    Build the list of ``[dataset, classifier]`` pairs to evaluate.

    Each entry pairs one of the module-level dataset tuples (D1, D2, ...) with a
    fresh, unfitted classifier instance.

    NOTE(review): only D1 and D2 are created in the part of the notebook visible
    here; D3..D8 must be defined before this function is called, otherwise it
    raises NameError. Confirm where they come from.
    """
    tree_params = dict(n_estimators=20, n_jobs=8, max_depth=10)

    clfs = [
        [D1, RandomForestClassifier(**tree_params)],
        [D1, ExtraTreesClassifier(**tree_params)],
    ]
    # Every remaining dataset gets its own random forest with the same settings.
    for dataset in (D2, D3, D4, D5, D6, D7, D8):
        clfs.append([dataset, RandomForestClassifier(**tree_params)])
    return clfs


clfs = models()

In [40]:
prediction

array([[4, 8, 6, ..., 2, 6, 1],
       [8, 7, 2, ..., 7, 2, 3],
       [8, 3, 4, ..., 5, 3, 9],
       ..., 
       [8, 9, 5, ..., 2, 3, 4],
       [2, 2, 2, ..., 4, 2, 5],
       [7, 7, 1, ..., 9, 5, 6]])

In [26]:
#Cross validation from a list of models
for clf_indice, data_clf in enumerate(clfs):

    #Selecting a model (and the dataset it is paired with) from the list
    print("Classifier [%i]" % clf_indice)

    dataset, clf = data_clf
    X, Y, X_test, test_idx = dataset
    print(clf)

    # The splitter is built from the labels of the dataset being evaluated, not from
    # whichever Y an earlier cell happened to leave in the kernel.
    # The fixed random_state gives every classifier the same 4 stratified 80/20 splits
    # as long as the labels are the same.
    skf = StratifiedShuffleSplit(Y, n_iter=4, test_size=0.2, random_state=123)

    for fold_indice, (tr_idx, te_idx) in enumerate(skf):

        print("Fold [%i]" % fold_indice)
        xtrain, ytrain = X[tr_idx], Y[tr_idx]
        xval, yval = X[te_idx], Y[te_idx]

        # fit() retrains the same estimator object from scratch on each fold
        clf.fit(xtrain, ytrain)

        y_train_pred = clf.predict_proba(xtrain)
        y_val_pred = clf.predict_proba(xval)

        # Log loss on the fitted data vs. the held-out data, printed as (train, validation)
        train_error = log_loss(ytrain, y_train_pred)
        val_error = log_loss(yval, y_val_pred)
        print(train_error, val_error)

Classifier [0]
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold [0]
0.462327192606 0.49178501286
Fold [1]
0.457616495516 0.486886104061
Fold [2]
0.458760349588 0.487374256865
Fold [3]
0.456442443075 0.48817378845
Classifier [1]
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold [0]
0.459728124188 0.489453693541
Fold [1]
0.459457126707 0.490049838357
Fold [2]
0.460448811722 0.48

##### We create two datasets:

- One built by label-encoding the categorical columns
- One built by one-hot-encoding the categorical columns

## Creation of the first dataset

##### Label Encoding all categorical features

In [37]:
# Placeholder matrix of random integers in [1, 9] (the upper bound 10 is exclusive), shape (10000, 10).
prediction = np.random.randint(low=1, high=10, size=(10000, 10))

In [38]:
prediction.shape

(10000, 10)

In [39]:
prediction

array([[4, 8, 6, ..., 2, 6, 1],
       [8, 7, 2, ..., 7, 2, 3],
       [8, 3, 4, ..., 5, 3, 9],
       ..., 
       [8, 9, 5, ..., 2, 3, 4],
       [2, 2, 2, ..., 4, 2, 5],
       [7, 7, 1, ..., 9, 5, 6]])