# BNP

In [12]:
import numpy as np
import pandas as pd
import zipfile

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import log_loss
from sklearn.cross_validation import StratifiedShuffleSplit

## Load dataset

In [2]:
def loadFileinZipFile(zip_filename, dtypes=None, parsedate = None, password=None, **kvargs):
    """
    Load the first member of a zip archive into a DataFrame with ``pd.read_csv``.

    Parameters
    ----------
    zip_filename : str
        Path of the zip archive to read.
    dtypes : optional
        Forwarded to ``pd.read_csv`` as ``dtype``.
    parsedate : optional
        When truthy, forwarded to ``pd.read_csv`` as ``parse_dates``.
    password : bytes or str, optional
        Archive password. ``ZipFile.setpassword`` only accepts bytes on
        Python 3, so a ``str`` is UTF-8 encoded before being applied.
    **kvargs
        Extra keyword arguments forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(pd_data, inside_zip_filename)``: the parsed DataFrame and the name
        of the archive member it was read from.

    Raises
    ------
    IndexError
        If the archive has no member.
    """
    with zipfile.ZipFile(zip_filename, 'r') as myzip:
        if password:
            if isinstance(password, str):
                password = password.encode('utf-8')
            myzip.setpassword(password)

        # Only the first member is read; any other member is ignored.
        inside_zip_filename = myzip.namelist()[0]

        # parse_dates is only forwarded when requested, as before.
        date_kwargs = {'parse_dates': parsedate} if parsedate else {}

        # Close the member handle deterministically once parsing is done.
        with myzip.open(inside_zip_filename) as csv_file:
            pd_data = pd.read_csv(csv_file, sep=',', dtype=dtypes, **date_kwargs, **kvargs)

    return pd_data, inside_zip_filename
    
def create_dataset1(pd_data):
    """
    Build the label-encoded dataset from the merged train/test frame.

    Parameters
    ----------
    pd_data : pandas.DataFrame
        Merged frame with an ``ID`` column and a ``target`` column. Rows with
        ``target >= 0`` are treated as training rows and rows with
        ``target == -1`` as test rows.

    Returns
    -------
    tuple
        ``(X, Y, X_test, test_idx)``: the training feature matrix, the
        training targets, the test feature matrix (same columns as ``X``)
        and the ``ID`` column of the test rows.
    """
    # Fill NA with -999, a sentinel meant not to overlap with non-NA values.
    # fillna returns a new frame, so the caller's DataFrame is not modified.
    pd_data = pd_data.fillna(-999)

    # Label encoding categorical variables
    for col in pd_data.select_dtypes(['object']):
        pd_data[col] = pd.factorize(pd_data[col])[0]

    # Extracting train and test data
    pd_train = pd_data[pd_data.target >= 0]
    pd_test = pd_data[pd_data.target == -1]

    # Both matrices must drop the same columns. The test matrix previously kept
    # the placeholder 'target' column and so had one more feature than X.
    non_feature_cols = ['target', 'ID']

    Y = pd_train['target'].values
    X = np.array(pd_train.drop(non_feature_cols, axis=1))
    X_test = np.array(pd_test.drop(non_feature_cols, axis=1))
    test_idx = pd_test['ID']

    return X, Y, X_test, test_idx

In [3]:
# NOTE(review): absolute, machine-specific path; point it at the local copy of the data.
folder = "/home/ardalan/Documents/kaggle/bnp/data/"

#Loading datasets
pd_train, _ = loadFileinZipFile(folder + "train.csv.zip")
pd_test, _ = loadFileinZipFile(folder + "test.csv.zip")

#Merging datasets into one DataFrame
# Test rows get the placeholder target -1 so create_dataset1 can split them back out.
pd_test['target'] = -1
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
pd_data = pd.concat([pd_train, pd_test]).reset_index(drop=True)

#getting dataset1
X, Y, X_test, test_idx = create_dataset1(pd_data)


In [21]:
def models():
    """
    Return the list of models to cross-validate.

    Each entry is a ``[dataset, classifier]`` pair. The dataset is the
    notebook-level ``D1_labelencoder`` tuple, looked up when this function is
    called, and every entry gets its own unfitted classifier instance.
    """
    def _random_forest():
        # Fresh estimator per entry so fitted state is never shared.
        return RandomForestClassifier(n_estimators=20, n_jobs=8, max_depth=10)

    return [[D1_labelencoder, _random_forest()] for _ in range(2)]


# Dataset 1 bundle: training features, training targets, test features and test IDs.
D1_labelencoder = (X, Y, X_test, test_idx)
# models() reads D1_labelencoder when called, so the tuple must be defined first.
clfs = models()

In [25]:
# Scratch demo: enumerate() yields (index, element) pairs while iterating over a list.
grosse_list = ['bonjour', 'aurevoir', 'fdp']

for indice_de_element, element in enumerate(grosse_list):
    print(indice_de_element, element)

0 bonjour
1 aurevoir
2 fdp


In [26]:
#Cross validation from a list of models
for clf_indice, data_clf in enumerate(clfs):

    #Selecting a model from the list
    print("Classifier [%i]" % clf_indice)

    # Each entry is [dataset, classifier]; the dataset's first two items are the
    # training features and targets. Dataset-local names are used so the
    # notebook-level X / Y / X_test / test_idx are not overwritten by this loop.
    dataset, clf = data_clf[0], data_clf[1]
    X_clf, Y_clf = dataset[0], dataset[1]
    print(clf)

    # Build the splitter from this dataset's own labels so the fold indices
    # always match the arrays they index. The seed is fixed, so every
    # classifier is evaluated with the same splitter configuration.
    skf = StratifiedShuffleSplit(Y_clf, n_iter=4, test_size=0.2, random_state=123)

    for fold_indice, (tr_idx, te_idx) in enumerate(skf):

        print("Fold [%i]" % fold_indice)
        xtrain = X_clf[tr_idx]
        ytrain = Y_clf[tr_idx]
        xval = X_clf[te_idx]
        yval = Y_clf[te_idx]

        # The same estimator instance is refit from scratch on every fold.
        clf.fit(xtrain, ytrain)

        y_train_pred = clf.predict_proba(xtrain)
        y_val_pred = clf.predict_proba(xval)

        # Log loss on the fitted data and on the held-out fold.
        train_error = log_loss(ytrain, y_train_pred)
        val_error = log_loss(yval, y_val_pred)
        print(train_error, val_error)

Classifier [0]
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold [0]
0.462327192606 0.49178501286
Fold [1]
0.457616495516 0.486886104061
Fold [2]
0.458760349588 0.487374256865
Fold [3]
0.456442443075 0.48817378845
Classifier [1]
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold [0]
0.459728124188 0.489453693541
Fold [1]
0.459457126707 0.490049838357
Fold [2]
0.460448811722 0.48

In [None]:

        if hasattr(clf, 'predict_proba'):
            #This is a list (not matrix or else)
            train_pred = clipProba(reshapePrediction(clf.predict_proba(xtrain)))
            val_pred = clipProba(reshapePrediction(clf.predict_proba(xval)))
        elif hasattr(clf, 'predict'):
            #This is a list (not matrix or else)
            train_pred = clipProba(reshapePrediction(clf.predict(xtrain)))
            val_pred = clipProba(reshapePrediction(clf.predict(xval)))

        #metrics
        train_error = eval_func(ytrain, train_pred)
        val_error = eval_func(yval, val_pred)

        print("train/val error: [{0:.4f}|{1:.4f}]".format(train_error, val_error))
        print(metrics.confusion_matrix(yval, val_pred.round()))

        dic_logs['fold'].append(fold_indice)
        dic_logs['ypredproba'].append(val_pred)
        dic_logs['yval'].append(yval)
        dic_logs['train_error'].append(train_error)
        dic_logs['val_error'].append(val_error)


    string_result = printResults(dic_logs)
    filename += "{}_{}".format(''.join(added_params),string_result)

    print(string_result)
    print(filename)

    if STORE: saveDicLogs(dic_logs, CODE_FOLDER + 'diclogs/' + filename + '.p')

    if DOTEST:

        print('Test prediction...')
        if clf_class_name == 'XGBClassifier' or 'XGBRegressor':
            clf.n_estimators = np.mean(dic_logs['best_epoch']).astype(int)
            print("Best n_estimators set to: ", clf.n_estimators)
            clf.fit(X, Y)

        elif clf_class_name == 'NN':
            clf.fit(X, Y)

        else:
            clf.fit(X, Y)

        if hasattr(clf, 'predict_proba'):
            ypredproba = clipProba(reshapePrediction(clf.predict_proba(X_test)))
        elif hasattr(clf, 'predict'):
            ypredproba = clipProba(reshapePrediction(clf.predict(X_test)))

        output_filename = CODE_FOLDER + 'diclogs/' + filename + '.csv'
        np.savetxt(output_filename, np.vstack((test_idx,ypredproba)).T, delimiter=',',
                   fmt='%i,%.10f'  ,header='ID,PredictedProb', comments="")
    del data_clf


##### We create two datasets:
- One with label-encoded categorical columns
- One with one-hot-encoded categorical columns

## Creation of the first dataset

##### Label Encoding all categorical features

Unnamed: 0,ID,target,v1,v10,v100,v101,v102,v103,v104,v105,...,v90,v91,v92,v93,v94,v95,v96,v97,v98,v99
114321,0,-1,1.375465e+00,1.312911,1.970803e+01,4.186787,1.873945,4.129022,1.701894,0.004535,...,0.988980,1,0.658269,4.929298,2.992365,0.759955,5.947955,4.999999,10.013503,0.817844
114322,1,-1,-9.990000e+02,1.291029,-9.990000e+02,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,...,-999.000000,3,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000
114323,2,-1,-4.903407e-07,1.575492,1.980000e+01,6.535555,0.762963,4.917534,1.981859,0.002794,...,0.918734,3,0.552642,4.350983,3.163664,0.604966,6.233767,12.631578,10.214574,0.952380
114324,7,-1,2.661870e+00,1.575493,1.161434e-01,3.627655,-999.000000,7.486234,4.313037,1.048637,...,1.495102,1,0.775499,3.176199,5.071340,1.020527,4.980016,8.603352,5.743589,0.688783
114325,10,-1,1.252822e+00,1.050328,1.922770e+01,6.606787,1.493882,4.929004,1.906923,0.055134,...,0.993206,1,0.389974,5.632135,3.885107,0.417548,6.941310,7.652733,8.589221,1.075998
114326,11,-1,1.733601e+00,0.656456,1.895062e+01,5.467954,1.820259,5.440665,2.261760,0.027792,...,1.006321,3,0.529553,4.549925,3.785681,0.645640,6.398928,6.440678,7.571328,1.055333
114327,13,-1,-9.990000e+02,2.078774,-9.990000e+02,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,...,-999.000000,2,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000
114328,14,-1,2.027902e+00,0.262583,7.950963e+00,10.022851,2.668494,5.106525,2.987106,1.023943,...,0.763345,2,0.658457,8.104410,3.694061,0.702215,7.802691,7.999999,6.351981,1.735592
114329,15,-1,1.296225e+00,5.229759,9.214744e+00,8.412875,2.208578,4.126977,2.690052,0.146540,...,0.943545,2,0.930603,5.321216,3.017868,1.003715,7.037758,4.905659,8.209414,0.884910
114330,16,-1,-9.990000e+02,1.050329,-9.990000e+02,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,...,-999.000000,1,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000
