In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from preprocessing import AutoEncoder, ItemSelector, ReplaceMissing
from sklearn.feature_selection import VarianceThreshold
from evaluation import gini, gini_normalized
import pickle

from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Activation, Dropout
from keras.models import Model

%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


In [2]:
np.random.seed(10)

In [3]:
from tensorflow import set_random_seed
set_random_seed(15)

In [4]:
import pandas as pd

In [5]:
df = pd.read_csv('data/train.csv')
df = df.drop('id', axis=1)

df_test = pd.read_csv('data/test.csv')
df_test = df_test

In [6]:
target = df['target'].as_matrix()
df = df.drop('target', axis=1)
df_test_temp = df_test
df_test = df_test.drop('id', axis=1)

In [7]:
cols_use = [c for c in df.columns if (not c.startswith('ps_calc_'))]
df = df[cols_use]
df_test = df_test[cols_use]

# delete almost 1 value in data

In [8]:
filter_more_frequent = []
for i in df:
    if(np.sum(df[i] == df[i].mode()[0]) / len(df) <= 0.95):
        filter_more_frequent.append(i)

In [9]:
df = df[filter_more_frequent]
df_test = df_test[filter_more_frequent]

# Filter many missing out

In [10]:
filter_index = []
for i in df:
    if(np.sum(df[i] == -1) / df.shape[0] <= 0.05):
        filter_index.append(i)

In [11]:
df = df[filter_index]
df_test = df_test[filter_index]

# create each map of feature 

In [12]:
maps_of_feature = { 'binary': [], 'nominal': [], 'interval': [], 'ordinal': []}
for f, i in zip(df.columns, range(len(df.columns))):
    # Defining the level
    if 'bin' in f:
        maps_of_feature['binary'].append(i-1)
    elif f == 'target':
        print(f, i)
    elif 'cat' in f or f == 'id':
        if(f == 'ps_car_08_cat'):
            maps_of_feature['binary'].append(i-1)
        else:
            maps_of_feature['nominal'].append(i-1)
    elif df[f].dtype == float:
        maps_of_feature['interval'].append(i-1)
    elif df[f].dtype == int:
        maps_of_feature['ordinal'].append(i-1)

In [13]:
train_validate = df.as_matrix()
test = df_test.as_matrix()

In [14]:
from sklearn.model_selection import train_test_split

# balance

In [15]:
def balance_train(x_train,y_train):
    x_true = x_train[y_train==1]
    y_true = y_train[y_train==1]
    x_false = x_train[y_train==0]
    y_false = y_train[y_train==0]
    idx_false = np.random.choice(len(y_false),len(y_true))
    x_false = x_false[idx_false]
    y_false = y_false[idx_false]
    x_train = np.concatenate((x_true,x_train),axis=0)
    y_train = np.concatenate((y_true,y_train),axis=0)
    
    idx = np.arange(len(x_train))
    np.random.shuffle(idx)
    x_train = x_train[idx]
    y_train = y_train[idx]
    
    print(np.sum(y_train == 0))
    print("Train count: "+str(len(y_train)))
    unique, counts = np.unique(y_train, return_counts=True)
#     print(dict(zip(unique, counts)))
    print("0 (%): "+str(counts[0]/y_train.shape[0]*100))
    print("1 (%): "+str(counts[1]/y_train.shape[0]*100))
    return x_train,y_train

# create validate

In [16]:
def createPipeline():
    
    PipelineForNumberic = Pipeline([
                                ('select_numeric', ItemSelector(maps_of_feature['interval']))
                                ,('replace_mean', Imputer(missing_values=-1, strategy='mean'))
                                ,('scaler', MinMaxScaler())
                               ])

    PipelineForOrdinal = Pipeline([
                                    ('select_ordinal', ItemSelector(maps_of_feature['ordinal'] + maps_of_feature['binary']))
                                    ,('scaler', MinMaxScaler())
                                ])

    PipelineForCategorical = Pipeline([
                                    ('select_categorical', ItemSelector(maps_of_feature['nominal'] ))
    #                                 ,('replace_mode', ReplaceMissing(value=999))
    #                                 ,('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
    #                                 ,('autoencoder300', AutoEncoder(80, 300).cuda())
    #                                 ,('autoencoder100', AutoEncoder(300, 100).cuda())
                                   ])
    
    rest_data_preprocessing = Pipeline([
            ('data_pre', FeatureUnion(
                [
                    ('numeric_preprocessing', PipelineForNumberic), 
                    ('ordinal_preprocessing', PipelineForOrdinal),
                ])
            ),
            ('variance_out', VarianceThreshold(threshold=0.01))
    ])
    
    return rest_data_preprocessing, PipelineForCategorical

# Model

In [17]:
cat_features = [];
cat_features_ncat = {};

for col in df.columns:
    if 'cat' in col:
        if(col != 'ps_car_08_cat'):
            cat_features.append(col)
        
for col in cat_features:
    values = df[col].values
    cat_features_ncat[col] = len(set(values))
#     print(cat_features_ncat[col])

In [18]:
# em_dims = {}
# for cat in cat_features:
#     n_dim = cat_features_ncat[cat]
#     em_dim = np.log2(n_dim) + 1
#     em_dim = int(em_dim)
#     em_dims[cat] = em_dim
#     print(n_dim, em_dim)

In [19]:
em_dims = {'ps_ind_02_cat': 3,
          'ps_ind_04_cat': 2,
          'ps_ind_05_cat': 5,
          'ps_car_01_cat': 7,
          'ps_car_02_cat': 2,
          'ps_car_03_cat': 2,
          'ps_car_04_cat': 5,
          'ps_car_05_cat': 2,
          'ps_car_06_cat': 8,
          'ps_car_07_cat': 2,
          'ps_car_09_cat': 3,
          'ps_car_10_cat': 2,
          'ps_car_11_cat': 10}

In [74]:
def build_embedding_network():
    em_inputs = []
    em_outs = []
    for cat in cat_features:
        n_dim = cat_features_ncat[cat]
        em_dim = em_dims[cat]
        em_input = Input(shape=(1,), name = cat+'_input')
        em_inputs.append(em_input)
        em_out = Embedding(n_dim, em_dim, input_length = 1, name = cat+'_embedding')(em_input)
        em_out = Flatten()(em_out)
        em_outs.append(em_out)

    # rest of input in model
    n_rest_input = rest_preprocessed_data.shape[1]
    rest_input = Input(shape = (n_rest_input,), name = 'rest_input') 
    rest_out = Dense(16)(rest_input)
    print(n_rest_input)

    # concatenate input and embedding things
    concat_layer_out = concatenate([rest_out] + em_outs, name = 'concatenated_layer')
    all_em_out = concatenate(em_outs, name='all_em_output')

    # some NN thing
    embed = Dense(80, name = 'encode')(concat_layer_out)
    embed = Activation('relu')(embed)
    embed = Dropout(0.35)(embed)

    x = Dense(20)(embed)
    x = Activation('relu')(x)
    x = Dropout(0.15)(x)

    x = Dense(10)(x)
    x = Activation('relu')(x)
    x = Dropout(0.15)(x)

    output = Dense(1, activation = 'sigmoid', name = 'output')(x)

    model = Model(inputs = [rest_input] + em_inputs,
                 outputs = [output])

    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

## Prepare input 

In [75]:
class ChangeFormat():
    
    def train_change(self, cate, rest):
        all_input = {}
        self.val_maps = []
        for i, n in enumerate(maps_of_feature['nominal']):
            cat = df.columns[n+1]
            raw_vals = np.unique(cate[: , i])
            val_map = {}
            for j in range(len(raw_vals)):
                val_map[raw_vals[j]] = j
            self.val_maps.append(val_map)
            all_input[cat+'_input'] = cate[:, i]
        all_input['rest_input'] = rest
        return all_input
    
    def change(self, cate, rest):
        all_input = {}
        for i, n in enumerate(maps_of_feature['nominal']):
            cat = df.columns[n+1]
            val_map = self.val_maps[i]
            all_input[cat+'_input'] = cate[:, i]
        all_input['rest_input'] = rest
        return all_input

In [76]:
from sklearn.model_selection import StratifiedKFold

In [77]:
all_pred_test = []

In [78]:
rest_data_preprocessing, PipelineForCategorical = createPipeline()

In [80]:
kfold = StratifiedKFold(n_splits=8,
                        random_state=231,
                        shuffle=True)
for i, (f_ind, outf_ind) in enumerate(kfold.split(train_validate, target)):
    train, validate = train_validate[f_ind].copy(), train_validate[outf_ind].copy()
    train_target, validate_target = target[f_ind], target[outf_ind]
    balanced_train, balanced_train_target = balance_train(train, train_target)

    rest_data_preprocessing, PipelineForCategorical = createPipeline()
    rest_preprocessed_data = rest_data_preprocessing.fit_transform(balanced_train, balanced_train_target)
    cate_preprocessed_data = PipelineForCategorical.fit_transform(balanced_train, balanced_train_target)
    rest_preprocessed_data_validate = rest_data_preprocessing.transform(validate)
    cate_preprocessed_data_validate = PipelineForCategorical.transform(validate)
    
    format_changer = ChangeFormat()
    all_input_train = format_changer.train_change(cate_preprocessed_data, rest_preprocessed_data)
    all_input_validate = format_changer.change(cate_preprocessed_data_validate, rest_preprocessed_data_validate)

    val = []
    for i in range(1):
        model = build_embedding_network()
        model.fit(all_input_train, balanced_train_target, epochs=30, batch_size=4096, verbose=1)
        intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer('encode').output)
        pred_validate = model.predict(all_input_validate)
        val.append(pred_validate.reshape(-1))
    print(gini_normalized(validate_target, np.sum(val, axis=0))) 
    break

#     rest_preprocessed_data_test = rest_data_preprocessing.transform(test)
#     cate_preprocessed_data_test = PipelineForCategorical.transform(test)
#     all_input_test = format_changer.change(cate_preprocessed_data_test, rest_preprocessed_data_test)
#     pred_test = model.predict(all_input_test).reshape(-1)
#     all_pred_test.append(pred_test)

501828
Train count: 539792
0 (%): 92.9669205916
1 (%): 7.03307940836
17
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
0.256306374422


# Apply Model

In [81]:
encoded_train = intermediate_layer_model.predict(all_input_train)
encoded_validate = intermediate_layer_model.predict(all_input_validate)
model = LogisticRegression(class_weight='balanced')

In [82]:
model.fit(encoded_train, balanced_train_target)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [83]:
pred = model.predict(encoded_train)
pred_validate = model.predict(encoded_validate)

In [84]:
print(gini_normalized(balanced_train_target, pred))
print(gini_normalized(validate_target, pred_validate))

0.188575625673
0.18005023884


# Prepare test

In [None]:
# rest_preprocessed_data_test = rest_data_preprocessing.transform(test)
# cate_preprocessed_data_test = PipelineForCategorical.transform(test)
# all_input_test = changeFormat(cate_preprocessed_data_test, rest_preprocessed_data_test)
# pred_test = model.predict(all_input_test).reshape(-1)

In [None]:
# result = pd.DataFrame({'id':df_test_temp['id'],'target': pred_test.reshape(-1)})
# result.to_csv('data/result_embedding.csv', index=False)

In [None]:
# preprocessed_data_all = data_preprocessing.transform(train_validate)
# preprocessed_data_test = data_preprocessing.transform(test)

# with open('data/Preprocessing/train.pickle', 'wb') as f:
#     pickle.dump(preprocessed_data_all, f)
    
# with open('data/Preprocessing/test.pickle', 'wb') as f:
#     pickle.dump(preprocessed_data_test, f)

In [None]:
# pred_test = model.predict_proba(preprocessed_data_test)
# result = pd.DataFrame({'id':df_test['id'],'target': pred_test[:,1]})
# result.to_csv('data/result.csv', index=False)