In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from preprocessing import AutoEncoder
from preprocessing import ItemSelector

In [2]:
import pandas as pd

In [4]:
# Training frame: the 'id' column is dropped immediately, so only 'target'
# and the feature columns remain.
df = pd.read_csv('input/train.csv')
df = df.drop('id', axis=1)

# Test frame: 'id' is deliberately kept here because the submission table is
# built from df_test['id']; it is removed only when the feature matrix is
# extracted from this frame.
df_test = pd.read_csv('input/test.csv')

In [5]:
# Column positions of every feature, grouped by measurement level.
# Positions are relative to the feature matrix, i.e. the frame with its
# 'target' column removed, so they can be used to index that matrix directly.
maps_of_feature = {'binary': [], 'nominal': [], 'interval': [], 'ordinal': []}

# Counts feature columns only, so the mapping stays correct wherever 'target'
# happens to sit in the frame (a fixed "i - 1" offset is only right when
# 'target' is the very first column).
feature_position = 0
for column_position, column_name in enumerate(df.columns):
    if column_name == 'target':
        # The label is not a feature: report where it is and move on without
        # consuming a feature position.
        print(column_name, column_position)
        continue

    # dtype.kind is compared instead of `dtype == int` / `dtype == float`:
    # those equalities only match the platform's default width, so e.g. an
    # int64 column fails `== int` where the default integer is 32-bit and the
    # column would be silently left out of every group.
    dtype_kind = df[column_name].dtype.kind

    if 'bin' in column_name:
        maps_of_feature['binary'].append(feature_position)
    elif 'cat' in column_name or column_name == 'id':
        maps_of_feature['nominal'].append(feature_position)
    elif dtype_kind == 'f':
        maps_of_feature['interval'].append(feature_position)
    elif dtype_kind in ('i', 'u'):
        maps_of_feature['ordinal'].append(feature_position)
    # Columns matching none of the rules are skipped but still occupy a
    # position in the feature matrix.
    feature_position += 1

target 0


In [6]:
# NumPy arrays handed to scikit-learn.
# `.values` replaces `.as_matrix()`: the latter was deprecated by pandas and
# later removed, while `.values` returns the same ndarray on every version.
train = df.drop('target', axis=1).values   # feature matrix of the training set
target = df['target'].values               # label vector
test = df_test.drop('id', axis=1).values   # feature matrix of the test set

In [8]:
# Three per-kind preprocessing branches. In every branch -1 is treated as the
# missing-value marker (see `missing_values=-1`).
# NOTE(review): ItemSelector and AutoEncoder come from the local
# `preprocessing` module; ItemSelector presumably picks the listed column
# positions out of the input matrix — confirm there.

# Interval (float) features: missing values become the column mean.
PipelineForNumberic = Pipeline([
    ('select_numeric', ItemSelector(maps_of_feature['interval'])),
    ('replace_mean', Imputer(missing_values=-1, strategy='mean')),
])

# Ordinal (integer) features: missing values become the most frequent value.
# The step is called 'replace_mode' because that is what it does; it used to
# be labelled 'replace_mean' although the strategy was 'most_frequent'.
PipelineForOrdinal = Pipeline([
    ('select_ordinal', ItemSelector(maps_of_feature['ordinal'])),
    ('replace_mode', Imputer(missing_values=-1, strategy='most_frequent')),
])

# Nominal + binary features: mode imputation, dense one-hot encoding, then the
# project's AutoEncoder.
# NOTE(review): the meaning of the literal 209 is defined by
# preprocessing.AutoEncoder and is not visible here — confirm and consider
# naming it.
PipelineForCategorical = Pipeline([
    ('select_categorical', ItemSelector(maps_of_feature['nominal'] + maps_of_feature['binary'])),
    ('replace_mode', Imputer(missing_values=-1, strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse=False)),
    ('autoencoder', AutoEncoder(209)),
])

In [9]:
# Full preprocessing chain: the three per-kind branches run side by side and
# their outputs are concatenated by the FeatureUnion, after which the combined
# matrix is standardised.
data_preprocessing = Pipeline(steps=[
    ('data_pre', FeatureUnion(transformer_list=[
        ('numeric_preprocessing', PipelineForNumberic),
        ('ordinal_preprocessing', PipelineForOrdinal),
        ('categorical_preprocessing', PipelineForCategorical),
    ])),
    ('scaler', StandardScaler()),
])

In [10]:
# Fit every preprocessing step on the training matrix and transform it in the
# same call.
# NOTE(review): the recorded run of this cell aborted with
# "Found no NVIDIA driver on your system", so some step of the pipeline needs
# a CUDA-capable GPU — presumably the AutoEncoder from the local
# `preprocessing` module; confirm there.
preprocessed_data = data_preprocessing.fit_transform(train, target)

AssertionError: 
Found no NVIDIA driver on your system. Please check that you
have an NVIDIA GPU and installed a driver from
http://www.nvidia.com/Download/index.aspx

In [None]:
# Apply the already-fitted preprocessing chain to the test matrix (transform
# only, nothing is re-fitted here).
preprocessed_data_test = data_preprocessing.transform(test)

# Persist both preprocessed matrices so later work can reload them instead of
# recomputing. The target directory is created first: opening a file for
# writing inside a directory that does not exist raises FileNotFoundError.
PREPROCESSED_DIR = 'data/Preprocessing'
os.makedirs(PREPROCESSED_DIR, exist_ok=True)

for pickle_name, matrix in (
    ('train.pickle', preprocessed_data),
    ('test.pickle', preprocessed_data_test),
):
    with open(os.path.join(PREPROCESSED_DIR, pickle_name), 'wb') as handle:
        pickle.dump(matrix, handle)

In [None]:
from model import NeuralNet

%load_ext autoreload
%autoreload 2

In [None]:
# Baseline classifier. class_weight='balanced' asks scikit-learn to weight the
# classes inversely to their frequency in the training labels (see the
# LogisticRegression documentation).
model = LogisticRegression(class_weight='balanced')

In [None]:
# Train the classifier on the complete preprocessed training matrix.
model.fit(preprocessed_data, target)

In [None]:
# Hard class predictions for the same rows the model was fitted on, i.e. an
# in-sample check rather than a held-out evaluation.
pred = model.predict(preprocessed_data)

In [None]:
# Three counts on the training rows, printed one per line:
#   1. predictions that agree with the label and are equal to 1
#   2. sum of the labels
#   3. sum of the predictions
# (2 and 3 are counts of 1s only if labels/predictions are 0/1 — presumably
# the case here; confirm.)
agreeing_predictions = pred[pred == target]
agreeing_ones = (agreeing_predictions == 1).sum()
print(agreeing_ones)
print(target.sum())
print(pred.sum())

In [None]:
# Class probabilities (not hard labels) for the preprocessed test matrix.
pred_test = model.predict_proba(preprocessed_data_test)

In [None]:
# Sanity check on the probability matrix; presumably (n_test_rows, n_classes)
# — confirm against the displayed value.
pred_test.shape

In [None]:
# Submission table: the test ids next to the score taken from column index 1
# of the probability matrix (presumably the probability of class 1 — confirm
# against model.classes_).
submission_ids = df_test['id']
target_scores = pred_test[:, 1]
result = pd.DataFrame({'id': submission_ids, 'target': target_scores})

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the
# CSV so only the 'id' and 'target' columns are written.
result.to_csv('data/result.csv', index=False)