In [94]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression

from preprocessing import AutoEncoder
from preprocessing import ItemSelector

In [2]:
import pandas as pd

In [3]:
# Training frame: the row identifier is dropped right away.
df = pd.read_csv('data/train.csv').drop('id', axis=1)

# Test frame: loaded unchanged, so its 'id' column stays available.
df_test = pd.read_csv('data/test.csv')

In [5]:
# Bucket every feature column by measurement level. The stored value is the
# column's position in the feature matrix, i.e. the frame with 'target'
# removed (which is how `train` is built from `df`).
maps_of_feature = {'binary': [], 'nominal': [], 'interval': [], 'ordinal': []}

# Running position among non-target columns. Counting explicitly (instead of
# using `i - 1`) keeps the indices right wherever 'target' sits in the frame.
feature_pos = 0
for i, f in enumerate(df.columns):
    if f == 'target':
        # The label is not a feature: report its position and do not consume
        # a slot in the feature matrix.
        print(f, i)
        continue

    # Defining the level
    if 'bin' in f:
        maps_of_feature['binary'].append(feature_pos)
    elif 'cat' in f or f == 'id':
        maps_of_feature['nominal'].append(feature_pos)
    # dtype predicates instead of `dtype == float` / `dtype == int`: the latter
    # depend on the platform's C long width and miss float32 / int32 columns.
    elif pd.api.types.is_float_dtype(df[f]):
        maps_of_feature['interval'].append(feature_pos)
    elif pd.api.types.is_integer_dtype(df[f]):
        maps_of_feature['ordinal'].append(feature_pos)

    # Every non-target column occupies one slot, even if it matched no bucket.
    feature_pos += 1

target 0


In [38]:
# Materialise plain NumPy arrays for scikit-learn.
# `.values` is used instead of `as_matrix()`, which is deprecated and absent
# from current pandas; with no arguments both return the same ndarray.
train = df.drop('target', axis=1).values
target = df['target'].values
test = df_test.drop('id', axis=1).values

In [49]:
# Interval features: -1 is the missing-value marker and is replaced by the column mean.
PipelineForNumberic = Pipeline(steps=[
    ('select_numeric', ItemSelector(maps_of_feature['interval'])),
    ('replace_mean', Imputer(missing_values=-1, strategy='mean')),
])

# Ordinal features: -1 is replaced by the most frequent value of the column.
# Step names are left untouched because they form the pipeline's parameter keys.
PipelineForOrdinal = Pipeline(steps=[
    ('select_numeric', ItemSelector(maps_of_feature['ordinal'])),
    ('replace_mean', Imputer(missing_values=-1, strategy='most_frequent')),
])

# Nominal + binary features: mode-impute, one-hot encode to a dense array,
# then pass through the project's AutoEncoder.
PipelineForCategorical = Pipeline(steps=[
    ('select_categorical', ItemSelector(maps_of_feature['nominal'] + maps_of_feature['binary'])),
    ('replace_mode', Imputer(missing_values=-1, strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse=False)),
    # NOTE(review): 209 is presumably the one-hot output width, and .cuda()
    # suggests a GPU is required — confirm in preprocessing.AutoEncoder.
    ('autoencoder', AutoEncoder(209).cuda()),
])

In [50]:
# Full preprocessing: run the three per-type pipelines side by side,
# concatenate their outputs column-wise, then standardise every resulting column.
data_preprocessing = Pipeline(steps=[
    ('data_pre', FeatureUnion(transformer_list=[
        ('numeric_preprocessing', PipelineForNumberic),
        ('ordinal_preprocessing', PipelineForOrdinal),
        ('categorical_preprocessing', PipelineForCategorical),
    ])),
    ('scaler', StandardScaler()),
])

In [51]:
# Fit the whole preprocessing pipeline on the training matrix and keep the
# transformed result.
# NOTE(review): the numbers in this cell's output are emitted during this call —
# presumably the AutoEncoder's training loss; confirm in preprocessing.AutoEncoder.
preprocessed_data = data_preprocessing.fit_transform(train, target)

0.0461827
0.0298381
0.0190215
0.0147346
0.0124333
0.0111777
0.0105226
0.00991962
0.00947829
0.00895393
0.00864824
0.00829739
0.00803534
0.00785614
0.0076699
0.00746025
0.00743011
0.0071934
0.00698975
0.00688393


In [148]:
# Apply the already-fitted preprocessing pipeline to the test matrix
# (transform only, nothing is refitted here).
preprocessed_data_test = data_preprocessing.transform(test)

# Create the output directory if needed so a fresh checkout does not fail on open().
os.makedirs('data/Preprocessing', exist_ok=True)

# Persist both preprocessed matrices so later sessions can skip the fit above.
with open('data/Preprocessing/train.pickle', 'wb') as f:
    pickle.dump(preprocessed_data, f)

with open('data/Preprocessing/test.pickle', 'wb') as f:
    pickle.dump(preprocessed_data_test, f)

In [85]:
from model import NeuralNet

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [109]:
# Logistic-regression baseline; class_weight='balanced' weights each class
# inversely to its frequency in the training labels.
model = LogisticRegression(class_weight='balanced')

In [110]:
# Train on the preprocessed training matrix; the fitted estimator's repr is
# what the cell displays.
model.fit(preprocessed_data, target)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [149]:
# NOTE: in-sample predictions — these are the same rows the model was fitted
# on, so any score derived from `pred` is optimistic.
pred = model.predict(preprocessed_data)

In [150]:
# In-sample sanity check on the hard predictions (labels assumed to be 0/1).
print(((pred == 1) & (target == 1)).sum())  # predicted 1 and label is 1
print(target.sum())                         # sum of the true labels
print(pred.sum())                           # sum of the predicted labels

12110
21694
232886


In [151]:
# Class probabilities for the preprocessed test rows (one column per class).
pred_test = model.predict_proba(preprocessed_data_test)

In [152]:
# Sanity check: expected to be (number of test rows, number of classes).
pred_test.shape

(892816, 2)

In [161]:
# Submission frame: test ids paired with column 1 of predict_proba, i.e. the
# probability of the second entry in model.classes_ (the positive class if the
# labels are 0/1 — confirm).
result = pd.DataFrame({'id':df_test['id'],'target': pred_test[:,1]})

In [163]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
result.to_csv('data/result.csv', index=False)