In [1]:
# coding: utf-8
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing, model_selection

In [2]:
def ginic(actual, pred):
    actual = np.asarray(actual) #In case, someone passes Series or list
    n = 50
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_s.sum() - (n + 1) / 2.0
    return giniSum / n

# ## Load Data
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

d_median = train.median(axis=0)
d_mean = train.mean(axis=0)

def transform_df(df):
    df = pd.DataFrame(df)
    dcol = [c for c in df.columns if c not in ['id','target']]
    df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
    df['negative_one_vals'] = np.sum((df[dcol]==-1).values, axis=1)
    for c in dcol:
        if '_bin' not in c:
            df[c+str('_median_range')] = (df[c].values > d_median[c]).astype(int)
            df[c+str('_mean_range')] = (df[c].values > d_mean[c]).astype(int)
    return df

def prepare_data(dataset,Useless_fatures, name_target = 'target', replace_Nan = -100000):
    if replace_Nan != False:
        dataset.replace(-1, replace_Nan )

    if name_target in dataset:
        X = dataset.drop([name_target],1)
        X = X.drop(Useless_fatures,1)
        y = dataset[name_target]
        return X , y
    else:
        return dataset.drop(Useless_fatures,1)


In [3]:
target_0 = train['target']==0
target_1 = train['target']==1
train_down_samp = train[target_1].append(train[target_0][0:sum(target_1)*3])
#sum(new_train['target']==1)/len(new_train)

Useless_fatures = ['ps_ind_10_bin','ps_ind_11_bin','ps_ind_13_bin','ps_calc_20_bin','ps_ind_12_bin','ps_calc_15_bin','id']
#Useless_fatures = train.columns[train.columns.str.startswith('ps_calc_')] 
new_df_train = transform_df(train_down_samp)
X_50 , y_50 = prepare_data(new_df_train,['id'])

X_train, X_test, y_train, y_test = model_selection.train_test_split(X_50 , y_50, test_size=0.02)

print("Size of:")
print("- Training-set:\t\t{}".format(len(X_train)))
print("- Test-set:\t\t{}".format(len(X_test)))


Size of:
- Training-set:		85040
- Test-set:		1736


In [9]:
#input size
input_size = X_train.shape[1]
num_classes = 1


# ### Placeholder variables
x = tf.placeholder(tf.float32, [None, input_size])
y_true = tf.placeholder(tf.float32, [None,1]) ##[None, num_classes]

# ### Helper-functions for creating new variables
def new_weights(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def new_biases(length):
    return tf.Variable(tf.constant(0.05, shape=[length]))


# ### Helper-function for creating a new Fully-Connected Layer
def new_fc_layer(input,          # The previous layer.
                 num_inputs,     # Num. inputs from prev. layer.
                 num_outputs,    # Num. outputs.
                 use_relu=True,  # Use Rectified Linear Unit (ReLU)?
                 use_sig=False):
    # Create new weights and biases.
    weights = new_weights(shape=[num_inputs, num_outputs])
    biases = new_biases(length=num_outputs)

    # Calculate the layer as the matrix multiplication of
    # the input and weights, and then add the bias-values.
    layer = tf.matmul(input, weights) + biases

    # Use ReLU?
    if use_relu:
        layer = tf.nn.relu(layer)
    elif use_sig:
        layer = tf.nn.sigmoid(layer)
    return layer


# ###  Layers
#hyperparameters
n_nodes_hl1 = 1300
n_nodes_hl2 = 2200
n_nodes_hl3 = 500

layer_h1 = new_fc_layer(input=x,
                         num_inputs=input_size,
                         num_outputs=n_nodes_hl1,
                         use_relu=True)

layer_h2 = new_fc_layer(input=layer_h1,
                         num_inputs=n_nodes_hl1,
                         num_outputs=n_nodes_hl2,
                         use_relu=False,
                         use_sig = True)

layer_h3 = new_fc_layer(input=layer_h2,
                         num_inputs=n_nodes_hl2,
                         num_outputs=n_nodes_hl3,
                         use_relu=False,
                         use_sig = True)

output_layer = new_fc_layer(input=layer_h3,
                         num_inputs=n_nodes_hl3,
                         num_outputs=num_classes,
                         use_relu=False,
                         use_sig = True)


# ### Cost-function and Optimizer
learning_rate = tf.placeholder(tf.float32, shape=[])
error = tf.square(output_layer - y_true)/2
cost = tf.reduce_sum(error + error*y_true*0.5)
optimizer = tf.train.AdamOptimizer(learning_rate= learning_rate).minimize(cost)  #  1e-4


# ### Performance measures
correct_prediction = tf.equal(tf.round(output_layer), y_true)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))




session = tf.Session()

session.run(tf.global_variables_initializer())


# ### Helper-functions to show performance
feed_dict_test = {x: X_test,
                  y_true: y_test.values.reshape([-1,1])}

def print_accuracy(r_valu = False):
    # Use TensorFlow to compute the accuracy.
    acc = session.run(accuracy, feed_dict=feed_dict_test)
    if r_valu:
        return acc
    # Print the accuracy.
    print("Accuracy on test-set: {0:.1%}".format(acc))

print_accuracy()


saver = tf.train.Saver()  #search tf.saver.tensorflow

# ### Helper-function to perform optimization iterations
batch_size = 50
best_validation_accuracy = 0
def optimize( hm_epochs = 10, learning_r = 1e-4):
    global best_validation_accuracy
     
    for epoch in range(hm_epochs):
    		epoch_loss = 0
    		i = 0
    		len_train = X_train.shape[0]
    		while i < len_train:
    			start = i
    			end = i + batch_size

    			batch_x = np.array(X_train[start:end])
    			batch_y = np.array(y_train[start:end].values.reshape([-1,1]))
    			# Put the batch into a dict with the proper names
    			# for placeholder variables in the TensorFlow graph.
    			# Note that the placeholder for y_true_cls is not set
    			# because it is not used during training.

    			feed_dict_train = {x: batch_x,
    							   y_true: batch_y,
    							   learning_rate: learning_r}

    			# Run the optimizer using this batch of training data.
         		# TensorFlow assigns the variables in feed_dict_train
    			# to the placeholder variables and then runs the optimizer.
    			_, c = session.run([optimizer,cost], feed_dict=feed_dict_train)
    			#session.run(optimizer, feed_dict=feed_dict_train)
    			epoch_loss += c

    			i += batch_size

    			if i % 20000 == 0  or i == (len_train - 1):


    				acc_ = print_accuracy(True)
    				if acc_ > best_validation_accuracy:
                    # Update the best-known validation accuracy.
    					best_validation_accuracy = acc_
    					saver.save(session, 'data/my model')
    					improved_str = '*'
    				else:
    					improved_str = ''

    				print('loss: {}, data used: {} / {},  acc: {}{}'.format(epoch_loss/20,i,len_train,acc_,improved_str))
    				epoch_loss = 0



    print('Epoch {} completed out of {}'.format(epoch+1,hm_epochs))

    print('Optimization ended '+ 10*'_' + 'best accuracy: {}'.format(best_validation_accuracy))


# ## optimization

Accuracy on test-set: 23.7%


In [47]:
train = pd.read_csv("data/train.csv")
new_df_test = transform_df(train)
X_ , y_ = prepare_data(new_df_test,['id'])

print_accuracy()
feed_dict_test = {x: X_,
    							   y_true: y_}
print_accuracy()


Accuracy on test-set: 57.0%


ValueError: Cannot feed value of shape (595212,) for Tensor 'Placeholder_25:0', which has shape '(?, 1)'

In [None]:
a = ['a','b']
a+a

In [29]:
restore= '' # input('/n/n restore previous model? []/no')
if restore == '':
        #saver.restore(session,'model.ckpt')
        pass

In [10]:
optimize(6,1e-4)

loss: 125.6570591211319, data used: 20000 / 85040,  acc: 0.7517281174659729*
loss: 123.58367420434952, data used: 40000 / 85040,  acc: 0.7442396283149719
loss: 121.60196077823639, data used: 60000 / 85040,  acc: 0.7620967626571655*
loss: 122.26655447483063, data used: 80000 / 85040,  acc: 0.7638248801231384*
loss: 121.1341554403305, data used: 20000 / 85040,  acc: 0.7430875301361084
loss: 121.73337571620941, data used: 40000 / 85040,  acc: 0.7465437650680542
loss: 120.16090667247772, data used: 60000 / 85040,  acc: 0.7586405277252197
loss: 121.22797577381134, data used: 80000 / 85040,  acc: 0.757488489151001
loss: 120.27952246665954, data used: 20000 / 85040,  acc: 0.7459677457809448
loss: 121.12003929615021, data used: 40000 / 85040,  acc: 0.7459677457809448
loss: 119.51142902374268, data used: 60000 / 85040,  acc: 0.7528801560401917
loss: 120.53036869764328, data used: 80000 / 85040,  acc: 0.7563363909721375
loss: 119.69784064292908, data used: 20000 / 85040,  acc: 0.7413594722747803

In [25]:
test = pd.read_csv("data/test.csv")

X_sub = transform_df(test)
X_sub = prepare_data(X_sub,['id'])
sub1 = session.run(output_layer,feed_dict={x:X_sub[:int(892816/2)]})

In [26]:
X_sub.head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_10_median_range,ps_calc_10_mean_range,ps_calc_11_median_range,ps_calc_11_mean_range,ps_calc_12_median_range,ps_calc_12_mean_range,ps_calc_13_median_range,ps_calc_13_mean_range,ps_calc_14_median_range,ps_calc_14_mean_range
0,0,1,8,1,0,0,1,0,0,0,...,1,1,0,0,0,0,0,0,1,1
1,4,2,5,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,1,1
2,5,1,3,0,0,0,0,0,1,0,...,1,1,0,0,0,0,0,0,0,0
3,0,1,6,0,0,1,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
4,5,1,7,0,0,0,0,0,1,0,...,1,1,0,0,0,0,0,0,0,0


In [27]:
sub2 = session.run(output_layer,feed_dict={x:X_sub[int(892816/2):]})

In [28]:
sub_tot = np.append(sub1,sub2)

In [29]:
test_id = test.id.values

sub = pd.DataFrame()
sub['id'] = test_id
sub['target'] = sub_tot

#series = pd.Series(sub_tot)
sub.to_csv('submission_{}.csv'.format(int(best_validation_accuracy*100)) ,index = False)

In [25]:
best_validation_accuracy

0.60368663

In [None]:
892816*0.4


In [14]:
df2 = transform_df(train)

In [15]:
df2.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_10_median_range,ps_calc_10_mean_range,ps_calc_11_median_range,ps_calc_11_mean_range,ps_calc_12_median_range,ps_calc_12_mean_range,ps_calc_13_median_range,ps_calc_13_mean_range,ps_calc_14_median_range,ps_calc_14_mean_range
0,7,0,2,2,5,1,0,0,1,0,...,0,0,1,1,0,0,1,1,1,1
1,9,0,1,1,7,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
2,13,0,5,4,9,1,0,0,0,1,...,0,0,0,0,1,1,1,1,0,0
3,16,0,0,1,2,0,0,1,0,0,...,0,0,0,0,1,1,1,1,1,1
4,17,0,0,2,0,1,0,1,0,0,...,1,1,0,0,0,0,0,0,0,0
