In [1]:
import pandas as pd
import numpy as np

#### Import and basic dataset info
Importing test and train datasets

In [2]:
train_data = pd.read_csv("skill-task/train.csv",sep = ',')
test_data = pd.read_csv("skill-task/test.csv",sep = ',')

train_data = train_data.set_index('sample_id')
test_data = test_data.set_index('sample_id')

In [46]:
train_data

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1603,f1604,f1605,f1606,f1607,f1608,f1609,f1610,f1611,y
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sample_0,25.609375,6.703125,3.652344,10.039062,169.375,102.8125,1.422852,6.722656,8.015625,8.109375,...,8.070312,4.363281,5.019531,5.710938,6.343750,6.843750,7.289062,7.617188,7.980469,1.0
sample_1,18.343750,5.824219,2.966797,4.902344,164.625,71.8125,1.357422,5.894531,2.753906,7.406250,...,7.359375,4.195312,4.808594,5.425781,5.949219,6.339844,6.730469,7.074219,7.175781,1.0
sample_2,28.562500,6.230469,3.583984,7.882812,159.500,113.1875,1.696289,6.316406,4.605469,7.753906,...,8.562500,4.523438,5.097656,5.789062,6.457031,6.871094,7.386719,7.878906,8.328125,1.0
sample_3,28.062500,6.132812,2.726562,6.378906,169.750,111.0000,1.535156,6.199219,3.712891,7.664062,...,4.558594,3.533203,3.900391,4.261719,4.042969,3.869141,3.890625,4.042969,4.273438,1.0
sample_4,20.109375,6.144531,3.203125,6.035156,164.750,78.8750,1.281250,6.187500,4.003906,7.734375,...,6.613281,4.625000,4.996094,5.328125,5.593750,5.800781,6.027344,6.242188,6.449219,0.0
sample_5,19.406250,5.933594,2.816406,5.878906,163.000,76.1250,1.381836,6.003906,5.082031,7.515625,...,4.832031,3.818359,4.125000,4.425781,4.824219,4.652344,4.796875,4.859375,5.000000,0.0
sample_6,22.125000,6.191406,3.263672,7.093750,164.500,87.9375,1.465820,6.250000,5.003906,7.683594,...,6.707031,4.339844,4.929688,5.527344,6.023438,6.207031,6.496094,6.753906,6.566406,1.0
sample_7,24.453125,6.433594,3.535156,8.929688,164.125,96.6250,1.457031,6.480469,4.050781,7.917969,...,7.609375,4.464844,5.070312,5.632812,6.226562,6.550781,6.917969,7.261719,7.613281,1.0
sample_8,21.562500,6.222656,2.722656,7.000000,165.375,84.8125,1.350586,6.269531,5.093750,7.761719,...,4.363281,3.570312,3.845703,4.183594,4.574219,4.121094,4.183594,4.394531,4.589844,0.0
sample_9,19.281250,5.878906,2.912109,5.367188,162.000,75.6875,1.414062,5.957031,3.130859,7.460938,...,6.261719,4.078125,4.578125,5.093750,5.562500,5.593750,5.953125,6.183594,6.171875,1.0


Get basic dataset info

In [4]:
# getting class number
class_num = len(train_data['y'].unique())
print("Number of classes: %s"%class_num)
# getting samples number
sample_num_train = len(train_data.index)
sample_num_test = len(test_data.index)
print("Training set size: %s \nTest set size: %s"%(sample_num_train,sample_num_test))
# getting features number
features_num = len(train_data.columns) - 1 # -1 one column is y
print("Number of features: %s"%features_num)

Number of classes: 2
Training set size: 1095 
Test set size: 194
Number of features: 1612


In [5]:
# Separate y column from all others
def get_y(imput_df, y_column = 'y'):
    y_vector = np.array(imput_df[y_column])
    x_frame = imput_df.drop(y_column, inplace=False, axis=1)
    return x_frame, y_vector

In [6]:
train_x_df, train_y = get_y(train_data)

#### Data preparation.
1. At first we drop feature columns with more then 30% missing data points. There is no point in using columns with too much artificial(created with imputing) data points.
2. Then we will fix missing and inf data points. For this we will use impute missing values with mean of this feature column.
3. Then we will scale data in each column for it to be in [0,1] boundaries.

In [7]:
# Drop feature columns with not enough data
drop_column_rate = 0.3

def drop_columns(input_df, drop_column_rate):
    dropped_columns = []
    sample_num = len(input_df.index)
    # get number of samples for column to be dropped
    sample_limit = int(sample_num * drop_column_rate)
    
    for column in input_df.columns:
        input_df.loc[input_df[column] >= 1E308, column] = np.nan # Fill all the infinities 
        if input_df[column].isnull().sum() > sample_limit: # check for number of empty data points in a feature column
            dropped_columns.append(column)

    cleaned_df = input_df.drop(dropped_columns, inplace=False, axis=1)
    
    return cleaned_df, dropped_columns

In [8]:
cl_train_x, dropped_columns = drop_columns(train_x_df, drop_column_rate)
cleaned_features = len(cl_train_x.columns) - 1 # -1 couse one column is y
print("Number of features after cleanup: %s"%cleaned_features)

Number of features after cleanup: 1449


In [9]:
# Now we will get numpy array for trainig
train_x = cl_train_x.values

In [45]:
def count_nan(input_matrix):
    nancount = 0
    for i in range(input_matrix.shape[0]):
        nancount += sum(np.isnan(input_matrix[i]))
    return nancount

print("Number of nans is %i"%count_nan(train_x))

Number of nans is 6676


In [12]:
# Continue with imputing data

from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_train_x = imp.fit_transform(train_x)  

In [13]:
# Lets check if all the nans are gone
print("Number of nans after imputing %i"%count_nan(imputed_train_x))

Number of nans is 0


Done with imputing, lets scale our data

In [14]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_train_x = scaler.fit_transform(imputed_train_x)

#### Neural net training

In [15]:
epochs = 150
val_split = 0.05 # We cant get too big of a validation with just 1000 training examples, so 5% will do
patience = 50 # epochs to wait for early stopping

In [16]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint
#create model
model = Sequential()

#get number of columns in training data
n_cols = scaled_train_x.shape[1]

#add model layers
model.add(Dense(512, activation='relu', input_shape=(n_cols,)))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='mse', metrics = ['accuracy'])

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

save_history = model.fit(scaled_train_x, train_y, shuffle = True, batch_size = 128, epochs=epochs, verbose=1, validation_split=val_split, callbacks=[mc, es])

model = load_model('best_model.h5')

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 1040 samples, validate on 55 samples
Epoch 1/150

Epoch 00001: val_acc improved from -inf to 0.69091, saving model to best_model.h5
Epoch 2/150

Epoch 00002: val_acc improved from 0.69091 to 0.72727, saving model to best_model.h5
Epoch 3/150

Epoch 00003: val_acc did not improve from 0.72727
Epoch 4/150

Epoch 00004: val_acc did not improve from 0.72727
Epoch 5/150

Epoch 00005: val_acc did not improve from 0.72727
Epoch 6/150

Epoch 00006: val_acc improved from 0.72727 to 0.74545, saving model to best_model.h5
Epoch 7/150

Epoch 00007: val_acc did not improve from 0.74545
Epoch 8/150

Epoch 00008: val_acc did not improve from 0.74545
Epoch 9/150

Epoch 00009: val_acc improved from 0.74545 to 0.76364, saving model to best_model.h5
Epoch 10/150

Epoch 00010: val_acc did not improve from 0.76364
Epoch 11/150

Epoch 00011: val_acc did not improve from 0.76364
Ep


Epoch 00040: val_acc did not improve from 0.85455
Epoch 41/150

Epoch 00041: val_acc did not improve from 0.85455
Epoch 42/150

Epoch 00042: val_acc did not improve from 0.85455
Epoch 43/150

Epoch 00043: val_acc did not improve from 0.85455
Epoch 44/150

Epoch 00044: val_acc did not improve from 0.85455
Epoch 45/150

Epoch 00045: val_acc did not improve from 0.85455
Epoch 46/150

Epoch 00046: val_acc did not improve from 0.85455
Epoch 47/150

Epoch 00047: val_acc did not improve from 0.85455
Epoch 48/150

Epoch 00048: val_acc did not improve from 0.85455
Epoch 49/150

Epoch 00049: val_acc did not improve from 0.85455
Epoch 50/150

Epoch 00050: val_acc did not improve from 0.85455
Epoch 51/150

Epoch 00051: val_acc did not improve from 0.85455
Epoch 52/150

Epoch 00052: val_acc did not improve from 0.85455
Epoch 53/150

Epoch 00053: val_acc did not improve from 0.85455
Epoch 54/150

Epoch 00054: val_acc did not improve from 0.85455
Epoch 55/150

Epoch 00055: val_acc did not improve fr


Epoch 00083: val_acc did not improve from 0.85455
Epoch 84/150

Epoch 00084: val_acc did not improve from 0.85455
Epoch 85/150

Epoch 00085: val_acc did not improve from 0.85455
Epoch 86/150

Epoch 00086: val_acc did not improve from 0.85455
Epoch 87/150

Epoch 00087: val_acc did not improve from 0.85455
Epoch 88/150

Epoch 00088: val_acc did not improve from 0.85455
Epoch 89/150

Epoch 00089: val_acc did not improve from 0.85455
Epoch 90/150

Epoch 00090: val_acc did not improve from 0.85455
Epoch 91/150

Epoch 00091: val_acc did not improve from 0.85455
Epoch 92/150

Epoch 00092: val_acc did not improve from 0.85455
Epoch 93/150

Epoch 00093: val_acc did not improve from 0.85455
Epoch 94/150

Epoch 00094: val_acc did not improve from 0.85455
Epoch 95/150

Epoch 00095: val_acc did not improve from 0.85455
Epoch 96/150

Epoch 00096: val_acc did not improve from 0.85455
Epoch 97/150

Epoch 00097: val_acc did not improve from 0.85455
Epoch 98/150

Epoch 00098: val_acc did not improve fr


Epoch 00126: val_acc did not improve from 0.85455
Epoch 127/150

Epoch 00127: val_acc did not improve from 0.85455
Epoch 128/150

Epoch 00128: val_acc did not improve from 0.85455
Epoch 129/150

Epoch 00129: val_acc did not improve from 0.85455
Epoch 130/150

Epoch 00130: val_acc did not improve from 0.85455
Epoch 131/150

Epoch 00131: val_acc did not improve from 0.85455
Epoch 132/150

Epoch 00132: val_acc did not improve from 0.85455
Epoch 133/150

Epoch 00133: val_acc did not improve from 0.85455
Epoch 134/150

Epoch 00134: val_acc did not improve from 0.85455
Epoch 135/150

Epoch 00135: val_acc did not improve from 0.85455
Epoch 136/150

Epoch 00136: val_acc did not improve from 0.85455
Epoch 137/150

Epoch 00137: val_acc did not improve from 0.85455
Epoch 138/150

Epoch 00138: val_acc did not improve from 0.85455
Epoch 139/150

Epoch 00139: val_acc did not improve from 0.85455
Epoch 140/150

Epoch 00140: val_acc did not improve from 0.85455
Epoch 141/150

Epoch 00141: val_acc did

In [39]:
%matplotlib notebook
import matplotlib.pyplot as plt

fig = plt.figure()
graph1 = plt.plot(save_history.history['acc'])
plt.title("Train accuracy")
plt.show()

fig = plt.figure()
graph1 = plt.plot(save_history.history['val_acc'])
plt.title("Validation accuracy")
plt.show()

fig = plt.figure()
graph1 = plt.plot(save_history.history['loss'])
plt.title("Train loss")
plt.show()

fig = plt.figure()
graph1 = plt.plot(save_history.history['val_loss'])
plt.title("Validation loss")
plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Analysis

Here we can see a number of troubles. 
1. Even if training accuracy rises to 90% level, validation accuracy jumps between different points pretty significanlty. So even with more training we just getting an overfitted model. 
2. Both validation loss and validation accuracy doesnt change much with more training. We can pretty much stop at the beggining and dont go for too much epochs. Even though some improvement can be seen. 

To reduce overfitting we can usually try adding regularization, but on previous tests there was no improvent for validation, test and even train sets. So i discarded the idea after trying dropout, l1 and l2 regularizations(with different parameters). 

I also tried fitting SVM on data to no avail. But i dont know how to tune it, so it was probably not that bad idea overall, just a bad idea for me. 

**Final thoughts**

In the end i can say that there is probably not enough data to get a significant improvement for results. Validation set is too small, and if we try to increase validation set, we lose even more training data, which we dont have alot too. 

##### Evaluate on training set

In [37]:
# Evaluating for roc curve and showing it for training set. 
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

y_pred_keras = model.predict(scaled_train_x).ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(train_y, y_pred_keras)
auc_keras = auc(fpr_keras, tpr_keras)

In [40]:
plt.figure()
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

<IPython.core.display.Javascript object>

##### Test data preprocess

In [32]:
cleaned_test = test_data.drop(dropped_columns, inplace=False, axis=1) # drop same columns from test set
for column in cleaned_test.columns:
    cleaned_test.loc[cleaned_test[column] >= 1E308, column] = np.nan # Fill all the infinities 
test_x = cleaned_test.values # Turn into numpy array
imputed_test = imp.transform(test_x) # Inpute with trained inputer
scaled_test = scaler.transform(imputed_test) # Scale with trained scaler

##### Predicting on test and saving results

In [33]:
test_y = model.predict(scaled_test)
y_list = test_y.reshape(len(test_y)).tolist()
test_df = pd.DataFrame({"sample_id": list(test_data.index), "y": y_list})
# Save submission
test_df.to_csv("submission.csv",sep = ',', index = False)