In [8]:
import logging
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import KFold
import numpy as np

from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Activation, Dropout, Flatten, Dense

physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
config = tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [18]:
# Load data
columns = ['A1', 'A2','A3','A4', 'A5', 'A6', 'A7', 'A8','A9', 'A10', 'A11','A12', 'A13', 'A14', 'A15', 'label']
df = pd.read_csv('../data/crx.data', names=columns)
logging.debug(df.head())
print(df['label'].values[0])


df.replace('?', np.nan, inplace=True)
df['label'].replace('+', 1, inplace=True)
df['label'].replace('-', 0, inplace=True)
df.dropna(how='any', inplace=True)

+


In [19]:
# Convert data
# df = pd.get_dummies(df, columns=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A11', 'A12', 'A13'])
# df = pd.get_dummies(df, columns=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A11', 'A12', 'A13'])

for col in ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A11', 'A12', 'A13']:
    df[col] = pd.Categorical(df[col])
    df[col] = df[col].cat.codes

for col in df.columns:
    if col == 'label':
        continue
    df[col].astype(float)
print(df.head())
print(df.shape)

   A1     A2     A3  A4  A5  A6  A7    A8  A9  A10  A11  A12  A13    A14  A15  \
0   1  30.83  0.000   1   0  12   7  1.25   1    1    1    0    0  00202    0   
1   0  58.67  4.460   1   0  10   3  3.04   1    1    6    0    0  00043  560   
2   0  24.50  0.500   1   0  10   3  1.50   1    0    0    0    0  00280  824   
3   1  27.83  1.540   1   0  12   7  3.75   1    1    5    1    0  00100    3   
4   1  20.17  5.625   1   0  12   7  1.71   1    0    0    0    2  00120    0   

   label  
0      1  
1      1  
2      1  
3      1  
4      1  
(653, 16)


In [28]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = MinMaxScaler()
# print(df.shape)
# df = pd.DataFrame(scaler.fit_transform(df[]), columns=['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'label'])

col_names = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15']
features = df[col_names]
scaler = StandardScaler().fit(features.values)
features = pd.DataFrame(scaler.transform(features.values))

print(features.describe())
df[col_names] = features
print(df.columns)
df.describe()
df['label'].values

                 0             1             2             3             4   \
count  5.840000e+02  5.840000e+02  5.840000e+02  5.840000e+02  5.840000e+02   
mean  -3.041707e-18  6.083414e-18 -3.650048e-17 -6.995926e-17  1.034180e-16   
std    1.000857e+00  1.000857e+00  1.000857e+00  1.000857e+00  1.000857e+00   
min   -1.516389e+00 -1.515384e+00 -9.554088e-01 -2.872162e+00 -5.522982e-01   
25%   -1.516389e+00 -7.535578e-01 -7.496806e-01 -5.327720e-01 -5.522982e-01   
50%    6.594615e-01 -2.609157e-01 -4.030087e-01 -5.327720e-01 -5.522982e-01   
75%    6.594615e-01  5.336960e-01  5.282084e-01 -5.327720e-01 -5.522982e-01   
max    6.594615e-01  3.920071e+00  4.583429e+00  1.806618e+00  1.819335e+00   

                 5             6             7             8             9   \
count  5.840000e+02  5.840000e+02  5.840000e+02  5.840000e+02  5.840000e+02   
mean  -4.866731e-17  2.129195e-17 -2.433366e-17  4.866731e-17  2.433366e-17   
std    1.000857e+00  1.000857e+00  1.000857e+00  1.

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [13]:
def create_model2(optimizer='rmsprop', init='glorot_uniform'):
    model = Sequential()
    model.add(Conv1D(filters=8,
                     kernel_size=2,
                     input_shape=(68, 1),
                     kernel_initializer=init,
                     activation='relu'
                     ))
    model.add(MaxPooling1D())

    model.add(Conv1D(8, 2, activation='relu'))
    model.add(MaxPooling1D())

    model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
    model.add(Dense(units=8, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(units=1, activation='sigmoid'))
    # opt = Keras.optimizers.SGD(lr=0.01, momentum=0.9)
    model.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy'])

    return model

In [46]:
def train_with_kfold(model):
    n_folds = 5

    train_data = df.drop(['label'],axis=1).values
    train_label = df['label'].values

    kfold = KFold(n_folds, shuffle=True, random_state=1)
    for train_ix, test_ix in kfold.split(train_data):
        # select rows for train and test
        trainX, trainY, testX, testY = train_data[train_ix], train_label[train_ix], \
                                       train_data[test_ix], train_label[test_ix]

        # fit model
        trainX = np.asarray(trainX, dtype=float)
        testX = np.asarray(testX, dtype=float)

        trainX = np.expand_dims(trainX, axis=2)
        testX = np.expand_dims(testX, axis=2)

        trainY = np.asarray(trainY)
        testY = np.asarray(testY)

        history = model.fit(trainX, trainY, epochs=100, batch_size=32, validation_data=(testX, testY), verbose=0)
        
        evaluate(model, testX, testY)
        

In [47]:
def evaluate(model, testX, testY):
    # evaluate model
    _, acc = model.evaluate(testX, testY, verbose=0)
    print('Accuracy> %.3f' % (acc * 100.0))
    
    print(model.predict(testX))
    y_pred = np.where(model.predict(testX) > 0.5, 1, 0)
    print(y_pred)
    f1 = f1_score(testY, y_pred , average="macro")
    print('F1 score> %.3f' % (f1 * 100.0))
    
#     print(testX[:1])
#     y_pred = model.predict(testX)
#     print(y_pred)
#     f1 = f1_score(testY, y_pred , average="macro")
#     print("F1 score> {}".format(f1))

In [48]:
def create_model():
    inputs=tf.keras.Input(shape=15)

    x = tf.keras.layers.Dense(30, activation=tf.nn.relu, kernel_regularizer=tf.keras.regularizers.l2(0) )(inputs)
    x = tf.keras.layers.Dense(50, activation=tf.nn.relu)(x)
    x = tf.keras.layers.Dense(50, activation=tf.nn.relu)(x)
    output = tf.keras.layers.Dense(1, activation=tf.nn.softmax)(x)

    model = tf.keras.Model(inputs=inputs, outputs=output)
    logging.info(model.summary())

    # Compile model
    opt = tf.keras.optimizers.SGD(lr=0.001, momentum=0.9)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [49]:
from sklearn.metrics import f1_score
model = create_model()
train_with_kfold(model)

Model: "model_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        [(None, 15)]              0         
_________________________________________________________________
dense_40 (Dense)             (None, 30)                480       
_________________________________________________________________
dense_41 (Dense)             (None, 50)                1550      
_________________________________________________________________
dense_42 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_43 (Dense)             (None, 1)                 51        
Total params: 4,631
Trainable params: 4,631
Non-trainable params: 0
_________________________________________________________________
Accuracy> 45.038
[[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1