In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
from sklearn.decomposition import PCA 
from sklearn.preprocessing import normalize
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout
from keras.utils import np_utils
from keras.optimizers import RMSprop
from keras import backend as K

Using TensorFlow backend.


## Load the data
dataset is [datax, datay]

In [3]:
datax = pd.read_csv('../feature.csv', sep='\t')
datay = pd.read_csv('output_y.csv', sep='\t')

In [4]:
# merge x and y, and drop NA
pd_data = pd.concat([datax,datay],axis=1)
pd_data = pd_data.dropna(axis=0, how='any') 
#Remove bad data
pd_data = pd_data[~pd_data['discount_rate'].isin(['販売価格'])]
pd_data = pd_data[~pd_data['discount_rate'].isin(['ダミー'])]

### normalization

In [5]:
data1 = pd_data[['deal_category_cd','shop_id','deal_type_cd','deal_price','discount_rate','month','duration_days']]
data2 = pd_data.drop(['deal_category_cd','shop_id','deal_type_cd','deal_price','discount_rate','month','duration_days'], axis=1)
data1 = normalize(data1, axis=0, norm='max')
data2 = np.array(data2)
data1 = DataFrame(data1)
data2 = DataFrame(data2)

pd_data_normalized = pd.concat([data1,data2],axis=1)

dataset = np.array(pd_data_normalized)
dataset = dataset.astype(np.float64)

### pick data gruop via range of y

In [6]:
def dataPick(oriData, lowLmt, upLmt, idx_y=207):
    result = oriData[oriData[:,idx_y]<upLmt,:]
    result = result[result[:,idx_y]>=lowLmt,:]
    return result

dataPick(dataset, 350, 100000).shape[0]

556

# Set dataset

In [7]:
def sepTrainX(dataset, percentage=0.9):
    train = dataset[:np.trunc(len(dataset)*percentage).astype(np.int)]
    test = dataset[np.trunc(len(dataset)*percentage).astype(np.int):]
    return train, test

def sepXY(dataset, feature_dim, percentage=0.9):   
    traindata = dataset[:np.trunc(len(dataset)*percentage).astype(np.int)]
    testdata = dataset[np.trunc(len(dataset)*percentage).astype(np.int):]
    x_train = traindata[:,0:feature_dim]
    y_train = traindata[:,feature_dim:]
    x_test = testdata[:,0:feature_dim]
    y_test = testdata[:,feature_dim:]
    return x_train, y_train, x_test, y_test

## Set class tag

In [9]:
def Amt2Cat(amount, keys, numClasses):
    result = []
    for i in amount:
        y_class = 0
        for j in keys:
            if i<j:
                result = np.append(result,[y_class],0)
                break
            y_class = y_class+1
            if y_class == numClasses-1:
                result = np.append(result,[y_class],0)
                break
    result = np_utils.to_categorical(result.astype(np.int), num_classes=numClasses)
    return result

# Set evlu  func

In [10]:
def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

    
def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())
    
def f1(y_true, y_pred):
    """true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))    
    recall = true_positives / (possible_positives + K.epsilon())
    precision = true_positives / (predicted_positives + K.epsilon())"""
    myprecision = precision(y_true, y_pred)
    myrecall = recall(y_true, y_pred)
    return 2*((myprecision*myrecall)/(myprecision+myrecall+K.epsilon()))

def tp(y_true, y_pred):
    return K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))

def fp(y_true, y_pred):
    return K.sum(K.round(K.clip(y_pred, 0, 1)))-K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))

def fn(y_true, y_pred):
    return K.sum(K.round(K.clip(y_true, 0, 1)))-K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))

# Determine size of bins

In [188]:
def setKeys(num_bins, data):
    sorted_data = sorted(data,key = lambda data: data[feature_dim])
    keys = []
    total_size = data.shape[0]
    left_size = total_size
    used_size = 0
    key = sorted_data[0][feature_dim]
    last_key = sorted_data[0][feature_dim]
    for num in range(0, num_bins-1):
        critical_size = left_size//(num_bins-num)
        key = sorted_data[used_size+critical_size][feature_dim]+1
        used_size = used_size + dataPick(dataset, last_key, key).shape[0]
        left_size = total_size - used_size
        last_key = key
        keys.append(key)
    return keys

# SA>=100

In [203]:
def setModel(num_bins):
    model = Sequential()

    model.add(Dense(256, input_dim=feature_dim, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu')) 
    model.add(Dropout(0.2))
    
    model.add(Dense(num_bins, activation='softmax')) 

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy',
                                                                               recall, precision, f1])
    
    return model

In [204]:
feature_dim = 207

# Set range of data
data = dataPick(dataset, 0, 100000)

# Seperate x and y  from dataset
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [207]:
for num_bins in range(2,4):
    accuracys=[]
    keys = setKeys(num_bins, data)
    print('Setting ------------')
    print('number of bins is', num_bins, '\nkeys is', keys)
    y_train_class = Amt2Cat(y_train, keys, num_bins)
    y_test_class = Amt2Cat(y_test, keys, num_bins)
    
    # training & testing
    print('Training -----------')
    model = setModel(num_bins)
    model.fit(x_train, y_train_class, epochs=1, batch_size = 10)
    
    print('\nTesting ------------')
    loss, accuracy, testrecall, testprecision, testf1 = model.evaluate(x_test, y_test_class, batch_size=10)
    print('test loss: ', loss)
    print('test accuracy: ', accuracy)
    print('recall: ', testrecall)
    print('precision: ', testprecision)
    print('f1:', testf1)
    print('\n\n\n\n')
    accuracys.append(accuracy)

Setting ------------
number of bins is 2 
keys is [1.0]
Training -----------
Epoch 1/1

Testing ------------
test loss:  0.7051001404250282
test accuracy:  0.5651981861068877
recall:  0.5651981861068877
precision:  0.5651981861068877
f1: 0.5651981473435539





Setting ------------
number of bins is 3 
keys is [1.0, 6.0]
Training -----------
Epoch 1/1

Testing ------------
test loss:  1.1664774194900527
test accuracy:  0.5044180799761872
recall:  0.3908735214394245
precision:  0.625964265729601
f1: 0.46385625203870606







In [208]:
accuracys

[0.5044180799761872]