In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
from sklearn.decomposition import PCA 
from sklearn.preprocessing import normalize
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout
from keras.utils import np_utils
from keras.optimizers import RMSprop
from keras import backend as K


Using TensorFlow backend.


## Load the data

dataset is [datax, datay]

In [3]:
datax = pd.read_csv('../feature.csv', sep='\t')
datay = pd.read_csv('output_y.csv', sep='\t')

In [4]:
# merge x and y, and drop NA
pd_data = pd.concat([datax,datay],axis=1)
pd_data = pd_data.dropna(axis=0, how='any') 
#Remove bad data
pd_data = pd_data[~pd_data['discount_rate'].isin(['販売価格'])]
pd_data = pd_data[~pd_data['discount_rate'].isin(['ダミー'])]

### normalization

In [5]:
data1 = pd_data[['deal_category_cd','shop_id','deal_type_cd','deal_price','discount_rate','month','duration_days']]
data2 = pd_data.drop(['deal_category_cd','shop_id','deal_type_cd','deal_price','discount_rate','month','duration_days'], axis=1)
data1 = normalize(data1, axis=0, norm='max')
data2 = np.array(data2)
data1 = DataFrame(data1)
data2 = DataFrame(data2)

pd_data_normalized = pd.concat([data1,data2],axis=1)

dataset = np.array(pd_data_normalized)
dataset = dataset.astype(np.float64)

### pick data gruop via range of y

In [6]:
def dataPick(oriData, lowLmt, upLmt, idx_y=207):
    result = oriData[oriData[:,idx_y]<upLmt,:]
    result = result[result[:,idx_y]>=lowLmt,:]
    return result

In [7]:
dataPick(dataset, 350, 100000).shape[0]

556

# Set dataset

In [9]:
def sepXY(dataset, feature_dim, percentage=0.9):   
    traindata = dataset[:np.trunc(len(dataset)*percentage).astype(np.int)]
    testdata = dataset[np.trunc(len(dataset)*percentage).astype(np.int):]
    x_train = traindata[:,0:feature_dim]
    y_train = traindata[:,feature_dim:]
    x_test = testdata[:,0:feature_dim]
    y_test = testdata[:,feature_dim:]
    return x_train, y_train, x_test, y_test

## Set class tag

In [10]:
def biCat(amount,key):
    result = []
    for i in amount:
        if i<key:
            result = np.append(result,[0],0)
#        elif i>0 and i<200:
#            result = np.append(result,[1],0)
        else:
            result = np.append(result,[1],0)
    result = result.reshape(-1,1)
    result = result.astype(np.int)
    result = np_utils.to_categorical(result, num_classes=2)
    return result

In [11]:
def Amt2Cat(amount, keys, numClasses):
    result = []
    for i in amount:
        y_class = 0
        for j in keys:
            if i<j:
                result = np.append(result,[y_class],0)
                break
            y_class = y_class+1
            if y_class == numClasses-1:
                result = np.append(result,[y_class],0)
                break
    result = np_utils.to_categorical(result.astype(np.int), num_classes=numClasses)
    return result

In [24]:
def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

    
def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())
    
def f1(y_true, y_pred):
    """true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))    
    recall = true_positives / (possible_positives + K.epsilon())
    precision = true_positives / (predicted_positives + K.epsilon())"""
    myprecision = precision(y_true, y_pred)
    myrecall = recall(y_true, y_pred)
    return 2*((myprecision*myrecall)/(myprecision+myrecall+K.epsilon()))

def tp(y_true, y_pred):
    return K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))

def fp(y_true, y_pred):
    return K.sum(K.round(K.clip(y_pred, 0, 1)))-K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))

def fn(y_true, y_pred):
    return K.sum(K.round(K.clip(y_true, 0, 1)))-K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))

# Traning and testing

##  >100, 10 classes
### Pick data

In [12]:
feature_dim = 207

# Set range of data
data = dataPick(dataset, 100, 1000000)

# Seperate x and y  from dataset
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [13]:
# Set bins of y
keys=[110,120,130,140,160,190,250,350,500]

# Generate tags of y
y_train_class = Amt2Cat(y_train, keys, 10)
y_test_class = Amt2Cat(y_test, keys, 10)
#y_train_class = biCat(y_train, 200)
#y_test_class = biCat(y_test, 200)

In [14]:
counter = [0,0,0,0,0,0,0,0,0,0]
c = 0
for i in y_train:
    y_class = 0
    for j in keys:
        if i<j:
            counter[y_class] = counter[y_class]+1
            break
        y_class = y_class+1
        if y_class == 9:
            counter[y_class] = counter[y_class]+1
            break
    c=c+1
print(counter)
print(c)

[309, 247, 250, 206, 272, 342, 343, 340, 191, 291]
2791


In [15]:
counter = [0,0,0,0,0,0,0,0,0,0]
c=0
for i in y_test:
    y_class = 0
    for j in keys:
        if i<j:
            counter[y_class] = counter[y_class]+1
            break
        y_class = y_class+1
        if y_class == 9:
            counter[y_class] = counter[y_class]+1
            break
    c=c+1
print(counter)
print(c)

[31, 21, 16, 12, 37, 43, 41, 36, 25, 49]
311


In [16]:
dataPick(dataset, 140, 250).shape[0]

1078

### Build model and test

In [18]:
def setModel():
    model = Sequential()

    model.add(Dense(256, input_dim=feature_dim, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu')) 
    model.add(Dropout(0.2))
    
    model.add(Dense(10, activation='softmax')) 

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy',
                                                                               recall, precision, f1])
    
    return model

In [19]:
# training
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=15, batch_size = 10)

Training -----------
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f9ca465fb00>

In [20]:
print('\nTesting ------------')
loss, accuracy, testrecall, testprecision, testf1 = model.evaluate(x_test, y_test_class, batch_size=10)

print('test loss: ', loss)
print('test accuracy: ', accuracy)
print('recall: ', testrecall)
print('precision: ', testprecision)
print('f1:', testf1)


Testing ------------
test loss:  2.295408232419054
test accuracy:  0.19614148226198277
recall:  0.0739549845457077
precision:  0.3670953650183233
f1: 0.11251404354426639


##  >100, 5 classes

### Pick data

In [21]:
feature_dim = 207

# Set range of data
data = dataPick(dataset, 100, 1000000)

# Seperate x and y  from dataset
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [22]:
# Set bins of y
keys=[120,150,200,350]

# Generate tags of y
y_train_class = Amt2Cat(y_train, keys, 5)
y_test_class = Amt2Cat(y_test, keys, 5)

In [23]:
dataPick(data, 100, 120).shape[0]

608

### Build model and test

In [24]:
def setModel():
    model = Sequential()

    model.add(Dense(256, input_dim=feature_dim, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu')) 
    model.add(Dropout(0.2))
    
    model.add(Dense(5, activation='softmax')) 

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy',
                                                                               recall, precision, f1])
    
    return model

In [25]:
# training
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=5, batch_size = 10)

Training -----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f9ca4870320>

In [26]:
print('\nTesting ------------')
loss, accuracy, testrecall, testprecision, testf1 = model.evaluate(x_test, y_test_class, batch_size=10)

print('test loss: ', loss)
print('test accuracy: ', accuracy)
print('recall: ', testrecall)
print('precision: ', testprecision)
print('f1:', testf1)



Testing ------------
test loss:  1.5534871812802036
test accuracy:  0.2990353732726198
recall:  0.07073955041419272
precision:  0.3590567798093201
f1: 0.10821502164629111


##  >100, 3 classes

### Pick data

In [30]:
feature_dim = 207

# Set range of data
data = dataPick(dataset, 100, 1000000)

# Seperate x and y  from dataset
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [31]:
# Set bins of y
keys=[140,250]

# Generate tags of y
y_train_class = Amt2Cat(y_train, keys, 3)
y_test_class = Amt2Cat(y_test, keys, 3)

In [32]:
dataPick(data, 100, 120).shape[0]

608

### Build model and test

In [37]:
def setModel():
    model = Sequential()

    model.add(Dense(256, input_dim=feature_dim, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu')) 
    model.add(Dropout(0.2))
    
    model.add(Dense(3, activation='softmax')) 

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy',
                                                                               tp,fp,fn,recall,precision,f1])
    
    return model

In [38]:
# training
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=13, batch_size = 100)

Training -----------
Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13


<keras.callbacks.History at 0x7f435218d358>

In [40]:
print('\nTesting ------------')
loss, accuracy, ttp, tfp, tfn,  testrecall, testprecision, testf1 = model.evaluate(x_test, y_test_class, batch_size=20)

print('test loss: ', loss)
print('test accuracy: ', accuracy)
print('tp: ', ttp)
print('fp: ', tfp)
print('fn:', tfn)
print('recall: ', testrecall)
print('precision: ', testprecision)
print('f1:', testf1)



Testing ------------
test loss:  1.1543410058190202
test accuracy:  0.45659164322534174
recall:  0.24437299433053497
precision:  0.4592104994982385
f1: 0.31432108020475824


##  <100, 0 or not

### Pick data

In [12]:
feature_dim = 207

# Set range of data
data = dataPick(dataset, 0, 100)

# Seperate x and y  from dataset
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [13]:
data.shape

(155335, 208)

In [14]:
# Set bins of y
keys=[1]

# Generate tags of y
y_train_class = Amt2Cat(y_train, keys, 2)
y_test_class = Amt2Cat(y_test, keys, 2)
#y_train_class = biCat(y_train, 200)
#y_test_class = biCat(y_test, 200)

In [15]:
counter = [0,0]
c = 0
for i in y_train:
    y_class = 0
    for j in keys:
        if i<j:
            counter[y_class] = counter[y_class]+1
            break
        y_class = y_class+1
        if y_class == 1:
            counter[y_class] = counter[y_class]+1
            break
    c=c+1
print(counter)
print(c)

[83150, 56651]
139801


In [16]:
counter = [0,0]
c=0
for i in y_test:
    y_class = 0
    for j in keys:
        if i<j:
            counter[y_class] = counter[y_class]+1
            break
        y_class = y_class+1
        if y_class == 1:
            counter[y_class] = counter[y_class]+1
            break
    c=c+1
print(counter)
print(c)

[7997, 7537]
15534


### Build model and test

In [25]:
def setModel():
    model = Sequential()

    model.add(Dense(256, input_dim=feature_dim, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu')) 
    model.add(Dropout(0.2))
    
    model.add(Dense(512, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu')) 
    model.add(Dropout(0.2))
    
    model.add(Dense(2, activation='softmax')) 

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy',
                                                                               tp, fp, fn])
    
    return model

In [26]:
# training
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=7, batch_size = 100)

Training -----------
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7f43817da5f8>

In [27]:
print('\nTesting ------------')
loss, accuracy, testtp, testfp, testfn = model.evaluate(x_test, y_test_class, batch_size=10)

print('test loss: ', loss)
print('test accuracy: ', accuracy)
print('tp: ', testtp)
print('fp: ', testfp)
print('fn:', testfn)


Testing ------------
test loss:  0.6798461425910158
test accuracy:  0.5361787086721782
tp:  5.360628299214626
fp:  4.637826702716621
fn: 4.637826702716621


##  <100, 3 classes

### Pick data

In [2]:
feature_dim = 207

# Set range of data
data = dataPick(dataset, 0, 100)

# Seperate x and y  from dataset
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

NameError: name 'dataPick' is not defined

In [53]:
data.shape

(155335, 208)

In [54]:
# Set bins of y
keys=[1,10]

# Generate tags of y
y_train_class = Amt2Cat(y_train, keys, 3)
y_test_class = Amt2Cat(y_test, keys, 3)
#y_train_class = biCat(y_train, 200)
#y_test_class = biCat(y_test, 200)

In [55]:
counter = [0,0,0]
c = 0
for i in y_train:
    y_class = 0
    for j in keys:
        if i<j:
            counter[y_class] = counter[y_class]+1
            break
        y_class = y_class+1
        if y_class == 2:
            counter[y_class] = counter[y_class]+1
            break
    c=c+1
print(counter)
print(c)

[83150, 37244, 19407]
139801


In [56]:
counter = [0,0,0]
c=0
for i in y_test:
    y_class = 0
    for j in keys:
        if i<j:
            counter[y_class] = counter[y_class]+1
            break
        y_class = y_class+1
        if y_class == 2:
            counter[y_class] = counter[y_class]+1
            break
    c=c+1
print(counter)
print(c)

[7997, 5325, 2212]
15534


In [57]:
counter = 0
for i in y_train_class:
    if i[1] == 1:
        counter = counter + 1
print(counter)

37244


### Build model and test

In [58]:
def setModel():
    model = Sequential()

    model.add(Dense(256, input_dim=feature_dim, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu')) 
    model.add(Dropout(0.2))
    
    model.add(Dense(3, activation='softmax')) 

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy',
                                                                               tp, precision, f1])
    
    return model

In [59]:
# training
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=5, batch_size = 20)

Training -----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f9c2602e3c8>

In [60]:
print('\nTesting ------------')
loss, accuracy, testrecall, testprecision, testf1 = model.evaluate(x_test, y_test_class, batch_size=10)

print('test loss: ', loss)
print('test accuracy: ', accuracy)
print('recall: ', testrecall)
print('precision: ', testprecision)
print('f1:', testf1)


Testing ------------
test loss:  0.9843727154452413
test accuracy:  0.5141624859075558
recall:  0.379297030271174
precision:  0.6440077183514669
f1: 0.4571345203172536


## PCA

In [None]:
feature_dim = 207

In [None]:
feature = dataset[:,0:207]

In [21]:
pca=PCA(n_components=feature_dim)
feature_pca=pca.fit_transform(feature)

In [22]:
dataset_pca = np.concatenate((feature_pca,dataset[:,207:208]),axis=1)
dataset_pca.shape

(158437, 208)

In [23]:
dataset_pca_non_zero = dataset_pca[dataset_pca[:,feature_dim]!=0,:]
dataset_pca_non_zero.shape

(67290, 208)