In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
from sklearn.decomposition import PCA 
from sklearn.preprocessing import normalize
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout
from keras.utils import np_utils
from keras.optimizers import RMSprop
from keras import backend as K


Using TensorFlow backend.


## Load the data

dataset is [datax, datay]

In [3]:
datax = pd.read_csv('../feature.csv', sep='\t')
datay = pd.read_csv('output_y.csv', sep='\t')

In [4]:
datax

Unnamed: 0,deal_category_cd,shop_id,deal_type_cd,deal_price,discount_rate,month,duration_days,dspt_vec0,dspt_vec1,dspt_vec2,...,dspt_vec190,dspt_vec191,dspt_vec192,dspt_vec193,dspt_vec194,dspt_vec195,dspt_vec196,dspt_vec197,dspt_vec198,dspt_vec199
0,3,13869,1,3200,64,12,14,-0.487003,-0.487012,-0.300625,...,-0.129280,-0.008112,0.047803,-0.412086,-0.186055,-0.428090,-1.028613,-1.133246,0.102500,0.565922
1,8,10029,2,3130,0,12,7,-0.047797,-0.363245,0.328044,...,0.739456,-0.497679,0.344068,-0.614602,-0.931943,-0.404058,0.269973,-0.481750,-0.354643,0.680272
2,2,10479,2,3240,0,12,7,-0.236814,-0.042894,-0.312521,...,-0.496862,0.399478,-0.418426,-0.057347,-0.541013,-0.008830,-0.787262,-0.314165,-0.278627,0.008451
3,8,10029,2,3130,0,12,7,-0.035394,-0.398830,0.502443,...,0.606086,-0.485434,0.371555,-0.571355,-0.752632,-0.153454,0.511548,-0.381676,-0.207083,0.606146
4,9,8334,2,1400,35,12,14,-0.045568,-0.002191,-0.356119,...,-0.610163,0.271743,0.026410,-0.070720,-0.734749,-0.021287,-0.570350,-0.251306,-0.455745,0.053695
5,8,10029,2,3130,0,12,7,-0.157187,-0.196884,0.430383,...,0.556017,-0.516333,0.292942,-0.510219,-0.815570,-0.341179,0.393145,-0.299472,-0.290749,0.524980
6,9,8334,2,5480,37,12,14,-0.287976,0.077439,-0.724518,...,-0.730756,0.463583,-0.163860,-0.099533,-0.726087,-0.057904,-1.049588,-0.672365,-0.428750,0.020867
7,11,6593,2,6800,70,12,12,-0.401356,0.231198,-0.780717,...,-0.495988,0.503905,0.189778,-0.091715,-0.374547,0.154913,-0.720336,-1.186944,-0.503544,0.383948
8,8,10029,2,3130,0,12,7,-0.188005,-0.162056,0.440517,...,0.485737,-0.509055,0.205669,-0.514193,-0.662836,-0.327128,0.408174,-0.290132,-0.232235,0.469798
9,9,9833,2,3500,0,12,12,-0.282669,-0.026581,-0.610132,...,-0.822827,0.854255,-0.094156,0.024080,-0.746256,0.198009,-1.250182,-0.429074,-0.149078,0.051133


In [5]:
# merge x and y, and drop NA
pd_data = pd.concat([datax,datay],axis=1)
pd_data = pd_data.dropna(axis=0, how='any') 
#Remove bad data
pd_data = pd_data[~pd_data['discount_rate'].isin(['販売価格'])]
pd_data = pd_data[~pd_data['discount_rate'].isin(['ダミー'])]

### normalization

In [6]:
data1 = pd_data[['deal_category_cd','shop_id','deal_type_cd','deal_price','discount_rate','month','duration_days']]
data2 = pd_data.drop(['deal_category_cd','shop_id','deal_type_cd','deal_price','discount_rate','month','duration_days'], axis=1)
data1 = normalize(data1, axis=0, norm='max')
data2 = np.array(data2)
data1 = DataFrame(data1)
data2 = DataFrame(data2)

pd_data_normalized = pd.concat([data1,data2],axis=1)

dataset = np.array(pd_data_normalized)
dataset = dataset.astype(np.float64)

### pick data gruop via range of y

In [7]:
def dataPick(oriData, lowLmt, upLmt, idx_y=207):
    result = oriData[oriData[:,idx_y]<upLmt,:]
    result = result[result[:,idx_y]>=lowLmt,:]
    return result

In [8]:
dataPick(dataset, 350, 100000).shape[0]

556

# Set dataset

In [9]:
def sepTrainX(dataset, percentage=0.9):
    train = dataset[:np.trunc(len(dataset)*percentage).astype(np.int)]
    test = dataset[np.trunc(len(dataset)*percentage).astype(np.int):]
    return train, test

In [10]:
def sepXY(dataset, feature_dim, percentage=0.9):   
    traindata = dataset[:np.trunc(len(dataset)*percentage).astype(np.int)]
    testdata = dataset[np.trunc(len(dataset)*percentage).astype(np.int):]
    x_train = traindata[:,0:feature_dim]
    y_train = traindata[:,feature_dim:]
    x_test = testdata[:,0:feature_dim]
    y_test = testdata[:,feature_dim:]
    return x_train, y_train, x_test, y_test

## Set class tag

In [11]:
def biCat(amount,key):
    result = []
    for i in amount:
        if i<key:
            result = np.append(result,[0],0)
#        elif i>0 and i<200:
#            result = np.append(result,[1],0)
        else:
            result = np.append(result,[1],0)
    result = result.reshape(-1,1)
    result = result.astype(np.int)
    result = np_utils.to_categorical(result, num_classes=2)
    return result

In [12]:
def Amt2Cat(amount, keys, numClasses):
    result = []
    for i in amount:
        y_class = 0
        for j in keys:
            if i<j:
                result = np.append(result,[y_class],0)
                break
            y_class = y_class+1
            if y_class == numClasses-1:
                result = np.append(result,[y_class],0)
                break
    result = np_utils.to_categorical(result.astype(np.int), num_classes=numClasses)
    return result

# Traning and testing

##  >100, 10 classes
### Pick data

In [30]:
feature_dim = 207

# Set range of data
data = dataPick(dataset, 100, 1000000)

# Seperate x and y  from dataset
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [31]:
# Set bins of y
keys=[110,120,130,140,160,190,250,350,500]

# Generate tags of y
y_train_class = Amt2Cat(y_train, keys, 10)
y_test_class = Amt2Cat(y_test, keys, 10)
#y_train_class = biCat(y_train, 200)
#y_test_class = biCat(y_test, 200)

In [32]:
counter = [0,0,0,0,0,0,0,0,0,0]
c = 0
for i in y_train:
    y_class = 0
    for j in keys:
        if i<j:
            counter[y_class] = counter[y_class]+1
            break
        y_class = y_class+1
        if y_class == 9:
            counter[y_class] = counter[y_class]+1
            break
    c=c+1
print(counter)
print(c)

[309, 247, 250, 206, 272, 342, 343, 340, 191, 291]
2791


In [33]:
counter = [0,0,0,0,0,0,0,0,0,0]
c=0
for i in y_test:
    y_class = 0
    for j in keys:
        if i<j:
            counter[y_class] = counter[y_class]+1
            break
        y_class = y_class+1
        if y_class == 9:
            counter[y_class] = counter[y_class]+1
            break
    c=c+1
print(counter)
print(c)

[31, 21, 16, 12, 37, 43, 41, 36, 25, 49]
311


In [34]:
dataPick(dataset, 140, 250).shape[0]

1078

### Build model and test

In [67]:
def grtRecall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def getPrecision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
    
    
def getF1(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))    
    recall = true_positives / (possible_positives + K.epsilon())
    precision = true_positives / (predicted_positives + K.epsilon())
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [68]:
def setModel():
    model = Sequential()

    model.add(Dense(256, input_dim=feature_dim, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu')) 
    model.add(Dropout(0.2))
    
    model.add(Dense(10, activation='softmax')) 

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=[getF1])
    
    return model

In [69]:
# training
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=6, batch_size = 10)

Training -----------


NameError: name 'getRecall' is not defined

In [63]:
print('\nTesting ------------')
loss, accuracy = model.evaluate(x_test, y_test_class, batch_size=20)

print('test loss: ', loss)
print('test accuracy: ', accuracy)


Testing ------------
test loss:  2.155497154622216
test accuracy:  0.10662434185432851


##  >100, 5 classes

### Pick data

In [148]:
feature_dim = 207

# Set range of data
data = dataPick(dataset, 100, 1000000)

# Seperate x and y  from dataset
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [149]:
# Set bins of y
keys=[120,150,200,350]

# Generate tags of y
y_train_class = Amt2Cat(y_train, keys, 5)
y_test_class = Amt2Cat(y_test, keys, 5)

In [153]:
dataPick(data, 100, 120).shape[0]

608

### Build model and test

In [154]:
def setModel():
    model = Sequential()

    model.add(Dense(256, input_dim=feature_dim, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu')) 
    model.add(Dropout(0.2))
    
    model.add(Dense(5, activation='softmax')) 

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])
    
    return model

In [155]:
# training
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=5, batch_size = 10)

Training -----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7ef18fb0b8>

In [156]:
print('\nTesting ------------')
loss, accuracy = model.evaluate(x_test, y_test_class, batch_size=20)

print('test loss: ', loss)
print('test accuracy: ', accuracy)


Testing ------------
test loss:  1.5387168812215137
test accuracy:  0.28617363631533654


##  >100, 3 classes

### Pick data

In [159]:
feature_dim = 207

# Set range of data
data = dataPick(dataset, 100, 1000000)

# Seperate x and y  from dataset
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [160]:
# Set bins of y
keys=[140,250]

# Generate tags of y
y_train_class = Amt2Cat(y_train, keys, 3)
y_test_class = Amt2Cat(y_test, keys, 3)

In [161]:
dataPick(data, 100, 120).shape[0]

608

### Build model and test

In [162]:
def setModel():
    model = Sequential()

    model.add(Dense(256, input_dim=feature_dim, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu')) 
    model.add(Dropout(0.2))
    
    model.add(Dense(3, activation='softmax')) 

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])
    
    return model

In [163]:
# training
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=5, batch_size = 10)

Training -----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7ef1918c88>

In [164]:
print('\nTesting ------------')
loss, accuracy = model.evaluate(x_test, y_test_class, batch_size=20)

print('test loss: ', loss)
print('test accuracy: ', accuracy)


Testing ------------
test loss:  1.0484810937255908
test accuracy:  0.398713830295483


##  <100, 0 or not

### Pick data

In [22]:
feature_dim = 207

# Set range of data
data = dataPick(dataset, 0, 100)

# Seperate x and y  from dataset
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [23]:
data.shape

(155335, 208)

In [24]:
# Set bins of y
keys=[1]

# Generate tags of y
y_train_class = Amt2Cat(y_train, keys, 2)
y_test_class = Amt2Cat(y_test, keys, 2)
#y_train_class = biCat(y_train, 200)
#y_test_class = biCat(y_test, 200)

In [25]:
counter = [0,0]
c = 0
for i in y_train:
    y_class = 0
    for j in keys:
        if i<j:
            counter[y_class] = counter[y_class]+1
            break
        y_class = y_class+1
        if y_class == 1:
            counter[y_class] = counter[y_class]+1
            break
    c=c+1
print(counter)
print(c)

[83150, 56651]
139801


In [26]:
counter = [0,0]
c=0
for i in y_test:
    y_class = 0
    for j in keys:
        if i<j:
            counter[y_class] = counter[y_class]+1
            break
        y_class = y_class+1
        if y_class == 1:
            counter[y_class] = counter[y_class]+1
            break
    c=c+1
print(counter)
print(c)

[7997, 7537]
15534


### Build model and test

In [30]:
def setModel():
    model = Sequential()

    model.add(Dense(256, input_dim=feature_dim, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu')) 
    model.add(Dropout(0.2))
    
    model.add(Dense(512, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu')) 
    model.add(Dropout(0.2))
    
    model.add(Dense(2, activation='softmax')) 

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])
    
    return model

In [33]:
# training
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=5, batch_size = 100)

Training -----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fbccb819e48>

In [34]:
print('\nTesting ------------')
loss, accuracy = model.evaluate(x_test, y_test_class, batch_size=20)

print('test loss: ', loss)
print('test accuracy: ', accuracy)


Testing ------------
test loss:  0.6232094802366895
test accuracy:  0.6693060397276842


##  <100, 3 classes

### Pick data

In [36]:
feature_dim = 207

# Set range of data
data = dataPick(dataset, 0, 100)

# Seperate x and y  from dataset
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [37]:
data.shape

(155335, 208)

In [38]:
# Set bins of y
keys=[1,10]

# Generate tags of y
y_train_class = Amt2Cat(y_train, keys, 3)
y_test_class = Amt2Cat(y_test, keys, 3)
#y_train_class = biCat(y_train, 200)
#y_test_class = biCat(y_test, 200)

In [40]:
counter = [0,0,0]
c = 0
for i in y_train:
    y_class = 0
    for j in keys:
        if i<j:
            counter[y_class] = counter[y_class]+1
            break
        y_class = y_class+1
        if y_class == 2:
            counter[y_class] = counter[y_class]+1
            break
    c=c+1
print(counter)
print(c)

[83150, 37244, 19407]
139801


In [41]:
counter = [0,0,0]
c=0
for i in y_test:
    y_class = 0
    for j in keys:
        if i<j:
            counter[y_class] = counter[y_class]+1
            break
        y_class = y_class+1
        if y_class == 2:
            counter[y_class] = counter[y_class]+1
            break
    c=c+1
print(counter)
print(c)

[7997, 5325, 2212]
15534


In [45]:
counter = 0
for i in y_train_class:
    if i[1] == 1:
        counter = counter + 1
print(counter)

37244


### Build model and test

In [42]:
def setModel():
    model = Sequential()

    model.add(Dense(256, input_dim=feature_dim, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu')) 
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu')) 
    model.add(Dropout(0.2))
    
    model.add(Dense(3, activation='softmax')) 

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])
    
    return model

In [51]:
# training
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=2, batch_size = 20)

Training -----------
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7faef5f00c18>

In [52]:
print('\nTesting ------------')
loss, accuracy = model.evaluate(x_test, y_test_class, batch_size=20)

print('test loss: ', loss)
print('test accuracy: ', accuracy)


Testing ------------
test loss:  0.944731870511175
test accuracy:  0.5189906014133117


## PCA

In [None]:
feature_dim = 207

In [None]:
feature = dataset[:,0:207]

In [21]:
pca=PCA(n_components=feature_dim)
feature_pca=pca.fit_transform(feature)

In [22]:
dataset_pca = np.concatenate((feature_pca,dataset[:,207:208]),axis=1)
dataset_pca.shape

(158437, 208)

In [23]:
dataset_pca_non_zero = dataset_pca[dataset_pca[:,feature_dim]!=0,:]
dataset_pca_non_zero.shape

(67290, 208)