In [1]:
import warnings
# Suppress every Python warning from this point on (keeps cell output short,
# but also hides deprecation notices from the libraries imported below).
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
from sklearn.decomposition import PCA 
from sklearn.preprocessing import normalize
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout
from keras.utils import np_utils
from keras.optimizers import RMSprop


Using TensorFlow backend.


## Load the data

The dataset is the column-wise concatenation of `datax` (features) and `datay` (target y).

In [3]:
# Features and target are stored in two separate tab-separated files.
datax, datay = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in ('../feature.csv', 'output_y.csv')
)

In [4]:
# Put features and target side by side, then keep only rows without any NA.
pd_data = pd.concat([datax, datay], axis=1).dropna(axis=0, how='any')
# Remove bad data: rows whose discount_rate holds one of these marker strings.
pd_data = pd_data[~pd_data['discount_rate'].isin(['販売価格', 'ダミー'])]

### normalization

In [5]:
# Columns that get rescaled; every other column is passed through unchanged.
scaled_columns = ['deal_category_cd', 'shop_id', 'deal_type_cd', 'deal_price',
                  'discount_rate', 'month', 'duration_days']

# Per-column (axis=0) max-norm scaling of the selected columns.
data1 = DataFrame(normalize(pd_data[scaled_columns], axis=0, norm='max'))
# Remaining columns, re-wrapped so both frames share a fresh 0..n-1 index.
data2 = DataFrame(np.array(pd_data.drop(scaled_columns, axis=1)))

# Scaled columns first, untouched columns after them.
pd_data_normalized = pd.concat([data1, data2], axis=1)

# Plain float64 matrix used by all later cells.
dataset = np.array(pd_data_normalized).astype(np.float64)

### Pick data groups by range of y

In [6]:
# Rows whose target (column 207) is below each upper bound.
dataset_100, dataset_1000, dataset_2000 = (
    dataset[dataset[:, 207] < upper]
    for upper in (100, 1000, 2000)
)

In [7]:
def dataPick(oriData, lowLmt, upLmt, idx_y=207):
    """Select the rows of ``oriData`` whose target lies in ``[lowLmt, upLmt)``.

    Parameters
    ----------
    oriData : 2-D numpy array
        Data matrix to filter.
    lowLmt : number
        Inclusive lower bound on the target column.
    upLmt : number
        Exclusive upper bound on the target column.
    idx_y : int, optional
        Index of the target column (default 207).

    Returns
    -------
    2-D numpy array containing only the matching rows, in their original order.
    """
    # Filter the array that was passed in, not the notebook-level ``dataset``.
    target = oriData[:, idx_y]
    return oriData[(target >= lowLmt) & (target < upLmt), :]

In [8]:
# Same three subsets, now bounded below by 0 through dataPick.
dataset_100, dataset_1000, dataset_2000 = (
    dataPick(dataset, 0, upper)
    for upper in (100, 1000, 2000)
)

In [94]:
# Rows whose target (column 207 by default) falls in [0, 1).
z = dataPick(dataset,0,1)
z.shape

(91147, 208)

In [9]:
# (rows, columns) of the 0 <= y < 100 subset.
dataset_100.shape

(155335, 208)

In [10]:
# (rows, columns) of the 0 <= y < 1000 subset.
dataset_1000.shape

(158267, 208)

In [11]:
# (rows, columns) of the 0 <= y < 2000 subset.
dataset_2000.shape

(158351, 208)

In [12]:
# (rows, columns) of the full cleaned dataset, for comparison with the subsets.
dataset.shape

(158437, 208)

## Set data

- Set number of train data and test data
- Separate x and y from the dataset

In [13]:
# Number of feature columns; columns from this index onward are treated as y.
feature_dim = 207

In [14]:
def sepTrainX(dataset, percentage=0.1):
    """Split ``dataset`` row-wise into a leading and a trailing part.

    Parameters
    ----------
    dataset : sequence or numpy array
        Rows to split; order is preserved (no shuffling).
    percentage : float, optional
        Fraction of rows that goes into the first (training) part.
        The row count is truncated toward zero.

    Returns
    -------
    (train, test) : the first ``int(len(dataset) * percentage)`` rows and
    all remaining rows.
    """
    # The built-in int() truncates exactly like np.trunc(...).astype(np.int)
    # did, and does not depend on the np.int alias removed from recent NumPy.
    split = int(len(dataset) * percentage)
    return dataset[:split], dataset[split:]

In [15]:
def sepXY(dataset, feature_dim, percentage=0.1):
    """Split ``dataset`` into train/test parts and each part into x and y.

    The row split is delegated to ``sepTrainX``; within each part the first
    ``feature_dim`` columns are the features and the rest are the targets.

    Returns ``(x_train, y_train, x_test, y_test)``.
    """
    train_part, test_part = sepTrainX(dataset, percentage)
    (x_train, y_train), (x_test, y_test) = (
        (part[:, :feature_dim], part[:, feature_dim:])
        for part in (train_part, test_part)
    )
    return x_train, y_train, x_test, y_test

In [16]:
# Separate x and y (train and test) for each y-range subset.
(x_train_100, y_train_100,
 x_test_100, y_test_100) = sepXY(dataset_100, feature_dim)
(x_train_1000, y_train_1000,
 x_test_1000, y_test_1000) = sepXY(dataset_1000, feature_dim)
(x_train_2000, y_train_2000,
 x_test_2000, y_test_2000) = sepXY(dataset_2000, feature_dim)

## Set class tag

In [78]:
def Amt2Cat(amount, yMin, yRange, numClasses=10):
    """Bucket amounts into equal-width classes and one-hot encode them.

    Parameters
    ----------
    amount : numpy array
        Target values, expected to lie in ``[yMin, yMin + yRange)``.
    yMin : number
        Lower edge of the first class.
    yRange : number
        Total width covered by all classes.
    numClasses : int, optional
        Number of equal-width classes (default 10).

    Returns
    -------
    The result of ``np_utils.to_categorical`` for the integer class indices,
    with ``numClasses`` categories.
    """
    # Width of one class; it must follow numClasses so that indices stay in
    # 0..numClasses-1 for any class count, not only the default of 10.
    bin_width = yRange / numClasses
    amount_class = np.trunc((amount - yMin) / bin_width).astype(int)
    return np_utils.to_categorical(amount_class, num_classes=numClasses)

In [79]:
# One-hot class labels for the training targets of each subset.
y_train_class_100 = Amt2Cat(y_train_100, yMin=0, yRange=100)
y_train_class_1000 = Amt2Cat(y_train_1000, yMin=0, yRange=1000)
y_train_class_2000 = Amt2Cat(y_train_2000, yMin=0, yRange=2000)

# Matching labels for the test targets.
y_test_class_100 = Amt2Cat(y_test_100, yMin=0, yRange=100)
y_test_class_1000 = Amt2Cat(y_test_1000, yMin=0, yRange=1000)
y_test_class_2000 = Amt2Cat(y_test_2000, yMin=0, yRange=2000)

## Build model

In [87]:
def setModel(input_dim=None, num_classes=10):
    """Build and compile the fully connected classifier used in this notebook.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the notebook-level
        ``feature_dim`` is used, which is what the original zero-argument
        call did.
    num_classes : int, optional
        Number of softmax output units (default 10).

    Returns
    -------
    A compiled ``Sequential`` model: 256 -> 512 -> 512 ReLU layers, each
    followed by 20% dropout, then a softmax output layer; trained with
    RMSprop on categorical cross-entropy and reporting accuracy.
    """
    if input_dim is None:
        input_dim = feature_dim

    model = Sequential()

    model.add(Dense(256, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))

    # One output unit per class; must match the width of the one-hot labels.
    model.add(Dense(num_classes, activation='softmax'))

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])

    return model

## Training and testing

### dataset y<100

In [88]:
# Train a fresh network on the y < 100 subset.
print('Training -----------')
model = setModel()
model.fit(
    x=x_train_100,
    y=y_train_class_100,
    batch_size=20,
    epochs=2,
)

Training -----------
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f12e115f898>

In [89]:
# Score the network trained above on the held-out y < 100 rows.
print('\nTesting ------------')
loss, accuracy = model.evaluate(
    x=x_test_100,
    y=y_test_class_100,
    batch_size=20,
)
print('test loss: ', loss)
print('test accuracy: ', accuracy)


Testing ------------
test loss:  2.4922738953418886
test accuracy:  0.8453741715410776


### dataset y<1000

In [50]:
# Train a fresh network on the y < 1000 subset.
model = setModel()
print('Training -----------')
model.fit(
    x=x_train_1000,
    y=y_train_class_1000,
    batch_size=20,
    epochs=2,
)

Training -----------
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f1346487550>

In [51]:
# Score the network trained above on the held-out y < 1000 rows.
print('\nTesting ------------')
loss, accuracy = model.evaluate(
    x=x_test_1000,
    y=y_test_class_1000,
    batch_size=20,
)
print('test loss: ', loss)
print('test accuracy: ', accuracy)


Testing ------------
test loss:  0.3316611721127818
test accuracy:  0.9794230555958445


### dataset y<2000

In [52]:
# Train a fresh network on the y < 2000 subset.
model = setModel()
print('Training -----------')
model.fit(
    x=x_train_2000,
    y=y_train_class_2000,
    batch_size=20,
    epochs=2,
)

Training -----------
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f13454b3b70>

In [53]:
# Score the network trained in the previous cell on the held-out y < 2000 rows.
print('\nTesting ------------')
# The y < 2000 network is bound to `model`; no `model_2000` is ever created
# in this notebook, so evaluating that name fails on a fresh kernel.
loss, accuracy = model.evaluate(x_test_2000, y_test_class_2000, batch_size=20)

print('test loss: ', loss)
print('test accuracy: ', accuracy)


Testing ------------
test loss:  0.12972206600509553
test accuracy:  0.9919517790425714


## Within y < 2000: 10 classifiers, one per 200-wide range of y

In [84]:
# Train and evaluate one classifier per 200-wide slice of y in [0, 2000).
for i in range(0, 10):
    low, high = i*200, (i+1)*200
    dataset_temp = dataPick(dataset, low, high)
    x_train, y_train, x_test, y_test = sepXY(dataset_temp, feature_dim)
    print('range of y = (', low, ',', high, ')\n')

    # sepXY puts only the first 10% of the rows into the training part, so a
    # slice with fewer than 10 rows has nothing to train on. Skip such slices
    # so one sparse range does not abort the remaining ones.
    if len(x_train) == 0 or len(x_test) == 0:
        print('skipped: too few rows in this range (', len(dataset_temp), ')')
        print('\n')
        continue

    # Classes are 10 equal sub-ranges of the current slice.
    y_train_class = Amt2Cat(y_train, low, 200)
    y_test_class = Amt2Cat(y_test, low, 200)

    model = setModel()

    print('Training -----------')
    model.fit(x_train, y_train_class, epochs=2, batch_size = 20)

    print('Testing ------------')
    loss, accuracy = model.evaluate(x_test, y_test_class, batch_size=20)
    print('test loss: ', loss)
    print('test accuracy: ', accuracy)
    print('\n')

range of y = ( 0 , 200 )
Training -----------
Epoch 1/2
Epoch 2/2

Testing ------------
test loss:  1.5944591833298587
test accuracy:  0.9010764525638925


range of y = ( 200 , 400 )
Training -----------
Epoch 1/2
Epoch 2/2

Testing ------------
test loss:  2.236192620939943
test accuracy:  0.18544936050447008


range of y = ( 400 , 600 )
Training -----------
Epoch 1/2
Epoch 2/2

Testing ------------
test loss:  2.354894205375954
test accuracy:  0.11728395227296853


range of y = ( 600 , 800 )
Training -----------
Epoch 1/2
Epoch 2/2

Testing ------------
test loss:  2.927478748654562
test accuracy:  0.14285714640503838


range of y = ( 800 , 1000 )
Training -----------
Epoch 1/2
Epoch 2/2

Testing ------------
test loss:  2.4728389627793255
test accuracy:  0.23529411852359772


range of y = ( 1000 , 1200 )
Training -----------
Epoch 1/2
Epoch 2/2

Testing ------------
test loss:  2.786308277220953
test accuracy:  0.23809523809523808


range of y = ( 1200 , 1400 )
Training -----------


## PCA

In [19]:
# Number of feature columns (same value as set earlier in the notebook);
# also used below as the number of PCA components.
feature_dim = 207

In [20]:
# Feature matrix only: every column before the target column.
feature = dataset[:, :feature_dim]

In [21]:
# Fit PCA asking for as many components as there are feature columns, and
# project the features onto those components in the same step.
pca=PCA(n_components=feature_dim)
feature_pca=pca.fit_transform(feature)

In [22]:
# Re-attach the target column (index 207) to the right of the PCA features.
target_column = dataset[:, 207:208]
dataset_pca = np.hstack((feature_pca, target_column))
dataset_pca.shape

(158437, 208)

In [23]:
# Keep only the rows whose target value is not exactly zero.
has_nonzero_target = dataset_pca[:, feature_dim] != 0
dataset_pca_non_zero = dataset_pca[has_nonzero_target]
dataset_pca_non_zero.shape

(67290, 208)