In [1]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the imported
# libraries; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
from sklearn.decomposition import PCA 
from sklearn.preprocessing import normalize
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout
from keras.utils import np_utils
from keras.optimizers import RMSprop


Using TensorFlow backend.


## Load the data

The dataset is the column-wise concatenation `[datax, datay]`: the columns of `datax` followed by the columns of `datay`.

In [3]:
# Read the two tab-separated input tables.
# NOTE(review): the files are read from different directories ('../' vs. the
# working directory) — confirm this is intentional.
datax = pd.read_csv('../feature.csv', sep='\t')
datay = pd.read_csv('output_y.csv', sep='\t')

In [4]:
# Place datax and datay side by side, then discard every row with a missing value
pd_data = pd.concat([datax, datay], axis=1).dropna(axis=0, how='any')
# Discard rows whose discount_rate holds one of the known text placeholders
bad_discount = pd_data['discount_rate'].isin(['販売価格', 'ダミー'])
pd_data = pd_data[~bad_discount]

### normalization

In [5]:
# Columns that get max-normalized per column (axis=0); all other columns are kept as they are
scaled_cols = ['deal_category_cd', 'shop_id', 'deal_type_cd', 'deal_price',
               'discount_rate', 'month', 'duration_days']
data1 = DataFrame(normalize(pd_data[scaled_cols], axis=0, norm='max'))
data2 = DataFrame(np.array(pd_data.drop(scaled_cols, axis=1)))

# Normalized columns come first, followed by the untouched ones
pd_data_normalized = pd.concat([data1, data2], axis=1)

# Plain float64 matrix used by everything downstream
dataset = np.array(pd_data_normalized).astype(np.float64)

### Pick a data group by the range of y

In [6]:
def dataPick(oriData, lowLmt, upLmt, idx_y=207):
    """Select the rows of ``oriData`` whose target lies in ``[lowLmt, upLmt)``.

    Parameters
    ----------
    oriData : 2-D numpy array
        Data matrix; column ``idx_y`` is the value that is range-checked.
    lowLmt : number
        Inclusive lower bound.
    upLmt : number
        Exclusive upper bound.
    idx_y : int, default 207
        Index of the column to test.

    Returns
    -------
    numpy array containing the matching rows, in their original order.
    """
    target = oriData[:, idx_y]
    in_range = (target >= lowLmt) & (target < upLmt)
    return oriData[in_range, :]

In [7]:
# Number of rows whose target (column 207, dataPick's default) lies in [350, 100000)
dataPick(dataset, 350, 100000).shape[0]

556

# Set dataset

In [8]:
def sepTrainX(dataset, percentage=0.9):
    """Split ``dataset`` into a leading train part and a trailing test part.

    The split is positional (no shuffling): the first
    ``int(len(dataset) * percentage)`` rows become the training set and the
    remaining rows the test set.

    Parameters
    ----------
    dataset : sequence or numpy array
        Data to split along its first axis.
    percentage : float, default 0.9
        Fraction of rows assigned to the training part.

    Returns
    -------
    (train, test) : tuple of slices of ``dataset``
    """
    # Built-in int() truncates toward zero like np.trunc(...).astype(int) did;
    # the removed alias np.int is no longer needed.
    split_at = int(len(dataset) * percentage)
    return dataset[:split_at], dataset[split_at:]

In [9]:
def sepXY(dataset, feature_dim, percentage=0.9):
    """Split ``dataset`` into train/test parts and into features/targets.

    Rows are split positionally (no shuffling) at
    ``int(len(dataset) * percentage)``; columns are split at ``feature_dim``:
    the first ``feature_dim`` columns are features, the rest are targets.

    Parameters
    ----------
    dataset : 2-D numpy array
    feature_dim : int
        Number of leading feature columns.
    percentage : float, default 0.9
        Fraction of rows assigned to the training part.

    Returns
    -------
    x_train, y_train, x_test, y_test : numpy arrays
    """
    # Built-in int() replaces the removed np.int alias; truncation is unchanged.
    split_at = int(len(dataset) * percentage)
    traindata, testdata = dataset[:split_at], dataset[split_at:]
    x_train, y_train = traindata[:, :feature_dim], traindata[:, feature_dim:]
    x_test, y_test = testdata[:, :feature_dim], testdata[:, feature_dim:]
    return x_train, y_train, x_test, y_test

## Set class tag

In [10]:
def biCat(amount, key):
    """One-hot encode ``amount`` into two classes split at ``key``.

    Class 0 is assigned to values strictly below ``key``; every other value
    (including NaN, for which the comparison is false) gets class 1.

    Parameters
    ----------
    amount : array-like of numbers
        Target values; flattened before binning.
    key : number
        Threshold between class 0 and class 1.

    Returns
    -------
    Array returned by ``np_utils.to_categorical`` with ``num_classes=2``.
    """
    values = np.asarray(amount).reshape(-1)
    # Vectorized replacement for the element-wise np.append loop; it also
    # works for empty input and no longer needs the removed np.int alias.
    labels = np.where(values < key, 0, 1).reshape(-1, 1)
    return np_utils.to_categorical(labels, num_classes=2)

In [11]:
def Amt2Cat(amount, keys, numClasses):
    """One-hot encode ``amount`` into ``numClasses`` bins delimited by ``keys``.

    A value gets the index of the first edge in ``keys`` that is strictly
    greater than it. Only the first ``numClasses - 1`` edges are consulted;
    a value that is not below any of them falls into the class right after
    the last consulted edge.

    Unlike the previous loop, every sample always receives a label (values
    above all edges used to be dropped when ``keys`` had fewer than
    ``numClasses - 1`` entries, which misaligned labels and features), and
    empty input is handled.

    Parameters
    ----------
    amount : array-like of numbers
        Target values; flattened before binning.
    keys : sequence of numbers
        Bin edges, checked in the given order.
    numClasses : int
        Width of the one-hot encoding.

    Returns
    -------
    Array returned by ``np_utils.to_categorical`` with
    ``num_classes=numClasses``.
    """
    values = np.asarray(amount, dtype=np.float64).reshape(-1)
    edges = np.asarray(keys, dtype=np.float64).reshape(-1)[:max(numClasses - 1, 0)]
    if edges.size:
        # below[s, k] is True when sample s is under edge k; argmax picks the
        # first such edge, and rows without any True get the overflow class.
        below = values[:, None] < edges[None, :]
        labels = np.where(below.any(axis=1), below.argmax(axis=1), edges.size)
    else:
        # No usable edges: everything belongs to the single class 0.
        labels = np.zeros(values.shape[0], dtype=int)
    return np_utils.to_categorical(labels.astype(int), num_classes=numClasses)

# Training and testing

##  >100, 10 classes
### Pick data

In [132]:
# Number of leading feature columns; the remaining column(s) hold the target
feature_dim = 207

# Keep only rows whose target (column 207) lies in [100, 1000000)
data = dataPick(dataset, 100, 1000000)

# Separate x and y from the dataset; the first 90% of rows (sepXY's default)
# become the training split and the rest the test split, without shuffling
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [133]:
# Bin edges of y: 9 edges delimit 10 classes
keys=[110,120,130,140,160,190,250,350,500]

# Generate one-hot class tags of y for both splits
y_train_class = Amt2Cat(y_train, keys, 10)
y_test_class = Amt2Cat(y_test, keys, 10)
# Alternative kept for reference: two-class tags split at 200
#y_train_class = biCat(y_train, 200)
#y_test_class = biCat(y_test, 200)

In [135]:
# Histogram of the training targets over the classes defined by `keys`:
# a value's class is the index of the first edge that exceeds it, and values
# not below any edge land in the last class. The number of classes is derived
# from `keys` rather than hard-coded.
counter = [0] * (len(keys) + 1)
c = 0  # number of samples visited
for amount in np.ravel(y_train):
    y_class = next((k for k, edge in enumerate(keys) if amount < edge), len(keys))
    counter[y_class] += 1
    c += 1
print(counter)
print(c)

[309, 247, 250, 206, 272, 342, 343, 340, 191, 291]
2791


In [136]:
# Histogram of the test targets over the classes defined by `keys`:
# a value's class is the index of the first edge that exceeds it, and values
# not below any edge land in the last class. The number of classes is derived
# from `keys` rather than hard-coded.
counter = [0] * (len(keys) + 1)
c = 0  # number of samples visited
for amount in np.ravel(y_test):
    y_class = next((k for k, edge in enumerate(keys) if amount < edge), len(keys))
    counter[y_class] += 1
    c += 1
print(counter)
print(c)

[31, 21, 16, 12, 37, 43, 41, 36, 25, 49]
311


In [158]:
# Number of rows in the full dataset whose target lies in [140, 250)
dataPick(dataset, 140, 250).shape[0]

1078

### Build model and test

In [30]:
def setModel(num_classes=10, input_dim=None):
    """Build and compile the fully connected classifier.

    Architecture: Dense(256) -> Dense(512) -> Dense(256), each with ReLU and
    followed by Dropout(0.2), then a softmax layer of ``num_classes`` units.
    Compiled with RMSprop (lr=0.001) and categorical cross-entropy, reporting
    accuracy.

    Parameters
    ----------
    num_classes : int, default 10
        Width of the softmax output layer.
    input_dim : int or None, default None
        Number of input features; ``None`` falls back to the notebook-level
        ``feature_dim`` so that ``setModel()`` behaves as before.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = feature_dim

    model = Sequential()

    model.add(Dense(256, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(num_classes, activation='softmax'))

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])

    return model

In [49]:
# Training: build a fresh model and fit it on the training split.
# No validation data is passed to fit().
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=5, batch_size = 10)

Training -----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7f02022588>

In [50]:
# Evaluate the fitted model on the held-out test split; evaluate() returns
# the loss followed by the 'accuracy' metric configured at compile time
print('\nTesting ------------')
loss, accuracy = model.evaluate(x_test, y_test_class, batch_size=20)

print('test loss: ', loss)
print('test accuracy: ', accuracy)


Testing ------------
test loss:  2.1832537459407204
test accuracy:  0.22508038858317103


##  >100, 5 classes

### Pick data

In [148]:
# Number of leading feature columns; the remaining column(s) hold the target
feature_dim = 207

# Keep only rows whose target (column 207) lies in [100, 1000000)
data = dataPick(dataset, 100, 1000000)

# Separate x and y from the dataset; the first 90% of rows (sepXY's default)
# become the training split and the rest the test split, without shuffling
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [149]:
# Bin edges of y: 4 edges delimit 5 classes
keys=[120,150,200,350]

# Generate one-hot class tags of y for both splits
y_train_class = Amt2Cat(y_train, keys, 5)
y_test_class = Amt2Cat(y_test, keys, 5)

In [153]:
# Number of selected rows (train and test together) whose target lies in
# [100, 120), i.e. below the first bin edge keys[0] == 120
dataPick(data, 100, 120).shape[0]

608

### Build model and test

In [154]:
def setModel(num_classes=5, input_dim=None):
    """Build and compile the fully connected classifier.

    Architecture: Dense(256) -> Dense(512) -> Dense(256), each with ReLU and
    followed by Dropout(0.2), then a softmax layer of ``num_classes`` units.
    Compiled with RMSprop (lr=0.001) and categorical cross-entropy, reporting
    accuracy.

    Parameters
    ----------
    num_classes : int, default 5
        Width of the softmax output layer.
    input_dim : int or None, default None
        Number of input features; ``None`` falls back to the notebook-level
        ``feature_dim`` so that ``setModel()`` behaves as before.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = feature_dim

    model = Sequential()

    model.add(Dense(256, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(num_classes, activation='softmax'))

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])

    return model

In [155]:
# Training: build a fresh model and fit it on the training split.
# No validation data is passed to fit().
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=5, batch_size = 10)

Training -----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7ef18fb0b8>

In [156]:
# Evaluate the fitted model on the held-out test split; evaluate() returns
# the loss followed by the 'accuracy' metric configured at compile time
print('\nTesting ------------')
loss, accuracy = model.evaluate(x_test, y_test_class, batch_size=20)

print('test loss: ', loss)
print('test accuracy: ', accuracy)


Testing ------------
test loss:  1.5387168812215137
test accuracy:  0.28617363631533654


##  >100, 3 classes

### Pick data

In [159]:
# Number of leading feature columns; the remaining column(s) hold the target
feature_dim = 207

# Keep only rows whose target (column 207) lies in [100, 1000000)
data = dataPick(dataset, 100, 1000000)

# Separate x and y from the dataset; the first 90% of rows (sepXY's default)
# become the training split and the rest the test split, without shuffling
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [160]:
# Bin edges of y: 2 edges delimit 3 classes
keys=[140,250]

# Generate one-hot class tags of y for both splits
y_train_class = Amt2Cat(y_train, keys, 3)
y_test_class = Amt2Cat(y_test, keys, 3)

In [161]:
# Number of selected rows whose target lies in [100, 120).
# NOTE(review): 120 is not a bin edge for keys=[140,250]; this range looks
# carried over from the 5-class section — confirm which range was intended.
dataPick(data, 100, 120).shape[0]

608

### Build model and test

In [162]:
def setModel(num_classes=3, input_dim=None):
    """Build and compile the fully connected classifier.

    Architecture: Dense(256) -> Dense(512) -> Dense(256), each with ReLU and
    followed by Dropout(0.2), then a softmax layer of ``num_classes`` units.
    Compiled with RMSprop (lr=0.001) and categorical cross-entropy, reporting
    accuracy.

    Parameters
    ----------
    num_classes : int, default 3
        Width of the softmax output layer.
    input_dim : int or None, default None
        Number of input features; ``None`` falls back to the notebook-level
        ``feature_dim`` so that ``setModel()`` behaves as before.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = feature_dim

    model = Sequential()

    model.add(Dense(256, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(num_classes, activation='softmax'))

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])

    return model

In [163]:
# Training: build a fresh model and fit it on the training split.
# No validation data is passed to fit().
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=5, batch_size = 10)

Training -----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7ef1918c88>

In [164]:
# Evaluate the fitted model on the held-out test split; evaluate() returns
# the loss followed by the 'accuracy' metric configured at compile time
print('\nTesting ------------')
loss, accuracy = model.evaluate(x_test, y_test_class, batch_size=20)

print('test loss: ', loss)
print('test accuracy: ', accuracy)


Testing ------------
test loss:  1.0484810937255908
test accuracy:  0.398713830295483


##  <100, 0 or not

### Pick data

In [12]:
# Number of leading feature columns; the remaining column(s) hold the target
feature_dim = 207

# Keep only rows whose target (column 207) lies in [0, 100)
data = dataPick(dataset, 0, 100)

# Separate x and y from the dataset; the first 90% of rows (sepXY's default)
# become the training split and the rest the test split, without shuffling
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [14]:
# (rows, columns) of the selected subset
data.shape

(155335, 208)

In [15]:
# Bin edge of y: a single edge at 1 gives two classes, y < 1 and y >= 1
keys=[1]

# Generate one-hot class tags of y for both splits
y_train_class = Amt2Cat(y_train, keys, 2)
y_test_class = Amt2Cat(y_test, keys, 2)
# Alternative kept for reference: two-class tags split at 200
#y_train_class = biCat(y_train, 200)
#y_test_class = biCat(y_test, 200)

In [29]:
# Histogram of the training targets over the classes defined by `keys`:
# a value's class is the index of the first edge that exceeds it, and values
# not below any edge land in the last class. The number of classes is derived
# from `keys` rather than hard-coded.
counter = [0] * (len(keys) + 1)
c = 0  # number of samples visited
for amount in np.ravel(y_train):
    y_class = next((k for k, edge in enumerate(keys) if amount < edge), len(keys))
    counter[y_class] += 1
    c += 1
print(counter)
print(c)

[83150, 56651]
139801


In [30]:
# Histogram of the test targets over the classes defined by `keys`:
# a value's class is the index of the first edge that exceeds it, and values
# not below any edge land in the last class. The number of classes is derived
# from `keys` rather than hard-coded.
counter = [0] * (len(keys) + 1)
c = 0  # number of samples visited
for amount in np.ravel(y_test):
    y_class = next((k for k, edge in enumerate(keys) if amount < edge), len(keys))
    counter[y_class] += 1
    c += 1
print(counter)
print(c)

[7997, 7537]
15534


### Build model and test

In [31]:
def setModel(num_classes=2, input_dim=None):
    """Build and compile the fully connected classifier.

    Architecture: Dense(256) -> Dense(512) -> Dense(256), each with ReLU and
    followed by Dropout(0.2), then a softmax layer of ``num_classes`` units.
    Compiled with RMSprop (lr=0.001) and categorical cross-entropy, reporting
    accuracy.

    Parameters
    ----------
    num_classes : int, default 2
        Width of the softmax output layer.
    input_dim : int or None, default None
        Number of input features; ``None`` falls back to the notebook-level
        ``feature_dim`` so that ``setModel()`` behaves as before.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = feature_dim

    model = Sequential()

    model.add(Dense(256, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(num_classes, activation='softmax'))

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])

    return model

In [34]:
# Training: build a fresh model and fit it on the training split.
# No validation data is passed to fit().
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=3, batch_size = 20)

Training -----------
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7faef5e46b70>

In [35]:
# Evaluate the fitted model on the held-out test split; evaluate() returns
# the loss followed by the 'accuracy' metric configured at compile time
print('\nTesting ------------')
loss, accuracy = model.evaluate(x_test, y_test_class, batch_size=20)

print('test loss: ', loss)
print('test accuracy: ', accuracy)


Testing ------------
test loss:  0.685622415554152
test accuracy:  0.6776104031951559


##  <100, 3 classes

### Pick data

In [36]:
# Number of leading feature columns; the remaining column(s) hold the target
feature_dim = 207

# Keep only rows whose target (column 207) lies in [0, 100)
data = dataPick(dataset, 0, 100)

# Separate x and y from the dataset; the first 90% of rows (sepXY's default)
# become the training split and the rest the test split, without shuffling
x_train, y_train, x_test, y_test = sepXY(data, feature_dim)

In [37]:
# (rows, columns) of the selected subset
data.shape

(155335, 208)

In [38]:
# Bin edges of y: edges at 1 and 10 give three classes
keys=[1,10]

# Generate one-hot class tags of y for both splits
y_train_class = Amt2Cat(y_train, keys, 3)
y_test_class = Amt2Cat(y_test, keys, 3)
# Alternative kept for reference: two-class tags split at 200
#y_train_class = biCat(y_train, 200)
#y_test_class = biCat(y_test, 200)

In [40]:
# Histogram of the training targets over the classes defined by `keys`:
# a value's class is the index of the first edge that exceeds it, and values
# not below any edge land in the last class. The number of classes is derived
# from `keys` rather than hard-coded.
counter = [0] * (len(keys) + 1)
c = 0  # number of samples visited
for amount in np.ravel(y_train):
    y_class = next((k for k, edge in enumerate(keys) if amount < edge), len(keys))
    counter[y_class] += 1
    c += 1
print(counter)
print(c)

[83150, 37244, 19407]
139801


In [41]:
# Histogram of the test targets over the classes defined by `keys`:
# a value's class is the index of the first edge that exceeds it, and values
# not below any edge land in the last class. The number of classes is derived
# from `keys` rather than hard-coded.
counter = [0] * (len(keys) + 1)
c = 0  # number of samples visited
for amount in np.ravel(y_test):
    y_class = next((k for k, edge in enumerate(keys) if amount < edge), len(keys))
    counter[y_class] += 1
    c += 1
print(counter)
print(c)

[7997, 5325, 2212]
15534


In [45]:
# Sanity check: count the one-hot rows of y_train_class whose class-1 slot is set
counter = sum(1 for one_hot in y_train_class if one_hot[1] == 1)
print(counter)

37244


### Build model and test

In [42]:
def setModel(num_classes=3, input_dim=None):
    """Build and compile the fully connected classifier.

    Architecture: Dense(256) -> Dense(512) -> Dense(256), each with ReLU and
    followed by Dropout(0.2), then a softmax layer of ``num_classes`` units.
    Compiled with RMSprop (lr=0.001) and categorical cross-entropy, reporting
    accuracy.

    Parameters
    ----------
    num_classes : int, default 3
        Width of the softmax output layer.
    input_dim : int or None, default None
        Number of input features; ``None`` falls back to the notebook-level
        ``feature_dim`` so that ``setModel()`` behaves as before.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = feature_dim

    model = Sequential()

    model.add(Dense(256, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(num_classes, activation='softmax'))

    # Define your optimizer
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])

    return model

In [51]:
# Training: build a fresh model and fit it on the training split.
# No validation data is passed to fit().
print('Training -----------')
model = setModel()
model.fit(x_train, y_train_class, epochs=2, batch_size = 20)

Training -----------
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7faef5f00c18>

In [52]:
# Evaluate the fitted model on the held-out test split; evaluate() returns
# the loss followed by the 'accuracy' metric configured at compile time
print('\nTesting ------------')
loss, accuracy = model.evaluate(x_test, y_test_class, batch_size=20)

print('test loss: ', loss)
print('test accuracy: ', accuracy)


Testing ------------
test loss:  0.944731870511175
test accuracy:  0.5189906014133117


## PCA

In [None]:
# Number of leading feature columns in `dataset`; also used below as the
# number of PCA components
feature_dim = 207

In [None]:
# Feature block of the dataset: the first feature_dim columns (the target is
# excluded). Uses feature_dim instead of repeating the literal 207.
feature = dataset[:, 0:feature_dim]

In [21]:
# Fit PCA on the feature block and project it.
# NOTE(review): n_components equals the number of feature columns, so no
# dimension is dropped here — confirm whether a smaller value was intended.
pca=PCA(n_components=feature_dim)
feature_pca=pca.fit_transform(feature)

In [22]:
# Re-attach the target column (the one right after the features) to the
# PCA-projected features. Uses feature_dim instead of the literals 207/208.
dataset_pca = np.concatenate((feature_pca, dataset[:, feature_dim:feature_dim + 1]), axis=1)
dataset_pca.shape

(158437, 208)

In [23]:
# Keep only rows whose target (column feature_dim of dataset_pca) is non-zero
dataset_pca_non_zero = dataset_pca[dataset_pca[:,feature_dim]!=0,:]
dataset_pca_non_zero.shape

(67290, 208)