In [160]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
# NOTE(review): hard-coded, machine-specific working directory. The relative
# CSV paths read and written later in this notebook resolve against it, so the
# notebook will not run on another machine without editing this line.
os.chdir('/home/fractaluser/Downloads/omega')
from keras.layers import Input, Dense, Reshape, LSTM, RepeatVector
from keras.models import Model
from keras import callbacks
import datetime

In [161]:
# Load the raw transaction history; the path is relative to the current
# working directory.
# NOTE(review): the preview output shows an 'Unnamed: 0' column, which looks
# like a previously saved index -- confirm whether it should be dropped.
sample_data = pd.read_csv('sample_simulated_transaction_history.csv')

In [162]:
# Lift the column-count limit so wide frames are displayed without truncation.
pd.set_option('display.max_columns', None)

In [163]:
# Preview the first rows to sanity-check the parsed columns.
sample_data.head()

Unnamed: 0,id,date,purchaseamount,dept,category,chain,purchasequantity,company,brand,productsize,productmeasure
0,Donor_1,2015-03-04,203.174075,Sports,Sports,site_1,1,Company1,Brand3,x,xx
1,Donor_1,2016-04-17,300.639489,Sports,Sports,site_1,1,Company1,Brand1,x,xx
2,Donor_1,2015-04-06,374.096413,Women Apparel,Dresses/Jumpsuits,site_1,1,Company2,Brand1,x,xx
3,Donor_1,2015-09-04,348.347601,Women Apparel,Dresses/Jumpsuits,site_1,1,Company2,Brand3,x,xx
4,Donor_1,2016-03-08,287.477333,Women Apparel,Dresses/Jumpsuits,site_1,1,Company1,Brand3,x,xx


In [164]:
# (row count, column count) of the loaded frame.
sample_data.shape

(939512, 11)

In [165]:
# Parse the 'date' column into pandas datetimes so it can later be grouped on
# and aligned against a daily date range.
sample_data['date'] = pd.to_datetime(sample_data['date'])

In [166]:
def name_trunc(x):
    """Return ``x`` reduced to its first two underscore-separated fields.

    Anything after the second underscore is discarded; a string with fewer
    than two underscores is returned unchanged.

    Parameters
    ----------
    x : str
        Identifier such as ``"Donor_1_extra"``.

    Returns
    -------
    str
        The first two fields joined by ``"_"`` (``"Donor_1"`` for the
        example above).
    """
    first_field, separator, remainder = x.partition("_")
    if not separator:
        # No underscore at all: there is nothing to truncate.
        return first_field
    second_field = remainder.partition("_")[0]
    return first_field + "_" + second_field

In [167]:
# Normalise every donor id to its first two underscore-separated fields.
sample_data['id'] = sample_data['id'].map(name_trunc)

In [168]:
# Split the transactions by department. `df[k]` holds the id/date/amount rows
# of department `depts[k]`, so the two lists stay aligned index-for-index.
depts = list(sample_data.dept.unique())
df = [
    sample_data.loc[sample_data.dept == dept_name, ['id', 'date', 'purchaseamount']]
    .reset_index(drop=True)
    for dept_name in depts
]

In [169]:
# Single boolean: True when at least one cell anywhere in the frame is null.
sample_data.isnull().values.any()

False

In [172]:
def _build_donor_matrix(dataframe, date_index):
    """Pivot one department's transactions into a dense date x donor matrix.

    Each cell holds the largest ``purchaseamount`` the donor made on that day
    and 0 where the donor made no purchase. Columns keep the order in which
    donor ids first appear in ``dataframe``.
    """
    donor_ids = dataframe['id'].unique()
    daily_max = (
        dataframe
        .groupby(['date', 'id'])['purchaseamount']
        .max()
        .unstack('id')
    )
    # Reindexing adds the days without any purchase and restores the donor
    # order (groupby sorts the ids); every gap becomes a zero purchase.
    return daily_max.reindex(index=date_index, columns=donor_ids).fillna(0)


def obtain_encoding(dataframe, depts, sample_data, pre_end_date, pre_start_date, EPOCHS, BATCH_SIZE, DIM):
    """Train an LSTM sequence autoencoder on one department's donors.

    Every donor becomes one training sequence: their daily maximum purchase
    amount between ``pre_start_date`` and ``pre_end_date`` (both inclusive),
    with 0 on days without a purchase.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Transactions of a single department with columns ``id``, ``date``
        (datetime) and ``purchaseamount``.
    depts : str
        Label of the department; only used in the name of the saved loss plot.
    sample_data : pandas.DataFrame
        Full transaction table; its minimum and maximum ``date`` define the
        daily calendar every donor series is aligned to.
    pre_end_date, pre_start_date : str
        Bounds of the training window, used as label slices on the date index.
    EPOCHS : int
        Maximum number of training epochs (early stopping may end sooner).
    BATCH_SIZE : int
        Mini-batch size passed to ``fit``.
    DIM : int
        Number of units of the encoding LSTM, i.e. the encoding width.

    Returns
    -------
    (new_df, original, comparison_df) : tuple of pandas.DataFrame
        One row per donor, each with a leading ``id`` column:
        the encodings (negative values set to 0), the input series, and the
        autoencoder's reconstruction of the input (negative values set to 0).

    Raises
    ------
    ValueError
        If ``dataframe`` has no rows or the training window contains no days.

    Side effects
    ------------
    Saves the training-loss curve to ``performance-<depts>.png`` in the
    current working directory and displays it.
    """
    if dataframe.empty:
        raise ValueError("no transactions to encode for department %r" % (depts,))

    # The calendar is computed once; it spans the whole data set so that every
    # department shares the same date axis.
    date_index = pd.date_range(sample_data.date.min(), sample_data.date.max())
    # Uses the `dataframe` argument: reading the global `df[0]` here made every
    # department train on the first department's donors.
    complete_data = _build_donor_matrix(dataframe, date_index)
    donor_ids = complete_data.columns.values

    xtrain = complete_data.loc[pre_start_date:pre_end_date]
    if xtrain.empty:
        raise ValueError(
            "training window %s..%s contains no dates" % (pre_start_date, pre_end_date)
        )
    x_train = xtrain.T.values  # (donors, days)
    xtrain1 = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)  # (donors, days, 1)
    timesteps = xtrain1.shape[1]

    encoding_dim = DIM

    inputs = Input(shape=(timesteps, 1))
    encoded = LSTM(encoding_dim)(inputs)

    # The decoder sees the encoding repeated once per day and emits one value
    # per day.
    # NOTE(review): LSTM's default output activation is tanh, which bounds the
    # reconstruction to [-1, 1] while the amounts are not rescaled -- confirm
    # whether the inputs should be normalised first.
    decoded = RepeatVector(timesteps)(encoded)
    decoded = LSTM(1, return_sequences=True)(decoded)

    sequence_autoencoder = Model(inputs, decoded)
    encoder = Model(inputs, encoded)

    sequence_autoencoder.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

    history = sequence_autoencoder.fit(
        xtrain1, xtrain1,
        epochs=EPOCHS, verbose=1, batch_size=BATCH_SIZE, shuffle=True,
        callbacks=[callbacks.EarlyStopping(monitor='loss', patience=10, mode='auto')],
    )

    fig, ax = plt.subplots()
    ax.plot(history.history['loss'])
    ax.set_title('model loss')
    ax.set_ylabel('loss')
    ax.set_xlabel('epoch')
    ax.legend(['train'], loc='upper left')
    # Save before showing: once show() has run, savefig writes a blank image.
    fig.savefig('performance-%s.png' % (depts,))
    plt.show()
    plt.close(fig)  # this function runs once per department; release the figure

    encoded_mat = encoder.predict(xtrain1)
    new_df = pd.DataFrame(encoded_mat).clip(lower=0)
    new_df.insert(loc=0, column='id', value=donor_ids)

    original = pd.DataFrame(x_train)
    original.insert(loc=0, column='id', value=donor_ids)

    # Reconstruct with the trained autoencoder itself. A stand-alone decoder
    # built from only the last LSTM layer skips RepeatVector and expects a
    # different input shape, so it cannot consume the encodings.
    decoded_mat = sequence_autoencoder.predict(xtrain1)  # (donors, days, 1)
    comparison_df = pd.DataFrame(decoded_mat[:, :, 0]).clip(lower=0)
    comparison_df.insert(loc=0, column='id', value=donor_ids)

    return new_df, original, comparison_df

In [None]:
# Training window shared by every department.
pre_start_date = '2015-01-01'
pre_end_date = '2016-01-01'

# Per-department results, in the same order as `depts`.
encod, original, compare = [], [], []

for i, (dept_frame, dept_name) in enumerate(zip(df, depts)):
    encoded_df, original_df, comparison_df = obtain_encoding(
        dept_frame, dept_name, sample_data, pre_end_date, pre_start_date,
        EPOCHS=1000, BATCH_SIZE=500, DIM=50
    )
    encod.append(encoded_df)
    original.append(original_df)
    compare.append(comparison_df)

    # Persist each department's three tables as soon as they are available.
    encoded_df.to_csv('encoding_of_%s_customers.csv' % dept_name, index=False)
    original_df.to_csv('original_of_%s_customers.csv' % dept_name, index=False)
    comparison_df.to_csv('comparison_of_%s_customers.csv' % dept_name, index=False)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000