In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from time import time

In [5]:
def ChangeDateFormat(date):
    month = date // 100
    day = date % 100
    date = '2020-' + str(month) + '-' + str(day)
    return pd.Timestamp(date)

In [3]:
# code for importing the original data.
info = pd.read_csv('/work/ma384/Share/Team_Repeat_Buyer/user_info_format1.csv')
log = pd.read_csv('/work/ma384/Share/Team_Repeat_Buyer/user_log_format1.csv')
df_train = pd.read_csv('/work/ma384/Share/Team_Repeat_Buyer/train_format1.csv')

In [4]:
info['age_range'] = info['age_range'].fillna(0)
info['gender'] = info['gender'].fillna(2)
info['age_range'] = info['age_range'].astype('category')
info['gender'] = info['gender'].astype('category')
info['age_range'] = info['age_range'].replace(8, 7)

In [6]:
log['time_stamp'] = log['time_stamp'].apply(lambda x: ChangeDateFormat(x))

### Seller infor

In [7]:
seller_item_count = log[['seller_id','item_id']]
seller_item_count = seller_item_count.drop_duplicates()
seller_item_count['seller_item_count'] = 1
seller_item_count = seller_item_count.groupby(['seller_id']).agg('sum')
seller_item_count = seller_item_count.drop(columns=['item_id'])
seller_item_count.head()

Unnamed: 0_level_0,seller_item_count
seller_id,Unnamed: 1_level_1
1,2977
2,154
3,171
4,155
5,660


In [8]:
seller_brand_count = log[['seller_id','brand_id']]
seller_brand_count = seller_brand_count.drop_duplicates()
seller_brand_count = seller_brand_count.groupby(['seller_id']).agg(len)
seller_brand_count.rename(columns = {"brand_id": "brand_count"},inplace=True) 
seller_brand_count.head()

Unnamed: 0_level_0,brand_count
seller_id,Unnamed: 1_level_1
1,3.0
2,2.0
3,2.0
4,3.0
5,2.0


In [9]:
item_info = log['item_id'].value_counts()
cat_info = log['cat_id'].value_counts()
brand_info = log['brand_id'].value_counts()
seller_info = log['seller_id'].value_counts()
print(item_info.head(), item_info.size)
print(cat_info.head(),cat_info.size)
print(brand_info.head(),brand_info.size)
print(seller_info.head(),seller_info.size)

67897     345905
783997    178005
636863     82480
631714     42771
61518      34801
Name: item_id, dtype: int64 1090390
662     4339025
737     1749753
1505    1724239
389     1621736
656     1538969
Name: cat_id, dtype: int64 1658
3738.0    763345
1360.0    737545
1446.0    729555
1214.0    541075
5376.0    528003
Name: brand_id, dtype: int64 8443
3760    743217
3828    730455
184     613811
1102    541374
4173    528755
Name: seller_id, dtype: int64 4995


### User action

In [10]:
actions = log.action_type
actions = pd.get_dummies(actions,prefix='action_count')
user_action = pd.concat([log.user_id,actions],axis=1)
user_action = user_action.groupby('user_id').agg('sum')
user_action.head()

Unnamed: 0_level_0,action_count_0,action_count_1,action_count_2,action_count_3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,27.0,0.0,6.0,0.0
2,47.0,0.0,14.0,2.0
3,63.0,0.0,4.0,1.0
4,49.0,0.0,1.0,0.0
5,150.0,0.0,13.0,10.0


### Merge

In [14]:
df_Train = pd.read_csv('/work/ma384/Share/Team_Repeat_Buyer/train_format1.csv')

In [15]:
seller = seller_item_count.merge(seller_brand_count,on='seller_id')
df_train = df_Train.merge(seller,left_on='merchant_id',right_on='seller_id')
df_train = df_train.merge(user_action,on='user_id')
log_dummy = pd.get_dummies(log['action_type'],prefix='action')
log_dummy = pd.concat([log.user_id,log.seller_id,actions],axis=1)
log_dummy.columns = ['user_id', 'merchant_id', 'action_0', 'action_1',
       'action_2', 'action_3']
log_dummy = log_dummy.groupby(['user_id','merchant_id']).agg('sum')
df_train = df_train.merge(log_dummy,on=['merchant_id','user_id'])

In [17]:
df_train.head(3)

Unnamed: 0,user_id,merchant_id,label,seller_item_count,brand_count,action_count_0,action_count_1,action_count_2,action_count_3,action_0,action_1,action_2,action_3
0,34176,3906,0,308,2.0,410.0,0.0,34.0,7.0,36.0,0.0,1.0,2.0
1,34176,121,0,1179,2.0,410.0,0.0,34.0,7.0,13.0,0.0,1.0,0.0
2,34176,4356,1,67,2.0,410.0,0.0,34.0,7.0,12.0,0.0,6.0,0.0


In [58]:
train, test = train_test_split(df_train,test_size=0.2,random_state=0,shuffle=True,stratify=df_train.label)

In [59]:
targets_train = train.label.values
targets_test = test.label.values
attributes_train = train.drop(['merchant_id','user_id','label'],axis=1).values
attributes_test = test.drop(['merchant_id','user_id','label'],axis=1).values

In [60]:
attributes_train = (attributes_train - attributes_train.mean(axis=0))/attributes_train.std(axis=0)
attributes_test = (attributes_test - attributes_test.mean(axis=0))/attributes_test.std(axis=0)

### NN (Stacked LR with ReLU)

In [71]:
stacked_lr_with_relu = Sequential()
stacked_lr_with_relu.add(Dense(10, input_shape=(10,), activation='relu'))
stacked_lr_with_relu.add(Dense(2, input_shape=(10,), activation=None))

stacked_lr_with_relu.compile(loss='sparse_categorical_crossentropy',
                            optimizer=Adam(learning_rate=1e-5),
                            metrics=['accuracy'])

stacked_lr_with_relu.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_26 (Dense)             (None, 10)                110       
_________________________________________________________________
dense_27 (Dense)             (None, 2)                 22        
Total params: 132
Trainable params: 132
Non-trainable params: 0
_________________________________________________________________


In [72]:
save_best = ModelCheckpoint('nn1.h5', save_best_only=True, verbose=2)
early_stopping = EarlyStopping(monitor="val_loss", patience=5)

In [73]:
time_start = time()
hist = stacked_lr_with_relu.fit(attributes_train, targets_train, epochs=50, 
                                callbacks=[save_best, early_stopping], 
                                validation_split=0.2, verbose=1)
time_stop = time()
time_elapsed = time_stop - time_start
print(time_elapsed / 60, '(min)')

Train on 166952 samples, validate on 41739 samples
Epoch 1/50
Epoch 00001: val_loss improved from inf to 2.58197, saving model to nn1.h5
Epoch 2/50
Epoch 00002: val_loss improved from 2.58197 to 2.35933, saving model to nn1.h5
Epoch 3/50
Epoch 00003: val_loss improved from 2.35933 to 2.16493, saving model to nn1.h5
Epoch 4/50
Epoch 00004: val_loss improved from 2.16493 to 1.95553, saving model to nn1.h5
Epoch 5/50
Epoch 00005: val_loss improved from 1.95553 to 1.81092, saving model to nn1.h5
Epoch 6/50
Epoch 00006: val_loss improved from 1.81092 to 1.70039, saving model to nn1.h5
Epoch 7/50
Epoch 00007: val_loss improved from 1.70039 to 1.55168, saving model to nn1.h5
Epoch 8/50
Epoch 00008: val_loss improved from 1.55168 to 1.43775, saving model to nn1.h5
Epoch 9/50
Epoch 00009: val_loss improved from 1.43775 to 1.36528, saving model to nn1.h5
Epoch 10/50
Epoch 00010: val_loss improved from 1.36528 to 1.30310, saving model to nn1.h5
Epoch 11/50
Epoch 00011: val_loss improved from 1.30

In [75]:
targeth_train = np.argmax(stacked_lr_with_relu.predict(attributes_train), axis=1)
targeth_test  = np.argmax(stacked_lr_with_relu.predict(attributes_test), axis=1)

In [80]:
print("train accuracy", round((targeth_train == targets_train).mean(), 3))
print("test accuracy", round((targeth_test == targets_test).mean(), 3))

train accuracy 0.806
test accuracy 0.803
