In [1]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate
from keras.layers import Bidirectional
from keras.optimizers import SGD
from keras.optimizers import Adam

import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
import pyreadr

result = pyreadr.read_r('masked.rds') 
df = result[None] # extract the pandas data frame 

In [None]:
# Data cleaning steps specific to data-set received
# This portion has been removed.

In [3]:
# Using just the first 100,000 rows first for initial run first, to save on computational load
# raw_df = df.head(100000)
raw_df = df.copy()

In [4]:
# Rank refers to the last few rides before churning (In the range of 1-10, 10 is the latest ride)
raw_df = raw_df.sort_values(
  by=['person','rank'], 
  ascending=[True,True])

In [None]:
# latest_status refers to the whether the person has churned away from using the service or not
raw_df["latest_status"].value_counts()

In [6]:
# Removing other "latest_status" values to make it a simplier binary classification model first.
raw_df = raw_df[raw_df.latest_status != "Risk"]
raw_df = raw_df[raw_df.latest_status != "Casual"]
raw_df = raw_df[raw_df.latest_status != "Re-Activated"]

In [7]:
# Simplified into a Churned vs. Engaged binary LSTM classification problem
raw_df["latest_status"].value_counts()

Churned    55803
Engaged    37832
Name: latest_status, dtype: int64

In [8]:
# Permutate and left_join back (sequential padding) to get 3D matrix for LSTM training
from itertools import product

def expand_grid(dictionary):
   return pd.DataFrame([row for row in product(*dictionary.values())], 
                       columns=dictionary.keys())

dictionary = {'dax': raw_df["person"].unique(), 
              'rank': raw_df["rank"].unique()}

In [None]:
df_exp = expand_grid(dictionary)
df_exp2 = pd.merge(df_exp, raw_df.drop(["latest_status"], axis=1), how="left", on=["person", "rank"])
df_exp2

In [12]:
# Scaling of input values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_df_train = scaler.fit_transform(df_exp2) # Should fit_transform on train set and transform on test set

In [13]:
rescaled_df_train

array([[0.        , 0.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.82608696, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.95652174, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.26086957, 0.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [0.26086957, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [14]:
# Reshape into 3D array (sample, timesteps, n_features)
X = array(rescaled_df_train).reshape(len(df_exp2["person"].unique()), 10, len(df_exp2.columns)-10)
X

array([[[0.        , 0.        , 1.        , ..., 0.        ,
         1.        , 1.        ],
        [0.        , 0.        , 1.        , ..., 0.        ,
         0.        , 0.        ],
        [0.82608696, 0.        , 1.        , ..., 0.        ,
         0.        , 0.        ],
        ...,
        [0.        , 0.        , 1.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 1.        , ..., 0.        ,
         0.        , 1.        ],
        [0.95652174, 0.        , 1.        , ..., 0.        ,
         1.        , 1.        ]],

       [[0.91304348, 0.        , 1.        , ..., 0.        ,
         0.        , 0.        ],
        [0.43478261, 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.47826087, 0.        , 1.        , ..., 0.        ,
         0.        , 0.        ],
        ...,
        [0.86956522, 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.9

In [15]:
Y = raw_df[["person", "latest_status"]].drop_duplicates()
Y["latest_status"].value_counts()

Churned    8238
Engaged    3787
Name: latest_status, dtype: int64

In [17]:
dict = {"Churned": 0, "Engaged": 1}

In [20]:
Y = Y.replace({"latest_status": dict})

In [22]:
Y = Y["latest_status"]

In [32]:
Y2 = array(Y).reshape(12025, 1)

In [34]:
print(Y2.shape)

(12025, 1)


In [23]:
print(Y.shape)
print(X.shape)

(12025,)
(12025, 10, 14)


In [35]:
# Train Test Split (80/20)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y2, shuffle=True, test_size=0.2)

In [36]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(9620, 10, 14)
(9620, 1)
(2405, 10, 14)
(2405, 1)


In [50]:
# Simple LSTM model
model = Sequential()
model.add(LSTM(150, activation='relu', return_sequences=True, input_shape=(10, 14)))
model.add(LSTM(50, activation='relu'))
# model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-6), metrics=['accuracy'])
#history = model.fit(X_train, Y_train, epochs=100, validation_split=0.2, verbose=1, shuffle=False, batch_size=200)

history = model.fit(X_train, Y_train, epochs=30, verbose=1, shuffle=False, batch_size=200, validation_data=(X_test, Y_test))

Train on 9620 samples, validate on 2405 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [46]:
print(model.summary())

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_12 (LSTM)               (None, 10, 150)           99000     
_________________________________________________________________
lstm_13 (LSTM)               (None, 50)                40200     
_________________________________________________________________
dropout_10 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 51        
Total params: 139,251
Trainable params: 139,251
Non-trainable params: 0
_________________________________________________________________
None
