In [2]:
import numpy as np
import pandas as pd

In [5]:
# Load the raw e-commerce dataset and preview its first rows to see which
# columns are available before any preprocessing is applied.
df = pd.read_csv('../data/ecommerce_data.csv')
df.head()

Unnamed: 0,is_mobile,n_products_viewed,visit_duration,is_returning_visitor,time_of_day,user_action
0,1,0,0.65751,0,3,0
1,1,1,0.568571,0,2,1
2,1,0,0.042246,1,1,0
3,1,1,1.659793,1,1,2
4,0,1,2.014745,1,1,2


## Preprocessing

In [11]:
def get_data(path='../data/ecommerce_data.csv'):
    """Load the e-commerce CSV and build the feature matrix and target vector.

    The last CSV column is treated as the target; every other column is a
    feature. Feature columns 1 and 2 are standardized to zero mean and unit
    variance (presumably `n_products_viewed` and `visit_duration`, going by
    the data preview -- verify against the file). The last feature column is
    a category taking the values 0/1/2/3 and is replaced by four one-hot
    indicator columns.

    Parameters
    ----------
    path : str or file-like, optional
        Anything `pandas.read_csv` accepts. Defaults to the notebook's
        dataset location.

    Returns
    -------
    X2 : numpy.ndarray of shape (N, D + 3)
        Features, where D is the number of feature columns in the CSV.
    Y : numpy.ndarray of shape (N,)
        Targets taken from the last CSV column.
    """
    df = pd.read_csv(path)
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
    # supported replacement. Forcing float64 keeps the in-place
    # standardization below from being truncated by an integer dtype.
    data = df.to_numpy(dtype=np.float64)

    # Y is the last column, so X is everything else
    X = data[:, :-1]
    Y = data[:, -1]

    # Standardize the numerical columns
    for col in (1, 2):
        X[:, col] = (X[:, col] - X[:, col].mean()) / X[:, col].std()

    # Swapping 1 categorical column for 4 indicator columns widens X by 3
    N, D = X.shape
    X2 = np.zeros((N, D + 3))
    # All non-categorical columns are copied over unchanged
    X2[:, :D - 1] = X[:, :D - 1]

    # One-hot encode the category: row n gets a 1 in column (D - 1) + category
    categories = X[:, D - 1].astype(np.int32)
    X2[np.arange(N), categories + D - 1] = 1

    return X2, Y

In [12]:
# Logistic regression is a binary classifier, so we keep only the samples from classes 0 and 1
def get_binary_data():
    """Return the preprocessed features and targets restricted to targets <= 1.

    Calls get_data() and keeps only the rows whose target value is at most 1,
    i.e. the first two classes.
    """
    features, targets = get_data()
    # Build the row mask once and apply it to both arrays
    keep = targets <= 1
    return features[keep], targets[keep]

## Making Predictions

In [13]:
# Binary subset (targets <= 1) of the preprocessed data: features X, targets Y
X, Y = get_binary_data()

In [17]:
# Number of feature columns; the model needs one weight per feature
_, D = X.shape
# Random starting weights drawn from a standard normal distribution.
# NOTE(review): the global RNG is not seeded, so these differ on every run.
W = np.random.standard_normal(D)
# The bias starts at zero
b = 0

In [15]:
# Functions for predictions - sigmoid and forward
def sigmoid(a):
    """Numerically stable logistic function 1 / (1 + exp(-a)).

    The textbook form overflows in exp(-a) for large negative `a`, which
    emits a RuntimeWarning. Here the exponential is only ever taken of a
    non-positive number, so it stays within [0, 1].

    Parameters
    ----------
    a : scalar or array-like
        Activation value(s).

    Returns
    -------
    Scalar or numpy.ndarray with the same shape as `a`, values in [0, 1].
    """
    a = np.asarray(a)
    # exp(-|a|) is at most 1, so it can underflow to 0 but never overflow
    z = np.exp(-np.abs(a))
    # a >= 0: 1 / (1 + e^-a);  a < 0: e^a / (1 + e^a) -- the same value
    result = np.where(a >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with () turns a 0-d result back into a scalar; arrays pass through
    return result[()]

def forward(X, W, b):
    """Compute sigmoid(X . W + b) for inputs X, weights W and bias b."""
    # Linear activation: weighted sum of the features plus the bias
    activation = np.dot(X, W) + b
    return sigmoid(activation)

In [18]:
# Forward pass with the current weights; the name suggests these are the
# model's estimates of P(Y=1 | X) for each row.
P_Y_given_X = forward(X, W, b)
# Round the sigmoid outputs to the nearest integer to get hard 0/1 labels
predictions = np.round(P_Y_given_X)

In [19]:
# Determine classification rate
# Parameters: Targets and predictions
def classification_rate(Y, P):
    """Return the fraction of predictions that equal their targets.

    Parameters
    ----------
    Y : array-like
        Target labels.
    P : array-like
        Predicted labels, same shape as `Y`.

    Returns
    -------
    float
        Number of correct predictions divided by the total number.

    Raises
    ------
    ValueError
        If `Y` and `P` are both non-scalar and their shapes differ. Without
        this check, (N,) against (N, 1) would silently broadcast to an
        N x N comparison and give a meaningless score.
    """
    # Convert first: comparing two Python lists with == yields a single bool,
    # not an element-wise result.
    targets = np.asarray(Y)
    preds = np.asarray(P)
    if targets.ndim and preds.ndim and targets.shape != preds.shape:
        raise ValueError(
            "Y and P must have the same shape, got %s and %s"
            % (targets.shape, preds.shape)
        )
    return np.mean(targets == preds)

In [20]:
# Accuracy of the untrained model on the binary subset
print("Score:", classification_rate(Y, predictions))

Score: 0.653266331658


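This is a baseline score obtained with randomly initialized (untrained) weights; because the random draw is not seeded, the exact value will vary from run to run. Next: train the weights so the predictions become more accurate.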