In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize data

In [10]:
# Load the cleaned train / validation review tables, using the first CSV
# column as the index. `pd.read_csv(..., index_col=0)` replaces
# `pd.DataFrame.from_csv`, which is deprecated and was removed in pandas 1.0.
train_df = pd.read_csv('../data/clean/train_reviews.csv', index_col=0)
val_df = pd.read_csv('../data/clean/val_reviews.csv', index_col=0)
train_df.head()

Unnamed: 0_level_0,hour_of_gameplay,content_review,helpful_vote,total_vote,funny_vote,number_comment,polarity
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
730,484.4,Such a great game.... but it will suck you in ...,2,2,1,1,1
730,724.0,Silver=Global ...,1,2,0,1,0
730,1315.4,The game of a generation. Travel the globe and...,4,4,0,5,1
730,64.6,I have been playing Counter-Strike since 1.5 a...,1,1,0,0,0
730,630.0,I think this might be the best game ever.CSGO ...,2,2,0,0,1


In [25]:
# Drop every training row that has a missing value in any column (dropna's
# default), rebinding the name instead of mutating the frame in place, then
# summarize the remaining rows.
train_df = train_df.dropna()
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13834 entries, 730 to 440
Data columns (total 7 columns):
hour_of_gameplay    13834 non-null float64
content_review      13834 non-null object
helpful_vote        13834 non-null int64
total_vote          13834 non-null int64
funny_vote          13834 non-null int64
number_comment      13834 non-null int64
polarity            13834 non-null int64
dtypes: float64(1), int64(5), object(1)
memory usage: 864.6+ KB


In [31]:
def vectorize_data(reviews_df, vectorizer = None):
    '''
    Builds the design matrix X and the label column Y from a reviews table.

    Parameters
    ----------
    reviews_df : DataFrame
        Must expose the columns `content_review`, `helpful_vote`, `total_vote`,
        `funny_vote`, `number_comment` and `polarity`.
    vectorizer : object with a `transform` method, or None
        If None, a new CountVectorizer is created and fitted on the reviews of
        `reviews_df`; otherwise the given vectorizer is used as is.

    Returns
    -------
    vectorizer : the vectorizer that was used (the newly fitted one, or the one passed in).
    X : numpy array, shape (N, 1 + V + 4)
        A leading column of ones, then the V columns produced by the vectorizer,
        then the helpful / total / funny vote counts and the number of comments.
    Y : numpy array, shape (N, 1)
        The `polarity` column.
    '''
    texts = reviews_df.content_review.values

    # No vectorizer supplied: learn the vocabulary from these reviews.
    if vectorizer is None:
        vectorizer = CountVectorizer(stop_words = 'english', min_df = 0.001, max_df = 0.8, max_features = 10000)
        vectorizer.fit(texts)

    word_counts = vectorizer.transform(texts).toarray()

    def as_column(series):
        # Turns a DataFrame column into an (N, 1) array.
        return series.values.reshape(-1, 1)

    bias_column = np.ones((len(word_counts), 1))
    extra_columns = [as_column(col) for col in (reviews_df.helpful_vote,
                                                reviews_df.total_vote,
                                                reviews_df.funny_vote,
                                                reviews_df.number_comment)]
    X = np.hstack([bias_column, word_counts] + extra_columns)
    Y = as_column(reviews_df.polarity)
    return vectorizer, X, Y

In [32]:
# Fit the vectorizer on the training reviews only, then reuse that same
# vectorizer for the validation reviews so both matrices share their columns.
vectorizer, X_train, Y_train = vectorize_data(train_df)
_, X_val, Y_val = vectorize_data(val_df, vectorizer)
# Single-argument print(...) produces the same text on Python 2 and Python 3
# (the bare `print a, b` statement is a SyntaxError on Python 3).
print("%s %s" % (X_train.shape, Y_train.shape))
print("%s %s" % (X_val.shape, Y_val.shape))

(13834, 7266) (13834, 1)
(2769, 7266) (2769, 1)


# Preprocess data

In [33]:
def preprocess_data(X, X_mean = None, X_std = None):
    '''
    Standardizes, IN PLACE, every column of X except the first one.

    Parameters
    ----------
    X : numpy array, shape (N, d + 1)
        The first column is left untouched; columns 1..d are overwritten with
        (column - mean) / std.
    X_mean : numpy array, shape (d,), or None
        Per-column means to subtract. If None, they are computed from X[:, 1:].
    X_std : numpy array, shape (d,), or None
        Per-column standard deviations to divide by. If None, they are computed
        from X[:, 1:].

    Returns
    -------
    X_mean, X_std : the statistics that were used (computed here or passed in),
        so they can be re-applied to another matrix. X_std is returned unmodified,
        i.e. it may still contain zeros for constant columns.
    '''
    features = X[:, 1 :]
    if X_mean is None:
        X_mean = features.mean(axis = 0)
    if X_std is None:
        X_std = features.std(axis = 0)

    # A constant column has a standard deviation of 0; dividing by it would fill
    # the column with NaN/inf. Divide such columns by 1 instead, so they simply
    # become (value - mean).
    safe_std = np.where(np.asarray(X_std) == 0, 1.0, X_std)

    X[:, 1 :] = (features - X_mean) / safe_std
    return X_mean, X_std

In [34]:
# Compute the standardization statistics on the training matrix only and apply
# those same statistics to the validation matrix (both are modified in place).
X_mean, X_std = preprocess_data(X_train)
_ = preprocess_data(X_val, X_mean, X_std)
# Single-argument print(...) produces the same text on Python 2 and Python 3
# (the bare `print a, b` statement is a SyntaxError on Python 3).
print("%s %s" % (X_train.mean(axis = 0), X_train.std(axis = 0)))
print("%s %s" % (X_val.mean(axis = 0), X_val.std(axis = 0)))

[  1.00000000e+00   1.88794894e-15  -4.63821871e-15 ...,  -4.35749295e-15
   3.55689086e-15  -2.07090120e-15] [ 0.  1.  1. ...,  1.  1.  1.]
[ 1.          0.01587495 -0.00708178 ..., -0.00375977  0.00669722
 -0.00899576] [ 0.          1.19452817  0.79919985 ...,  0.90281573  0.98006287
  0.91318821]


# Analyze

In [35]:
def sigmoid(S):
    '''
    Applies the logistic function 1 / (1 + e^(-s)) to each element of array S.
    '''
    denominator = 1 + np.exp(-S)
    return 1 / denominator
def softmax(S):
    '''
    Computes softmax function for each row of array S.

    Parameters
    ----------
    S : numpy array, shape (N, K)
        Each row holds the K scores of one sample. S itself is not modified.

    Returns
    -------
    A : numpy array, shape (N, K)
        Row-wise softmax of S; every row is non-negative and sums to 1.
    '''
    # Softmax is unchanged when a constant is subtracted from a whole row.
    # Subtracting the row maximum makes the largest exponent exactly 0, so
    # np.exp cannot overflow to inf (which previously produced inf / inf = NaN).
    shifted = S - np.max(S, axis=1, keepdims=True)
    A = np.exp(shifted)
    A /= A.sum(axis=1, keepdims=True)
    return A
def compute_nnet_outputs(Ws, X, need_all_layer_outputs):
    '''
    Computes the outputs of Neural Net by forward propagating X through the net.
    
    Hidden layers use the sigmoid activation and get a leading column of ones
    prepended to their output (the +1 neuron); the final layer uses softmax.
    
    Parameters
    ----------
    Ws : list of numpy arrays
        Ws[l-1] is W of layer l with l >= 1 (layer 0 is input layer; it doesn't have W);
        W of layer l will have the shape of (d^(l-1)+1, d^(l)), where 
        d^(l-1) is the number of neurons (not count the +1 neuron) of layer l-1 and 
        d^(l) is the number of neurons (not count the +1 neuron) of layer l.
    X : numpy array, shape (N, d+1)
        The matrix of input vectors (each row corresponds to an input vector); 
        the first column of this matrix is all ones (corresponding to x_0).
    need_all_layer_outputs : bool
        If this var is true, we'll return a list of layer's-outputs; 
        otherwise, we'll return the final layer's output.
    
    Returns
    -------
    If `need_all_layer_outputs` is false, return
        A : numpy array, shape (N, K), where K is the number of columns of Ws[-1]
            The matrix of output vectors of final layer; each row is an output vector (containing 
            each class's probability given the corresponding input vector).
            If `Ws` is empty, this is X itself.
    Else, return
        As : list of numpy arrays
            As[l] is the matrix of output vectors of layer l; each row is an output vector (corresponding 
            to an input vector). As[0] is X itself.
    '''
    A = X
    As = [X]
    last = len(Ws) - 1
    # enumerate() works on both Python 2 and Python 3 (xrange exists only on Python 2).
    for i, W in enumerate(Ws):
        Z = A.dot(W)
        if i < last:
            # Hidden layer: squash, then prepend the constant +1 neuron.
            A = np.hstack((np.ones((Z.shape[0], 1)), sigmoid(Z)))
        else:
            # Output layer: class probabilities.
            A = softmax(Z)
        if need_all_layer_outputs:
            As.append(A)
    
    if need_all_layer_outputs:
        return As
    return A
    

In [36]:
def _mean_binary_error(Ws, X, Y):
    '''
    Returns the MBE (in percent) of the net `Ws` on (X, Y): the percentage of rows
    whose predicted class (argmax of the net's output) differs from the label in Y.
    '''
    A = compute_nnet_outputs(Ws, X, False)
    return np.mean(A.argmax(axis = 1).reshape(-1, 1) != Y) * 100


def train_nnet(train_X, train_Y, val_X, val_Y, hid_layer_sizes, wd_level,
               mb_size, learning_rate, max_patience, max_epoch=1000000, momentum_param=0.,
               print_every=1):
    '''
    Trains Neural Net on the dataset (train_X, train_Y).
    Cost function: mean negative log likelihood + weight decay.
    Optimization algorithm: SGD with momentum; stopping criteria: early stopping and/or max epoch.
    
    Parameters
    ----------
    train_X : numpy array, shape (N, d + 1)
        The matrix of training input vectors (each row corresponds to an input vector); 
        the first column of this matrix is all ones (corresponding to x_0).
    train_Y : numpy array of ints, shape (N, 1)
        The vector of training labels. The number of classes K is the number of distinct 
        values in `train_Y`; the labels are used directly as column indexes of the one-hot 
        matrix, so they must be integers in 0..K-1.
    val_X : numpy array, shape (N_val, d + 1)
        The matrix of validation input vectors (same layout as `train_X`).
    val_Y : numpy array of ints, shape (N_val, 1)
        The vector of validation labels.
    hid_layer_sizes : list
        The list of hidden layer sizes; e.g., hid_layer_sizes = [20, 10] means: 
        the Net has 2 hidden layers, the 1st one has 20 neurons, and the 2nd one has 
        10 neurons (not count the +1 neurons).
    wd_level : float
        The level (coefficient) of weight decay.
    mb_size : int
        Minibatch size of SGD.
    learning_rate : float
        Learning rate of SGD.
    max_patience : int (> 0) or None
        The parameter of early stopping. We'll have a `patience` variable with initial value equal to
        `max_patience`. During the training, we'll keep track of the best MBE (Mean Binary Error) 
        on the validation set; if the MBE on the validation set at the current epoch < the current 
        best one, we'll reset `patience` to `max_patience`; otherwise, `patience` -= 1. 
        When `patience` = 0 or `max_epoch` is reached, we'll terminate SGD.
        If `max_patience` is None, we don't use early stopping.
    max_epoch : int, default 1000000
        We'll terminate SGD after this number of epochs or when `patience` = 0 (if early stopping is used).
    momentum_param : float, default 0.
        The momentum coefficient: each update is 
        `V = momentum_param * V - learning_rate * gradient; W += V`.
    print_every : int, default 1
        The progress line is printed for every epoch whose index is a multiple of this value 
        (1 = every epoch). A value of 0 or None disables the per-epoch printing.
    
    Returns
    -------
    Ws : list of numpy arrays
        Ws[l-1] is W of layer l with l >= 1 (layer 0 is input layer; it doesn't have W);
        W of layer l will have the shape of (d^(l-1)+1, d^(l)), where 
        d^(l-1) is the number of neurons (not count the +1 neuron) of layer l-1 and 
        d^(l) is the number of neurons (not count the +1 neuron) of layer l.
        *If `max_patience` is None, Ws are weights after the final epoch; 
        otherwise, Ws are weights corresponding to the best MBE on the validation set.*
    train_errs : list, len = num epochs spent on training
        The list of MBEs (in percent) on the training set after each epoch.
    val_errs : list, len = num epochs spent on training
        The list of MBEs (in percent) on the validation set after each epoch.
    
    Notes
    -----
    - This function calls `np.random.seed(0)`, i.e. it reseeds numpy's global random generator,
      so the initial weights and the minibatch order are the same on every call.
    - Every `print_every` epochs it prints
      `Epoch ..., training err ..., val err ..., patience ...` (the patience part only if 
      `max_patience` is not None).
    - After the training it prints `Info of returned Ws: epoch ..., train err ..., val err ...`.
    '''
    # Init Ws
    K = len(np.unique(train_Y)) # Num classes
    layer_sizes = [train_X.shape[1] - 1] + list(hid_layer_sizes) + [K]
    num_layers = len(layer_sizes)
    np.random.seed(0) # This fixes the randomization (note: it reseeds the GLOBAL numpy generator)
    Ws = [np.random.randn(layer_sizes[l]+1, layer_sizes[l+1]) / np.sqrt(layer_sizes[l]+1) 
          for l in range(num_layers-1)]
    best_Ws = [np.copy(W) for W in Ws]
    
    Vs = [np.zeros_like(W) for W in Ws] # Momentum "velocities", one per weight matrix
    train_errs = []
    val_errs = []
    N = train_X.shape[0]
    mbid = np.arange(N)
    Y_onehot = np.zeros((N, K))
    Y_onehot[np.arange(N).reshape(-1, 1), train_Y] = 1
    patience = max_patience
    best_val_err = np.inf
    best_epoch = 0
    
    # A plain counter keeps the loop lazy on both Python 2 and Python 3 
    # (xrange exists only on Python 2; range(max_epoch) would build a huge list there).
    epoch = 0
    while epoch < max_epoch:
        np.random.shuffle(mbid)
        
        for i in range(0, N, mb_size):
            batch = mbid[i : i + mb_size]
            M = len(batch) # The last minibatch may be smaller than mb_size
            As = compute_nnet_outputs(Ws, train_X[batch, :], True)
            
            # Output layer: gradient of softmax + negative log likelihood w.r.t. the scores
            delta = As[-1] - Y_onehot[batch, :]
            Wgrad = As[-2].T.dot(delta) + 2 * M * wd_level * Ws[-1]
            
            # Walk from the last weight matrix (l = 1) back to the first one.
            for l in range(1, num_layers):
                has_lower_layer = l + 1 < num_layers
                if has_lower_layer:
                    # Back-propagate delta through Ws[-l] BEFORE that matrix is updated; 
                    # [:, 1:] drops the +1 neuron, which has no incoming weights.
                    delta = (delta.dot(Ws[-l].T) * As[-l - 1] * (1. - As[-l - 1]))[:, 1:]
                Vs[-l] = momentum_param * Vs[-l] - learning_rate * Wgrad / M
                Ws[-l] += Vs[-l]
                if has_lower_layer:
                    Wgrad = As[-l - 2].T.dot(delta) + 2 * M * wd_level * Ws[-l - 1]
            
        train_err = _mean_binary_error(Ws, train_X, train_Y)
        train_errs.append(train_err)
        
        val_err = _mean_binary_error(Ws, val_X, val_Y)
        val_errs.append(val_err)
        
        if max_patience is not None:
            patience -= 1
            if val_err < best_val_err:
                best_val_err = val_err
                best_epoch = epoch
                patience = max_patience
                best_Ws = [np.copy(W) for W in Ws]
        else:
            best_epoch = epoch
        
        if print_every and epoch % print_every == 0:
            if max_patience is not None:
                print("Epoch %d, training err %.3f, val err %.3f, patience %d" % (epoch, train_err, val_err, patience))
            else:
                print("Epoch %d, training err %.3f, val err %.3f" % (epoch, train_err, val_err))
                
        if max_patience is not None and patience <= 0:
            break
        epoch += 1
            
    if train_errs:
        print("Info of returned Ws: epoch %d, train err %.3f, val err %.3f" % \
                    (best_epoch, train_errs[best_epoch], val_errs[best_epoch]))
    else:
        # max_epoch <= 0: nothing was trained, so there is no error to report.
        print("Info of returned Ws: no epoch was run, returning the initial weights")
        
    if max_patience is not None:
        return (best_Ws, train_errs, val_errs)
    return (Ws, train_errs, val_errs)
    

In [40]:
# Train a net with a single hidden layer of 50 neurons, using early stopping
# with a patience of 5 and at most 50 epochs.
Ws_0, train_errs_0, val_errs_0 = train_nnet(
    train_X=X_train,
    train_Y=Y_train,
    val_X=X_val,
    val_Y=Y_val,
    hid_layer_sizes=[50],
    wd_level=0.01,
    mb_size=32,
    learning_rate=0.03,
    max_patience=5,
    max_epoch=50,
    momentum_param=0.0,
)

Epoch 0, training err 9.202, val err 10.798, patience 5
Epoch 1, training err 6.686, val err 8.162, patience 5
Epoch 2, training err 5.667, val err 7.692, patience 5
Epoch 3, training err 5.450, val err 7.548, patience 5
Epoch 4, training err 4.995, val err 7.476, patience 5
Epoch 5, training err 5.226, val err 7.728, patience 4
Epoch 6, training err 4.648, val err 7.584, patience 3
Epoch 7, training err 5.674, val err 7.801, patience 2
Epoch 8, training err 5.176, val err 7.765, patience 1
Epoch 9, training err 4.850, val err 7.692, patience 0
Info of returned Ws: epoch 4, train err 4.995, val err 7.476
