In [59]:
import numpy as np
import pandas as pd

# Prep Data

In [60]:
admissions = pd.read_csv('binary.csv')

# make dummy variables for rank
data = pd.concat([admissions, pd.get_dummies(admissions['rank'], prefix='rank')], axis=1)

# standardize features
for field in ['gre', 'gpa']:
    # 'field' is either going to be 'gre' or 'gpa'
    
    # get the mean and the standard deviation derived from all values in field
    mean, std = data[field].mean(), data[field].std()
    
    # this is where features are standardized 
    #     the mean is being subtracted from the value
    #     then that number is being divided by the 
    #     standard deviation
    data.loc[:, field] = (data[field]-mean/std)
    
# split off random 10% of data for testing

# initiate RandomState by seeding it *
np.random.seed(42)

# this will return an array of indexes 
sample = np.random.choice(data.index, size=int(len(data)*0.9), replace=False)

# split test data away from training data
data, test_data = data.ix[sample], data.drop(sample)


# split into features and targets
# these are the variables that will be used below

# training data by dropping admit column and putting that into the 
#     results array/vector
features, targets = data.drop('admit', axis=1), data['admit']

# testing data
features_test, targets_test = test_data.drop('admit', axis=1), test_data['admit']

\* [RandomState Documentation](https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.html) for the above references to pandas.DataFrame.random 

# Implement Gradient Descent

In [61]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

def sigmoid_prime(x):
    '''
    Derivative of the sigmoid function
    '''
    return #sigmoid - np.square(sigmoid(x))

In [67]:
# in the data prep section the variables 
#     features, targets, features_test, and targets_test were created

# RandomState was seeded above
# np.randomseed(42)

# the shape of features is (360, 7) with 360 records and 7 features
#     this will set n_records to 360 and n_features to 7
n_records, n_features = features.shape 

last_loss = None

# initialize weights. the '**-0.5' 
weights = np.random.normal(scale=1 / n_features**.5, size=n_features)

print(weights)

# neural network hyperparameters
epochs = 10000
learnrate = 0.3


# train the network
for e in range(epochs):
    del_w = np.zeros(weights.shape)
    for x, y in zip(features.values, targets):
        
        h = np.dot(x, weights)
        
        output = sigmoid(h)
        
        error = y - output
        '''
        I'm pretty sure this is some calculus I don't understand.
        Before, we used the sigmoid_prime(h) to get the error_term
        but sigmoid_prime looks nothing like the below for me
        '''
        error_term = error * output * (1-output)
        
        # therefore I don't understand what it's doing right here
        #    it's setting the delta_w as a ratio of the x
        #    value I'm pretty sure
        del_w += error_term * x
        
    # this is using the learnrate to step up the weights
    #    by a multiple(learnrate) of the average change 
    #    in weights
    weights += learnrate * del_w / n_records
    
    # Printing out the mean square error on the training set
    if e % (epochs / 10) == 0:
        out = sigmoid(np.dot(features, weights))
        loss = np.mean((out - targets) ** 2)
        if last_loss and last_loss < loss:
            print("Train loss: ", loss, "  WARNING - Loss Increasing")
        else:
            print("Train loss: ", loss)
        last_loss = loss
        
# Calculate accuracy on test data
tes_out = sigmoid(np.dot(features_test, weights))
predictions = tes_out > 0.5
accuracy = np.mean(predictions == targets_test)
print("Prediction accuracy: {:.3f}".format(accuracy))













# learnrate = 0.5

# # This is where the linear combination (aggregation) happens
# # h = x[0] * w[0] + x[1] * w[1] 
# h = np.dot(x, w)

# # The neaural network output (y-hat) (activation function)
# nn_output = sigmoid(h)

# # error equals expected minus output
# error = y - nn_output

# # output gradient (f'(h)) <-- derivative of f(h) (the activation function)
# output_gradient = sigmoid_prime(h)

# # error term (lowercase delta)
# error_term = error * output_gradient

# # gradient descent step
# del_w = [learnrate * error_term * x]

[  5.26142084e-02   6.85480599e-04   2.85716680e-01   7.30810421e-01
   1.56429985e-01   2.14596272e-01   5.60074300e-02]
('Train loss: ', 0.67499997594253025)
('Train loss: ', 0.67499997098034248)
('Train loss: ', 0.67499996341321034)
('Train loss: ', 0.67499995043868555)
('Train loss: ', 0.67499992293208999)
('Train loss: ', 0.67499982375894108)
('Train loss: ', 0.3249999872578116)
('Train loss: ', 0.32499998454661438)
('Train loss: ', 0.32499998035755862)
('Train loss: ', 0.32499997302097661)
Prediction accuracy: 0.750
