In [59]:
import pandas as pd
import numpy as np

In [60]:
# Load the test split once, only to work out which columns to discard.
test = pd.read_pickle('data_test.p')

# Every column stored with the generic object dtype, plus two named columns,
# is removed from all splits before modelling.
object_cols = [name for name, dtype in test.dtypes.items() if dtype == object]
drop_cols = [*object_cols, 'date', 'average_stars']

In [61]:
# Training split with the columns listed in drop_cols removed.
train = pd.read_pickle('data_train.p').drop(columns=drop_cols)

In [62]:
# Feature matrix: every training column except the 'stars' target.
train_X = train.drop(columns='stars')
# Per-column statistics of the training features; transform() reads these.
mean, std = train_X.mean(), train_X.std()

def transform(data, center=None, scale=None):
    """Standardise ``data`` column-wise and zero-fill undefined results.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns to standardise.
    center, scale : pandas.Series, optional
        Per-column offset and divisor. When omitted, the notebook-level
        ``mean`` and ``std`` are used.

    Returns
    -------
    pandas.DataFrame
        ``(data - center) / scale`` with NaN and +/-inf replaced by 0.
    """
    if center is None:
        center = mean
    if scale is None:
        scale = std
    scaled = (data - center) / scale
    # A zero divisor yields NaN for 0/0 but +/-inf for x/0. fillna alone
    # leaves the infinities in place, so map them to NaN first.
    return scaled.replace([np.inf, -np.inf], np.nan).fillna(0)

In [63]:
# Standardise straight from the untouched `train` frame rather than from
# `train_X` itself, so re-running this cell does not standardise data that
# has already been standardised.
train_X = transform(train.drop('stars', axis=1))
# Held-out splits, with the same columns removed as for the training split.
test = pd.read_pickle('data_test.p').drop(drop_cols, axis=1)
val = pd.read_pickle('data_val.p').drop(drop_cols, axis=1)

In [80]:
from sklearn.neural_network import MLPRegressor

# Targets are the 'stars' column of each split.
train_y, val_y, test_y = train['stars'], val['stars'], test['stars']
# Features are every other column, standardised with transform().
val_X = transform(val.drop(columns='stars'))
test_X = transform(test.drop(columns='stars'))

def get_preds(clf, val_X):
    """Predict with ``clf`` and clamp every prediction into [1, 5].

    Parameters
    ----------
    clf : object exposing ``predict(X)`` that returns a NumPy array.
    val_X : features handed unchanged to ``clf.predict``.

    Returns
    -------
    numpy.ndarray
        The array returned by ``predict``, clamped in place.
    """
    raw = clf.predict(val_X)
    # out=raw keeps the clamping in place, like boolean-mask assignment.
    np.clip(raw, 1, 5, out=raw)
    return raw

def get_accuracy(solver, alpha, hidden_layer_sizes, activation='relu', batch_size=200,
                 learning_rate='constant', random_state=None):
    """Fit an MLPRegressor on the training split and score it on validation.

    Despite the name, the returned score is a mean squared error, so lower
    is better.

    Parameters
    ----------
    solver, alpha, hidden_layer_sizes, activation, batch_size, learning_rate
        Passed straight through to ``MLPRegressor``.
    random_state : int or None, optional
        Seed passed to ``MLPRegressor``. The default ``None`` leaves the
        fit unseeded, as before; pass an int to get repeatable scores.

    Returns
    -------
    float
        Mean of the squared differences between the clamped predictions
        for ``val_X`` and ``val_y``.
    """
    clf = MLPRegressor(solver=solver, alpha=alpha, hidden_layer_sizes=hidden_layer_sizes,
        activation=activation, batch_size=batch_size, learning_rate=learning_rate,
        random_state=random_state)
    clf.fit(train_X, train_y)
    return np.mean((get_preds(clf, val_X) - val_y) ** 2)

In [65]:
# NOTE(review): n is not used below and 101 is hard-coded in the linspace
# call -- confirm whether the widest layer was meant to derive from n.
n = train_X.shape[1]

# One candidate per depth: the interior points of an evenly spaced ramp from
# 101 down to 1, so the first candidate (two points, no interior) is empty.
hidden_layers = [
    tuple(np.linspace(101, 1, depth).round().astype(int)[1:-1])
    for depth in range(2, 10)
]

# Each score is the validation MSE from get_accuracy (lower is better).
# Original note: these perform comparably to the smaller sized layers.
hidden_layer_accuracies = []
for sizes in hidden_layers:
    accuracy = get_accuracy(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=sizes)
    print(accuracy)
    hidden_layer_accuracies.append(accuracy)

KeyboardInterrupt: 

In [None]:
# Small architectures to compare. (5,) is a single hidden layer of 5 units;
# a bare (5) is just the integer 5, not a one-element tuple.
hidden_layers = [(5, 2), (5,), (5, 3, 2)]

# Validation MSE (lower is better) for each architecture, in the same order.
accuracies = []
for sizes in hidden_layers:
    accuracy = get_accuracy(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=sizes)
    print(accuracy)
    accuracies.append(accuracy)

In [None]:
# Validation MSE (get_accuracy returns a mean squared error, not an accuracy)
# for a network with no hidden layers, i.e. inputs feed the output directly.
# NOTE(review): as this is MLPRegressor, that is presumably a linear
# regression rather than a logistic regression -- confirm.
accuracy = get_accuracy(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=())
print(accuracy)

In [70]:
# Baseline on the test split: use the avg_stars_x column as the prediction
# for 'stars' and report the mean squared error.
# NOTE(review): whether avg_stars_x is the user's or the restaurant's average
# is not visible in this notebook -- confirm.
print(((test['avg_stars_x'] - test['stars']) ** 2).mean())

1.548369652113151


In [None]:
# Search space for the random hyper-parameter search.
activations = np.array(['logistic', 'tanh', 'relu'])
solvers = np.array(['adam', 'sgd', 'lbfgs'])
alphas = np.array([1e-5, 1e-4, 1e-3])
batch_sizes = np.array([100, 200, 300])
learning_rates = np.array(['constant', 'invscaling', 'adaptive'])

# One dict per trial, collected here and turned into a DataFrame once at the
# end. DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call. The list also survives an interrupted run.
accuracy_records = []

for i in range(10):
    solver = np.random.choice(solvers)
    alpha = np.random.choice(alphas)
    activation = np.random.choice(activations)
    batch_size = np.random.choice(batch_sizes)
    learning_rate = np.random.choice(learning_rates)

    # (5,) is a single hidden layer of 5 units; a bare (5) is the int 5.
    # The returned 'accuracy' is the validation MSE, so lower is better.
    accuracy = get_accuracy(solver, alpha, (5,), activation, batch_size, learning_rate)

    record = {'activation': activation, 'solver': solver,
              'alpha': alpha, 'batch_size': batch_size,
              'learning_rate': learning_rate, 'accuracy': accuracy}
    accuracy_records.append(record)
    # name=i makes the printout match the row label the frame will carry.
    print(pd.Series(record, name=i))
    print('\n')

accuracies = pd.DataFrame(
    accuracy_records,
    columns=['activation', 'solver', 'alpha', 'batch_size', 'learning_rate', 'accuracy'])

In [None]:
accuracies

In [None]:
# Candidate architectures, kept as a plain list of tuples of Python ints.
# Wrapping these ragged sequences in np.array raises on recent NumPy, and
# plain ints keep the tuple's text form (as written to CSV) easy to parse.
hidden_layers = [(5,), (5, 2)]

for i in range(2, 10):
    # Interior points of an evenly spaced ramp from 101 down to 1.
    hidden_layers.append(
        tuple(int(size) for size in np.linspace(101, 1, i).round().astype(int)[1:-1]))

# One dict per trial, turned into a DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the frame each call.
hidden_layer_records = []

for i in range(100):
    solver = np.random.choice(solvers)
    alpha = np.random.choice(alphas)
    activation = np.random.choice(activations)
    batch_size = np.random.choice(batch_sizes)
    learning_rate = np.random.choice(learning_rates)
    # Pick by index: np.random.choice needs a 1-D array, not a list of tuples.
    hidden_layer_sizes = hidden_layers[np.random.randint(len(hidden_layers))]

    # The returned 'accuracy' is the validation MSE, so lower is better.
    accuracy = get_accuracy(
        solver, alpha, hidden_layer_sizes, activation, batch_size, learning_rate)

    record = {'activation': activation, 'solver': solver, 'alpha': alpha,
              'batch_size': batch_size, 'learning_rate': learning_rate,
              'hidden_layers': hidden_layer_sizes, 'accuracy': accuracy}
    hidden_layer_records.append(record)
    # name=i makes the printout match the row label the frame will carry.
    print(pd.Series(record, name=i))
    print('\n')

accuracies_with_hidden_layers = pd.DataFrame(
    hidden_layer_records,
    columns=['activation', 'solver', 'alpha', 'batch_size', 'learning_rate',
             'hidden_layers', 'accuracy'])

In [None]:
# Save the search results. index=False leaves out the row labels, which
# would otherwise come back as an extra 'Unnamed: 0' column on reload.
accuracies_with_hidden_layers.to_csv('nn_accuracies.csv', index=False)

In [66]:
# Reload the saved search results from 'nn_accuracies.csv' so the cells below
# can run without repeating the search.
accuracies_with_hidden_layers = pd.read_csv('nn_accuracies.csv')

In [85]:
import ast

from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

# Sort ascending by the stored score (an MSE, so lower is better) and take
# the first row by position. .loc[0] would instead return the row labelled
# 0, i.e. the first trial that was run, whatever its score.
best = accuracies_with_hidden_layers.sort_values('accuracy').iloc[0]

# After a CSV round trip the layer sizes are text such as "(5, 2)", "(5,)"
# or "()". literal_eval handles all of these; splitting on commas does not.
layer_spec = best['hidden_layers']
if isinstance(layer_spec, str):
    layer_spec = ast.literal_eval(layer_spec)
best_layers = tuple(int(size) for size in np.atleast_1d(layer_spec))

clf = MLPRegressor(solver=best['solver'], alpha=float(best['alpha']),
    hidden_layer_sizes=best_layers,
    activation=best['activation'], batch_size=int(best['batch_size']),
    learning_rate=best['learning_rate'])

# pd.concat replaces DataFrame/Series.append, removed in pandas 2.0.
train_and_val_X = pd.concat([train_X, val_X])
train_and_val_y = pd.concat([train_y, val_y])

n_splits = 3
mse = 0

# Average the per-fold MSE over forward-chaining splits of train+val.
for train_index, test_index in TimeSeriesSplit(n_splits=n_splits).split(train_and_val_X):
    clf.fit(train_and_val_X.iloc[train_index], train_and_val_y.iloc[train_index])
    fold_preds = get_preds(clf, train_and_val_X.iloc[test_index])
    mse += np.mean((fold_preds - train_and_val_y.iloc[test_index]) ** 2) / n_splits

mse

1.682371641401535

In [84]:
# Scratch check: squared-error mean for whichever rows `test_index` currently
# refers to, scored with the current `clf`, then divided by 3.
# NOTE(review): `clf` and `test_index` are not defined in this cell;
# presumably they are left over from the cross-validation loop, making this
# one fold's share of the average rather than an MSE -- confirm.
np.mean((get_preds(clf, train_and_val_X.iloc[test_index]) - \
                    train_and_val_y.iloc[test_index]) ** 2)/3

2.5965100545887183