# Task 2
## Stock Prices Time Series Prediction
### Imports

In [19]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn import metrics

### Read data into memory

In [20]:
data = pd.read_csv('sp500.csv')
data = data.drop(['Unnamed: 0'], axis = 1)

### Correlate data

In [21]:
correlation_matrix = data.corr()

### Find Highest correlation
Get the single column with the highest correlation

In [22]:
sp_correlations = np.array(correlation_matrix[['SP500']])[1:]
correlation_index = np.argmax(sp_correlations) + 1

### Extract The Desired Dataset

In [23]:
desired_stock = data.iloc[:, correlation_index]

input_set_1d  = np.array(desired_stock)
output_set_1d = np.array(desired_stock.shift(-5).dropna())

def construct_time_series(best_stock_id,data):
    x = []
    y = []
    windowSize = 4
    for i in range(data.shape[0] - 2 * windowSize):
        x += [data[best_stock_id][i:i + windowSize+1]]
        y += [data[best_stock_id][i + windowSize + 1:i + 2 *windowSize + 1]]

    return np.array(x), np.array(y)

input_set, output_set = construct_time_series(data.columns[correlation_index],data)

train_ratio = 0.8
test_ratio  = 0.2
valid_ratio = 0.1 # Will be used in the MLPClassifier to take 10% of the training data as validation

training_input, testing_input, training_target, testing_target = train_test_split(input_set,output_set,
                                                                                  test_size=test_ratio,
                                                                                  random_state=1)

### Create the Linear regressor
Two multilayer perceptrons. One using a stochastic gradient descent optimizer, and the other using an adam optimizer.

Other than the optimizer, all other parameters will remain consistent for both models.

#### Scoring Metric Used:
*The mean suared error will be used as a scoring metric. The lower the error, the better the optimizer!*

#### Regularization Technique Used:
*L2 regularization was used to avoid overfitting*

In [24]:
hidden_layer        = (512, 512, 512) # Hidden layers and nodes, per layer
regularization_rate = 0.00001         # Rate for L2 regularization
learning_rate_start = 0.01            # Initial learning rate
learning_rate_mode  = 'adaptive'      # Mode for changing learning rate
batch_size          = 128             # Size of batch for one update
tolerance           = 1e-6            # Tolerance level for regressor
max_iteration       = 1000            # Maximum number of iterations before stopping
no_change_tolerance = 10              # Stop if no significant change happens in this number of iterations

# n_iter_no_change=10

model_adm = MLPRegressor(hidden_layer_sizes=hidden_layer, solver='adam', alpha=regularization_rate,
                          batch_size=batch_size, learning_rate=learning_rate_mode,
                          learning_rate_init=learning_rate_start, max_iter=max_iteration, tol=tolerance,
                          verbose=True, early_stopping=True, validation_fraction=valid_ratio,
                          n_iter_no_change=no_change_tolerance)
model_sgd = MLPRegressor(hidden_layer_sizes=hidden_layer, solver='sgd', alpha=regularization_rate,
                          batch_size=batch_size, learning_rate=learning_rate_mode,
                          learning_rate_init=learning_rate_start, max_iter=max_iteration, tol=tolerance,
                          verbose=True, early_stopping=True, validation_fraction=valid_ratio,
                          n_iter_no_change=no_change_tolerance)

print(training_input.shape)
print(training_target.shape)

(1001, 5)
(1001, 4)


### Give The Models The Training Data And Fit Them!

#### First, The SGD Optimizer

In [25]:
model_adm.fit(training_input, training_target)

Iteration 1, loss = 2758.37354507
Validation score: -2.688277
Iteration 2, loss = 173.52461618
Validation score: 0.669108
Iteration 3, loss = 27.48855758
Validation score: 0.982521
Iteration 4, loss = 8.16405745
Validation score: 0.957312
Iteration 5, loss = 2.38889970
Validation score: 0.991974
Iteration 6, loss = 0.86143929
Validation score: 0.990242
Iteration 7, loss = 1.11978942
Validation score: 0.989185
Iteration 8, loss = 1.75836021
Validation score: 0.987719
Iteration 9, loss = 3.17086889
Validation score: 0.961061
Iteration 10, loss = 2.29786606
Validation score: 0.969295
Iteration 11, loss = 1.85459794
Validation score: 0.987515
Iteration 12, loss = 1.82513335
Validation score: 0.916205
Iteration 13, loss = 6.22683849
Validation score: 0.932712
Iteration 14, loss = 3.60527545
Validation score: 0.984215
Iteration 15, loss = 1.93428558
Validation score: 0.981742
Iteration 16, loss = 1.71935371
Validation score: 0.991502
Validation score did not improve more than tol=0.000001 fo

MLPRegressor(activation='relu', alpha=1e-05, batch_size=128, beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(512, 512, 512), learning_rate='adaptive',
       learning_rate_init=0.01, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=1e-06,
       validation_fraction=0.1, verbose=True, warm_start=False)

In [26]:
predictions_adm = model_adm.predict(testing_input)
mse_adm = metrics.mean_squared_error(predictions_adm,testing_target)
print("Adam's Mean-Squared-Error:", mse_adm)

Adam's Mean-Squared-Error: 1.3787097563770685
