In [1]:
import warnings
import itertools
import pandas
import math
import sys
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
dataframe = pandas.read_csv('./data/realTweets/Twitter_volume_AAPL.csv')
time = dataframe[list(dataframe)[0]]
values = dataframe[list(dataframe)[1]]
training_ratio = 0.8
validation_ratio = 0.5

# Deviding the data set to training, validation, testing parts
training_end = int(math.floor(len(values)*training_ratio))
training_set_values = np.array(values[0:training_end])
training_set_time = np.array(time[0:training_end])

validation_start = training_end + 1
validation_end = validation_start + int(math.floor(len(values)*(1-training_ratio)*validation_ratio))
validation_set_values = np.array(values[validation_start:validation_end])
validation_set_time = np.array(time[validation_start:validation_end])

testing_start = validation_end + 1
testing_end = len(values)
testing_set_values = np.array(values[testing_start:testing_end])
testing_set_time = np.array(time[testing_start:testing_end])

# print(train_end)
# print(validation_start)
# print(validation_end)
# print(testing_start)
# print(testing_end)


In [14]:
class AR:
    def __init__(self, p):
        self.p = p
    
    # Setters
    def set_p(self,p):
        self.p=p 
        return 0
    
    def set_training_data_time(self, time):
        self.training_data_time = time
    
    def set_validation_data_time(self, time):
        self.validation_data_time = time
        
    def set_testing_data_time(self, time):
        self.testing_data_time = time
    
    def set_validation_data_set(self,data):
        self.validation_data_set = data
        
    def set_testing_data_set(self,data):
        self.testing_data_set = data
    
    def set_training_data_set(self,data):
        self.training_data = data
        self.training_data_mean = np.mean(data)
        self.training_data_std = np.std(data, ddof=1)
        self.Z = data - self.training_data_mean
        self.Z.shape = (len(data),1)
        self.Z_mean = np.mean(self.Z)
        self.Z_std = np.std(self.Z, ddof=1)
        return 0
    
    # Model
    def shock(self):
#         return np.random.normal(self.Z_mean, self.Z_std, 1)
        return 1
    
    def calculate_normal_matrix_x_row(self,data,t):
        row = np.zeros((1,self.p+1))
        j = 0
        for i in range(t-self.p,t):
            if i < 0:
                row[0][j] = 0
            else:
                row[0][j] = data[i]
            j+=1
        row[0][-1] = self.shock()
        return row
    
    def calculate_weights(self):
        normal_matrix = np.zeros((len(self.training_data),self.p+1))
        
        for i in range(0,len(self.training_data)):
            normal_matrix[i] = self.calculate_normal_matrix_x_row(self.Z,i)
        
        normal_matrix_tanspose = normal_matrix.transpose()
        self.weights = np.dot(np.dot(np.linalg.pinv(np.dot(normal_matrix_tanspose,normal_matrix)),normal_matrix_tanspose),self.Z)
        return 0
        
    def get_prediction(self,data_set):
        self.prediction = np.zeros((len(data_set),1))
        Z = data_set - np.mean(data_set)
        Z.shape = (len(data_set),1)
        for i in range(0,len(data_set)):
            self.prediction[i] = np.dot(self.calculate_normal_matrix_x_row(Z, i), self.weights)
        
        self.prediction = self.prediction.transpose()[0] + np.mean(data_set)
        return self.prediction
    
    # Diagnostics and identification messures
    def mse(self,values,pridicted):
        error = 0.0
        for i in range(0,len(values)):
            error += (values[i] - pridicted[i])**2
        return error/len(values)
    
    def get_mse(self, data, prediction):
        return self.mse(data,prediction)
    
    def plot_autocorrelation(self, data_set, lag):
        autocorrelations = np.zeros(lag)
        autocorrelations_x = np.arange(lag)
        autocorrelations[0] = 1.0
        for i in range(1,lag):
            autocorrelations[i] = np.corrcoef(data_set[i:],data_set[:-i])[0,1]
        
        trace = {"x": autocorrelations_x,
                 "y": autocorrelations,
                 'type': 'bar',
                 "name": 'Autocorrelation',         
                }
        
        traces = [trace]
        layout = dict(title = "Autocorrelation",
                  xaxis = dict(title = 'Lag'),
                  yaxis = dict(title = 'Autocorrelation')
                 )
        fig = dict(data=traces, layout=layout)
        iplot(fig)
    
    def plot_partial_autocorrelation(self, data_set, lag):
        pac = np.zeros(lag)
        pac_x = np.arange(lag)
        
        residualts = data_set
        slope, intercept = np.polyfit(data_set,residualts,1)
        estimate = intercept + slope*data_set
        residualts = residualts - estimate
        pac[0] = 1
        for i in range(1,lag):
            pac[i] = np.corrcoef(data_set[:-i],residualts[i:])[0,1]
            
            slope, intercept = np.polyfit(data_set[:-i],residualts[i:],1)
            estimate = intercept + slope*data_set[:-i]
            
            residualts[i:] = residualts[i:] - estimate
        
        trace = {"x": pac_x,
                 "y": pac,
                 'type': 'bar',
                 "name": 'Partial Autocorrelation',         
                }

        traces = [trace]
        layout = dict(title = "Partial Autocorrelation",
                  xaxis = dict(title = 'Lag'),
                  yaxis = dict(title = 'Partial Autocorrelation')
                 )
        fig = dict(data=traces, layout=layout)
        iplot(fig)
    
    def plot_residuals(self, data_set, prediction):
        x = np.arange(len(data_set))
        residual = data_set - prediction
        mean = np.ones(len(data_set))*np.mean(residual)
        
        trace = {"x": x,
                 "y": residual,
                 "mode": 'markers',
                 "name": 'Residual'}

        trace_mean = {"x": x,
                     "y": mean,
                     "mode": 'lines',
                     "name": 'Mean'}
        traces = [trace,trace_mean]
        layout = dict(title = "Residual",
                      xaxis = dict(title = 'X'),
                      yaxis = dict(title = 'Residual')
                     )
        fig = dict(data=traces, layout=layout)
        iplot(fig)
        print("Standard Deviation of Residuals : " + str(np.std(residual, ddof=1)))
        print("Mean of Residuals : " + str(np.mean(residual)))
    
    def plot_data(self, data_set, time):
        mean = np.mean(data_set)
        means = np.ones(len(data_set))*mean
        trace_value = {"x": time,
                     "y": data_set,
                     "mode": 'lines',
                     "name": 'value'}

        trace_mean = {"x": time,
                         "y": means,
                         "mode": 'lines',
                         "name": 'mean'}
        traces = [trace_value,trace_mean]
        layout = dict(title = "Values with mean",
                      xaxis = dict(title = 'Time'),
                      yaxis = dict(title = 'Value')
                     )
        fig = dict(data=traces, layout=layout)
        iplot(fig)
        
        normalized_data = data_set - mean
        trace_value = {"x": time,
                     "y": normalized_data,
                     "mode": 'lines',
                     "name": 'value'}
        traces = [trace_value]
        layout = dict(title = "After removing mean",
                      xaxis = dict(title = 'Time'),
                      yaxis = dict(title = 'Value')
                     )
        fig = dict(data=traces, layout=layout)
        iplot(fig)
    
    def print_stats(self,data,prediction):
        print("Mean Square Error : " + str(self.mse(data,prediction)))
        print("Mean of real values : " + str(np.mean(data)))
        print("Standard Deviation of real values : " + str(np.std(data, ddof=1)))
        print("Mean of predicted values : " + str(np.mean(prediction)))
        print("Standard Deviation of predicted values : " + str(np.std(prediction, ddof=1)))
        print("Number of data points : " + str(len(data)))
    
    def plot_result(self, time, data, prediction):
        trace_real = {"x": time,
                     "y": data,
                     "mode": 'lines',
                     "name": 'Real value'}

        trace_predicted = {"x": time,
                         "y": prediction,
                         "mode": 'lines',
                         "name": 'Predicted value'}
        traces = [trace_real,trace_predicted]
        layout = dict(title = "Training Data Set with AR("+str(self.p)+")",
                      xaxis = dict(title = 'Time'),
                      yaxis = dict(title = 'Value')
                     )
        fig = dict(data=traces, layout=layout)
        iplot(fig)
        self.print_stats(data,prediction)
        self.plot_residuals(data,prediction)

In [9]:
ma_model = AR(1)
ma_model.plot_data(training_set_values, training_set_time)

In [10]:
ma_model.plot_autocorrelation(training_set_values, 30)

In [11]:
ma_model.plot_partial_autocorrelation(training_set_values, 30)

In [12]:
ar_model = AR(1)
ar_model.set_training_data_set(training_set_values)
ar_model.set_training_data_time(training_set_time)

ar_model.set_validation_data_set(validation_set_values)
ar_model.set_validation_data_time(validation_set_time)

epochs = 30
mse = np.zeros(epochs-1)
mse_x = np.arange(1, epochs)
for i in range(1, epochs):
    ar_model.set_p(i)
    ar_model.calculate_weights()
    prediction = ar_model.get_prediction(ar_model.validation_data_set)
#     ar_model.plot_result(ar_model.validation_data_time, ar_model.validation_data_set, prediction)
    mse[i-1] = ar_model.get_mse(ar_model.validation_data_set, prediction)
# plot MSE of validation set
    print(i, end=',')
trace_mse = {"x": mse_x,
             "y": mse,
             "mode": 'lines+markers',
             "name": 'MSE'}
traces = [trace_mse]
layout = dict(title = "Mean Square Error",
              yaxis = dict(title = 'MSE'),
              xaxis = dict(title = 'P-value(parameter value)')
             )
fig = dict(data=traces, layout=layout)
iplot(fig)
print("MSE is minimum at P = "+str(np.argmin(mse)+1))

1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,

MSE is minimum at P = 2


In [15]:
ar_model = AR(2)
ar_model.set_training_data_set(training_set_values)
ar_model.set_training_data_time(training_set_time)

ar_model.set_testing_data_set(testing_set_values)
ar_model.set_testing_data_time(testing_set_time)

ar_model.calculate_weights()
prediction = ar_model.get_prediction(ar_model.testing_data_set)

ar_model.plot_result(ar_model.testing_data_time, ar_model.testing_data_set, prediction)


Mean Square Error : 13271.479875147123
Mean of real values : 82.69540591567024
Standard Deviation of real values : 187.8012115864526
Mean of predicted values : 82.71536637037906
Standard Deviation of predicted values : 155.34966749128358
Number of data points : 1589


Standard Deviation of Residuals : 115.23817436753146
Mean of Residuals : -0.019960454708821945
