In [63]:
import pandas as pd
import arima
from sklearn.linear_model import LinearRegression

In [64]:
df = pd.read_csv("data/pars_normal_daily.csv", index_col=0)
df.head()


Unnamed: 0,mu,sigma
2021-12-01,-6e-05,0.00053
2021-12-02,3.9e-05,0.000643
2021-12-03,-2.2e-05,0.000703
2021-12-06,3e-05,0.000565
2021-12-07,5.3e-05,0.000659


In [65]:
# Check for stationarity
arima.adf_check(df["mu"])
arima.adf_check(df["sigma"])

Augmented Dickey-Fuller Test:
ADF Test Statistic : -6.380010424707947
p-value : 2.2343714059272166e-08
Number of Lags Used : 1
Number of Observations Used : 67
strong evidence against the null hypothesis, reject the null hypothesis. Data has no unit root and is stationary
Augmented Dickey-Fuller Test:
ADF Test Statistic : -4.020289290202549
p-value : 0.0013079185626142368
Number of Lags Used : 0
Number of Observations Used : 68
strong evidence against the null hypothesis, reject the null hypothesis. Data has no unit root and is stationary


In [66]:
idx = int(len(df)*0.8)
train, test = df.iloc[:idx], df.iloc[idx:]
train.head(), test.head()

(                  mu     sigma
 2021-12-01 -0.000060  0.000530
 2021-12-02  0.000039  0.000643
 2021-12-03 -0.000022  0.000703
 2021-12-06  0.000030  0.000565
 2021-12-07  0.000053  0.000659,
                   mu     sigma
 2022-02-18 -0.000016  0.000581
 2022-02-22 -0.000027  0.000746
 2022-02-23 -0.000046  0.000690
 2022-02-24  0.000038  0.001633
 2022-02-25  0.000056  0.000609)

In [72]:
def AR(p, df, key):
    '''
    Generating the lagged p terms
    '''
    df_temp = df

    for i in range(1,p+1):
        df_temp['Shifted_values_%d' % i ] = df_temp[key].shift(i)

    train_size = (int)(0.8 * df_temp.shape[0])

    #Breaking data set into test and training
    df_train = pd.DataFrame(df_temp[0:train_size])
    df_val = pd.DataFrame(df_temp[train_size:df_temp.shape[0]])
    
    print("df_train")
    print(df_train)
#     print("df_val")
#     print(df_val)

    df_train_2 = df_train.dropna()
#     print("df_train_2")
#     print(df_train_2)

    print(df_train_2)
    print("Reshaped")
#     print(df_train_2.iloc[:,1:].values.reshape(-1,p))
    #X contains the lagged values ,hence we skip the first column
    try:
        print("Trying...")
        X_train = df_train_2.iloc[:,1:].values.reshape(-1,p)
    except:
        print("Failed.")
        return [pd.DataFrame(), pd.DataFrame(), 0, 0, float("inf")]
    #Y contains the value,it is the first column
    y_train = df_train_2.iloc[:,1].values.reshape(-1,1)
    
    print("X_train")
    print(X_train)
    print("y_train")
    print(y_train)

    #Running linear regression to generate the coefficents of lagged terms
    lr = LinearRegression()
    lr.fit(X_train,y_train)

    theta  = lr.coef_.T
    intercept = lr.intercept_
    df_train_2['Predicted_Values'] = X_train.dot(lr.coef_.T) + lr.intercept_
    # df_train_2[['Value','Predicted_Values']].plot()

    X_val = df_val.iloc[:,1:].values.reshape(-1,p)
    df_val['Predicted_Values'] = X_val.dot(lr.coef_.T) + lr.intercept_
    # df_test[['Value','Predicted_Values']].plot()

    RMSE = np.sqrt(mean_squared_error(df_val[key], df_val['Predicted_Values']))

    print("The RMSE is :", RMSE,", Value of p : ",p)
    return [df_train_2,df_val,theta,intercept,RMSE]

In [74]:
df['mu']

2021-12-01   -0.000060
2021-12-02    0.000039
2021-12-03   -0.000022
2021-12-06    0.000030
2021-12-07    0.000053
                ...   
2022-03-04   -0.000021
2022-03-07   -0.000076
2022-03-08   -0.000020
2022-03-09    0.000068
2022-03-10   -0.000011
Name: mu, Length: 69, dtype: float64

In [75]:
df_train_2, df_test, theta, intercept, RMSE = AR(10, df['mu'], 'mu')

KeyError: 'mu'

In [56]:
df

Unnamed: 0,mu,sigma,Shifted_values_1,Shifted_values_2,Shifted_values_3,Shifted_values_4,Shifted_values_5,Shifted_values_6,Shifted_values_7,Shifted_values_8,...,Shifted_values_486,Shifted_values_487,Shifted_values_488,Shifted_values_489,Shifted_values_490,Shifted_values_491,Shifted_values_492,Shifted_values_493,Shifted_values_494,Shifted_values_495
2021-12-01,-0.000060,0.000530,,,,,,,,,...,,,,,,,,,,
2021-12-02,0.000039,0.000643,-0.000060,,,,,,,,...,,,,,,,,,,
2021-12-03,-0.000022,0.000703,0.000039,-0.000060,,,,,,,...,,,,,,,,,,
2021-12-06,0.000030,0.000565,-0.000022,0.000039,-0.000060,,,,,,...,,,,,,,,,,
2021-12-07,0.000053,0.000659,0.000030,-0.000022,0.000039,-0.000060,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-03-04,-0.000021,0.000730,-0.000013,0.000046,-0.000039,-0.000007,0.000056,0.000038,-0.000046,-0.000027,...,,,,,,,,,,
2022-03-07,-0.000076,0.000604,-0.000021,-0.000013,0.000046,-0.000039,-0.000007,0.000056,0.000038,-0.000046,...,,,,,,,,,,
2022-03-08,-0.000020,0.000994,-0.000076,-0.000021,-0.000013,0.000046,-0.000039,-0.000007,0.000056,0.000038,...,,,,,,,,,,
2022-03-09,0.000068,0.001189,-0.000020,-0.000076,-0.000021,-0.000013,0.000046,-0.000039,-0.000007,0.000056,...,,,,,,,,,,


In [10]:
df_c = pd.concat([df_train_2, df_test])
res = pd.DataFrame()
res['Residuals'] = df_c.mu - df_c.Predicted_Values

AttributeError: 'DataFrame' object has no attribute 'mu'