# Problem Statement

- See 2.4-main.ipynb
- Using pandas

---

# Questions + Futher Explore
1. [LEW dataset](https://www.itl.nist.gov/div898/handbook/eda/section3/eda35c.htm)
2. [ ] Need to better understand the pd [`shift` attribute](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.shift.html) and how shape df while using [-lags or +lags](https://github.com/Brinkley97/book-intro_to_tsf_with_python/blob/main/5-basic_feature_engineering.ipynb)


# TODOs

1. [x] Imports + Load Data
    1. [x] Properly format data

2. [x] (2.1.12) **Estimate the ACov @ lag k :** $c_k = \hat{\gamma}_{k} $ (gamma hat) = $ {\dfrac{1}{N}} $ * $ \sum_{t=1}^{N - k} (z_{t} - \bar{z})(z_{t+k} - \bar{z}) \space$ k = 0, 1, 2,..., K

3. [x] (2.1.11) **Estimate the ACor @ lag k :** $ r_k = \hat{\rho}{_k} $ (rho hat) = $ {\dfrac{c_k}{c_0}} $

- [ ] Graph $ r_k $

## 1. Imports + Load Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
lew_data = pd.read_csv("lewDataset.csv")
# lew_data

In [3]:
N = len(lew_data)
print(N)

sample_mean = lew_data.iloc[0:, 0].mean()
print(sample_mean)

200
-177.435


### 1.A. Properly format data
- Use the pd `shift` attribute on the df to get the lag (also [sliding window](https://github.com/Brinkley97/book-intro_to_tsf_with_python/blob/main/5-basic_feature_engineering.ipynb)) values
- Note that `-1 * k` differs from `1 * k` WRT the ordering of lags values, see sliding window link for more info

In [4]:
def create_lag_values(df, N):
    """
    Parameters:
    df -- pd df (dataset)
    N -- int (length of df)
    
    Return:
    lag_values -- list (lags)
    lag_col_names -- list (column names with lag, respectively) 
    """
    
    lag_values = []
    lag_col_names = []
    
    for k in range(1, N):
       
        lag = -1 * k
        lag_values.append(df.shift(lag))
        new_col_name = "zt @ lag " + str(k)
        lag_col_names.append(new_col_name)
        
    return lag_values, lag_col_names

In [5]:
test_data = pd.DataFrame([100, 200, 300, 400, 500])
test_N = len(test_data)
test_sample_mean = test_data.mean()
# test_lag_ks, lag_cols = create_lag_values(test_data, test_N)
# test_lag_ks_df = pd.concat(test_lag_ks, axis=1)
# test_lag_ks_df

lag_ks, lag_cols = create_lag_values(lew_data, N)
lag_ks_df = pd.concat(lag_ks, axis=1)
lag_ks_df.columns=[lag_cols]
lag_ks_df

Unnamed: 0,zt @ lag 1,zt @ lag 2,zt @ lag 3,zt @ lag 4,zt @ lag 5,zt @ lag 6,zt @ lag 7,zt @ lag 8,zt @ lag 9,zt @ lag 10,...,zt @ lag 190,zt @ lag 191,zt @ lag 192,zt @ lag 193,zt @ lag 194,zt @ lag 195,zt @ lag 196,zt @ lag 197,zt @ lag 198,zt @ lag 199
0,-564.0,-35.0,-15.0,141.0,115.0,-420.0,-360.0,203.0,-338.0,-431.0,...,72.0,-550.0,-190.0,172.0,-424.0,-385.0,198.0,-218.0,-536.0,96.0
1,-35.0,-15.0,141.0,115.0,-420.0,-360.0,203.0,-338.0,-431.0,194.0,...,-550.0,-190.0,172.0,-424.0,-385.0,198.0,-218.0,-536.0,96.0,
2,-15.0,141.0,115.0,-420.0,-360.0,203.0,-338.0,-431.0,194.0,-220.0,...,-190.0,172.0,-424.0,-385.0,198.0,-218.0,-536.0,96.0,,
3,141.0,115.0,-420.0,-360.0,203.0,-338.0,-431.0,194.0,-220.0,-513.0,...,172.0,-424.0,-385.0,198.0,-218.0,-536.0,96.0,,,
4,115.0,-420.0,-360.0,203.0,-338.0,-431.0,194.0,-220.0,-513.0,154.0,...,-424.0,-385.0,198.0,-218.0,-536.0,96.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,198.0,-218.0,-536.0,96.0,,,,,,,...,,,,,,,,,,
196,-218.0,-536.0,96.0,,,,,,,,...,,,,,,,,,,
197,-536.0,96.0,,,,,,,,,...,,,,,,,,,,
198,96.0,,,,,,,,,,...,,,,,,,,,,


In [6]:
lag_zero = "zt @ lag 0"
rename_measurements_series = lew_data["measurements"].rename(lag_zero)
df = pd.concat([rename_measurements_series, lag_ks_df], axis=1)
df

Unnamed: 0,zt @ lag 0,"(zt @ lag 1,)","(zt @ lag 2,)","(zt @ lag 3,)","(zt @ lag 4,)","(zt @ lag 5,)","(zt @ lag 6,)","(zt @ lag 7,)","(zt @ lag 8,)","(zt @ lag 9,)",...,"(zt @ lag 190,)","(zt @ lag 191,)","(zt @ lag 192,)","(zt @ lag 193,)","(zt @ lag 194,)","(zt @ lag 195,)","(zt @ lag 196,)","(zt @ lag 197,)","(zt @ lag 198,)","(zt @ lag 199,)"
0,-213,-564.0,-35.0,-15.0,141.0,115.0,-420.0,-360.0,203.0,-338.0,...,72.0,-550.0,-190.0,172.0,-424.0,-385.0,198.0,-218.0,-536.0,96.0
1,-564,-35.0,-15.0,141.0,115.0,-420.0,-360.0,203.0,-338.0,-431.0,...,-550.0,-190.0,172.0,-424.0,-385.0,198.0,-218.0,-536.0,96.0,
2,-35,-15.0,141.0,115.0,-420.0,-360.0,203.0,-338.0,-431.0,194.0,...,-190.0,172.0,-424.0,-385.0,198.0,-218.0,-536.0,96.0,,
3,-15,141.0,115.0,-420.0,-360.0,203.0,-338.0,-431.0,194.0,-220.0,...,172.0,-424.0,-385.0,198.0,-218.0,-536.0,96.0,,,
4,141,115.0,-420.0,-360.0,203.0,-338.0,-431.0,194.0,-220.0,-513.0,...,-424.0,-385.0,198.0,-218.0,-536.0,96.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,-385,198.0,-218.0,-536.0,96.0,,,,,,...,,,,,,,,,,
196,198,-218.0,-536.0,96.0,,,,,,,...,,,,,,,,,,
197,-218,-536.0,96.0,,,,,,,,...,,,,,,,,,,
198,-536,96.0,,,,,,,,,...,,,,,,,,,,


## 2. Estimate the ACov @ lag k


\begin{align}
\hat{\gamma}_{k} = \frac{1}{N} \times \sum_{t=1}^{N - k} (z_{t} - \bar{z})(z_{t+k} - \bar{z}),
\space where 
\end{align}

\begin{align}
k = 0, 1, 2,..., K
\end{align}

\begin{align}
\bar{z} = \sum_{t=1}^{N} \frac{z_t}{N}
\end{align}



- P 31 in book has to obtain a usefule estimate of the ACor func, need 
    - at least 50 observations so $ N \le 50 $
    - the estimated ACor ($ r_k $) would be calculated for $ k = 0, 1, ..., K $, where $ K \le N/4 $ (maybe use df.head(50) or df.head(N/4) which will give a subset of the data)

In [7]:
# find the difference between each observation and the sample observation (mean)
zts_minus_zs_df = df - sample_mean
zts_minus_zs_df

Unnamed: 0,zt @ lag 0,"(zt @ lag 1,)","(zt @ lag 2,)","(zt @ lag 3,)","(zt @ lag 4,)","(zt @ lag 5,)","(zt @ lag 6,)","(zt @ lag 7,)","(zt @ lag 8,)","(zt @ lag 9,)",...,"(zt @ lag 190,)","(zt @ lag 191,)","(zt @ lag 192,)","(zt @ lag 193,)","(zt @ lag 194,)","(zt @ lag 195,)","(zt @ lag 196,)","(zt @ lag 197,)","(zt @ lag 198,)","(zt @ lag 199,)"
0,-35.565,-386.565,142.435,162.435,318.435,292.435,-242.565,-182.565,380.435,-160.565,...,249.435,-372.565,-12.565,349.435,-246.565,-207.565,375.435,-40.565,-358.565,273.435
1,-386.565,142.435,162.435,318.435,292.435,-242.565,-182.565,380.435,-160.565,-253.565,...,-372.565,-12.565,349.435,-246.565,-207.565,375.435,-40.565,-358.565,273.435,
2,142.435,162.435,318.435,292.435,-242.565,-182.565,380.435,-160.565,-253.565,371.435,...,-12.565,349.435,-246.565,-207.565,375.435,-40.565,-358.565,273.435,,
3,162.435,318.435,292.435,-242.565,-182.565,380.435,-160.565,-253.565,371.435,-42.565,...,349.435,-246.565,-207.565,375.435,-40.565,-358.565,273.435,,,
4,318.435,292.435,-242.565,-182.565,380.435,-160.565,-253.565,371.435,-42.565,-335.565,...,-246.565,-207.565,375.435,-40.565,-358.565,273.435,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,-207.565,375.435,-40.565,-358.565,273.435,,,,,,...,,,,,,,,,,
196,375.435,-40.565,-358.565,273.435,,,,,,,...,,,,,,,,,,
197,-40.565,-358.565,273.435,,,,,,,,...,,,,,,,,,,
198,-358.565,273.435,,,,,,,,,...,,,,,,,,,,


In [8]:
zts_series = zts_minus_zs_df.iloc[0:, 0]
# print(type(zts_series), zts_series)

ztks_df = zts_minus_zs_df.iloc[0:, 0:200]
# print(type(ztks_df), ztks_df)

cks_series = ztks_df.mul(zts_series, axis=0).sum() / N
print(cks_series)

zt @ lag 0         76528.565775
(zt @ lag 1,)     -23517.595646
(zt @ lag 2,)     -56657.944042
(zt @ lag 3,)      59285.855337
(zt @ lag 4,)      15700.251416
                       ...     
(zt @ lag 195,)     -573.490444
(zt @ lag 196,)      -21.640641
(zt @ lag 197,)      894.990438
(zt @ lag 198,)     -464.740183
(zt @ lag 199,)      -48.623579
Length: 200, dtype: float64


## 3. Estimate the ACor @ lag k

In [9]:
c_not = cks_series.iloc[0:, ][0]
c_not

76528.56577500004

In [10]:
rho_hat_k = (cks_series / c_not).round(2)
print(rho_hat_k)

zt @ lag 0         1.00
(zt @ lag 1,)     -0.31
(zt @ lag 2,)     -0.74
(zt @ lag 3,)      0.77
(zt @ lag 4,)      0.21
                   ... 
(zt @ lag 195,)   -0.01
(zt @ lag 196,)   -0.00
(zt @ lag 197,)    0.01
(zt @ lag 198,)   -0.01
(zt @ lag 199,)   -0.00
Length: 200, dtype: float64


In [11]:
rho_hat_k.head(50)

zt @ lag 0        1.00
(zt @ lag 1,)    -0.31
(zt @ lag 2,)    -0.74
(zt @ lag 3,)     0.77
(zt @ lag 4,)     0.21
(zt @ lag 5,)    -0.90
(zt @ lag 6,)     0.38
(zt @ lag 7,)     0.63
(zt @ lag 8,)    -0.77
(zt @ lag 9,)    -0.12
(zt @ lag 10,)    0.82
(zt @ lag 11,)   -0.40
(zt @ lag 12,)   -0.55
(zt @ lag 13,)    0.73
(zt @ lag 14,)    0.07
(zt @ lag 15,)   -0.76
(zt @ lag 16,)    0.40
(zt @ lag 17,)    0.48
(zt @ lag 18,)   -0.70
(zt @ lag 19,)   -0.03
(zt @ lag 20,)    0.70
(zt @ lag 21,)   -0.41
(zt @ lag 22,)   -0.43
(zt @ lag 23,)    0.67
(zt @ lag 24,)   -0.00
(zt @ lag 25,)   -0.66
(zt @ lag 26,)    0.42
(zt @ lag 27,)    0.39
(zt @ lag 28,)   -0.65
(zt @ lag 29,)    0.03
(zt @ lag 30,)    0.63
(zt @ lag 31,)   -0.42
(zt @ lag 32,)   -0.36
(zt @ lag 33,)    0.64
(zt @ lag 34,)   -0.05
(zt @ lag 35,)   -0.60
(zt @ lag 36,)    0.43
(zt @ lag 37,)    0.32
(zt @ lag 38,)   -0.64
(zt @ lag 39,)    0.08
(zt @ lag 40,)    0.58
(zt @ lag 41,)   -0.45
(zt @ lag 42,)   -0.28
(zt @ lag 4

## 4. Graph $ r_k $