In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tcs-stock-price/TCS.NS.csv


<div style="
    background: #f7f9fc;
    border: 1px solid #d6e0f5;
    border-radius: 8px;
    padding: 16px 18px;
    margin: 12px 0;
    box-shadow: 0 1px 3px rgba(15, 23, 42, 0.06);
">

<h3 style="margin-top: 0;">Autocorrelation Function (ACF) ‚Äì Intuition & Formula</h3>

<b>Intuition</b><br>
<ul>
  <li>The ACF measures how strongly the current value \(y_t\) is related to a past value \(y_{t-k}\) at lag \(k\).</li>
  <li>Large \(\rho_k\) (positive or negative) means past values carry information about the present value.</li>
</ul>

<b>Formula</b><br>

\[
\rho_k =
\frac{\sum_{t=k+1}^{T} (y_t-\bar{y})(y_{t-k}-\bar{y})}
     {\sum_{t=1}^{T} (y_t-\bar{y})^2}
\]

<ul>
  <li>\(y_t\): value at time \(t\)</li>
  <li>\(\bar{y}\): mean of the series</li>
  <li>\(T\): total number of observations</li>
  <li>\(\rho_k\): autocorrelation at lag \(k\)</li>
</ul>

</div>


In [2]:
df=pd.read_csv('/kaggle/input/tcs-stock-price/TCS.NS.csv')
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2002-08-12,38.724998,40.0,38.724998,39.700001,29.51914,212976.0
1,2002-08-13,39.75,40.387501,38.875,39.162498,29.119476,153576.0
2,2002-08-14,39.25,39.25,35.724998,36.462502,27.111877,822776.0
3,2002-08-15,36.462502,36.462502,36.462502,36.462502,27.111877,0.0
4,2002-08-16,36.275002,38.0,35.75,36.375,27.046812,811856.0


In [3]:
df1=df.loc[:,['Date','Close']]

In [4]:
df1

Unnamed: 0,Date,Close
0,2002-08-12,39.700001
1,2002-08-13,39.162498
2,2002-08-14,36.462502
3,2002-08-15,36.462502
4,2002-08-16,36.375000
...,...,...
4758,2021-09-23,3869.250000
4759,2021-09-24,3871.300049
4760,2021-09-27,3836.949951
4761,2021-09-28,3779.149902


<div style="
    background: #f7f9fc;
    border: 1px solid #d6e0f5;
    border-radius: 8px;
    padding: 16px 18px;
    margin: 12px 0;
    box-shadow: 0 1px 3px rgba(15, 23, 42, 0.06);
">

<h3 style="margin-top: 0;">Partial Autocorrelation Function (PACF)</h3>

<b>Intuition</b><br>
<ul>
  <li>Like ACF, PACF measures correlation between \(y_t\) and \(y_{t-k}\) at lag \(k\).</li>
  <li>However, PACF shows the <i>direct</i> relationship, removing the effect of intermediate lags (e.g., lags \(1,\dots,k-1\)).</li>
</ul>

<b>Notation</b><br>
\[
\text{PACF}(y_t, k)
\]

<b>Conceptual formula</b><br>
- PACF at lag \(k\) is the correlation between the residuals of:
  - \(y_t\) regressed on \(y_{t-1},\dots,y_{t-k+1}\), and  
  - \(y_{t-k}\) regressed on \(y_{t-1},\dots,y_{t-k+1}\).

<p style="margin-top:8px;">
In practice, statistical packages compute PACF by fitting these regressions and taking the correlation of residuals at each lag.
</p>

</div>


<div style="
    background:#f7f9fc;
    border:1px solid #d6e0f5;
    border-radius:8px;
    padding:16px 18px;
    margin:12px 0;
    box-shadow:0 1px 3px rgba(15,23,42,0.06);
">

<h3 style="margin-top:0;">Steps of PACF Calculation</h3>

<b>Step&nbsp;1:</b> Centre the data (subtract the mean from each \(y_t\)).  
<b>Step&nbsp;2:</b> Regress \(y_t\) and \(y_{t-2}\) separately on \(y_{t-1}\) and obtain fitted lines.  
<b>Step&nbsp;3:</b> Compute residuals \(e_t\) and \(e_{t-2}\) from these regressions.  
<b>Step&nbsp;4:</b> Calculate the correlation between \(e_t\) and \(e_{t-2}\); this is PACF at lag 2.  
<b>Step&nbsp;5:</b> Repeat for higher lags and plot the PACF correlogram.

</div>


In [5]:
df1.dtypes

Date      object
Close    float64
dtype: object

In [6]:
df1['Date']=pd.to_datetime(df1['Date'])

In [7]:
df1

Unnamed: 0,Date,Close
0,2002-08-12,39.700001
1,2002-08-13,39.162498
2,2002-08-14,36.462502
3,2002-08-15,36.462502
4,2002-08-16,36.375000
...,...,...
4758,2021-09-23,3869.250000
4759,2021-09-24,3871.300049
4760,2021-09-27,3836.949951
4761,2021-09-28,3779.149902


In [8]:
mask = (df1['Date'] >= '2021-01-22') & (df1['Date'] <= '2021-09-23')
filtered_df = df1.loc[mask].copy()

In [9]:
filtered_df

Unnamed: 0,Date,Close
4594,2021-01-22,3303.100098
4595,2021-01-25,3291.300049
4596,2021-01-27,3261.050049
4597,2021-01-28,3196.550049
4598,2021-01-29,3111.350098
...,...,...
4754,2021-09-17,3827.850098
4755,2021-09-20,3823.500000
4756,2021-09-21,3862.949951
4757,2021-09-22,3862.149902


In [10]:
filtered_df['Close_t']   = filtered_df['Close']
filtered_df['Close_t-1'] = filtered_df['Close'].shift(1)
filtered_df['Close_t-2'] = filtered_df['Close'].shift(2)

In [11]:
filtered_df.tail()
filtered_df=filtered_df[2:]
filtered_df.head()

Unnamed: 0,Date,Close,Close_t,Close_t-1,Close_t-2
4596,2021-01-27,3261.050049,3261.050049,3291.300049,3303.100098
4597,2021-01-28,3196.550049,3196.550049,3261.050049,3291.300049
4598,2021-01-29,3111.350098,3111.350098,3196.550049,3261.050049
4599,2021-02-01,3139.350098,3139.350098,3111.350098,3196.550049
4600,2021-02-02,3203.449951,3203.449951,3139.350098,3111.350098


<div style="background-color:#f7fbff; padding:20px; border-radius:12px; border-left:6px solid #2a9df4">

## üîπ Manual PACF Computation (Step-by-Step)

---

### **Step 1: Center the Data**
All series are centered by subtracting the mean of \( y_t \):

\[
\tilde{y}_t = y_t - \bar{y}
\]

This removes the intercept and simplifies subsequent regression steps.

---

### **Step 2: Calculate Slopes**
Regress \( y_t \) and \( y_{t-2} \) on \( y_{t-1} \):

\[
y_t = 0.98\,y_{t-1} + e_t
\]

\[
y_{t-2} = 0.93\,y_{t-1} + e_{t-2}
\]

The coefficients are obtained using ordinary least squares (OLS).

---

### **Step 3: Compute Residuals**
Remove the linear dependence on \( y_{t-1} \):

\[
e_t = y_t - 0.98\,y_{t-1}
\]

\[
e_{t-2} = y_{t-2} - 0.93\,y_{t-1}
\]

These residuals represent the unexplained components.

---

### **Step 4: Correlate Residuals**
The partial autocorrelation at lag 2 is the correlation between the residuals:

\[
\text{PACF}(y_t, 2) = \text{Corr}(e_t, e_{t-2})
\]

---

### **Step 5: Final Result**
\[
\boxed{\text{PACF}(y_t, 2) = -0.104}
\]

---

### üìå Interpretation
- The partial autocorrelation at lag 2 is **small and negative**
- Indicates **weak direct dependence** between \( y_t \) and \( y_{t-2} \)
- Confirms that most of the temporal dependence is captured by **lag 1**

---

### üß† Key Takeaway
Manual PACF computation provides insight into:
- AR order selection
- Model parsimony
- Dependence structure in time series

This approach avoids black-box estimation and strengthens intuition.

</div>


In [12]:
mean_close_t = filtered_df['Close_t'].mean()
mean_close_t

np.float64(3267.5552207361957)

In [13]:
filtered_df['Centered_y_t']   = filtered_df['Close_t']   - mean_close_t
filtered_df['Centered_y_t-1'] = filtered_df['Close_t-1'] - mean_close_t
filtered_df['Centered_y_t-2'] = filtered_df['Close_t-2'] - mean_close_t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Centered_y_t']   = filtered_df['Close_t']   - mean_close_t
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Centered_y_t-1'] = filtered_df['Close_t-1'] - mean_close_t
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Centered_y_t-2'] = filtered_df['Close_t-2'] - me

In [14]:
filtered_df.head()

Unnamed: 0,Date,Close,Close_t,Close_t-1,Close_t-2,Centered_y_t,Centered_y_t-1,Centered_y_t-2
4596,2021-01-27,3261.050049,3261.050049,3291.300049,3303.100098,-6.505172,23.744828,35.544877
4597,2021-01-28,3196.550049,3196.550049,3261.050049,3291.300049,-71.005172,-6.505172,23.744828
4598,2021-01-29,3111.350098,3111.350098,3196.550049,3261.050049,-156.205123,-71.005172,-6.505172
4599,2021-02-01,3139.350098,3139.350098,3111.350098,3196.550049,-128.205123,-156.205123,-71.005172
4600,2021-02-02,3203.449951,3203.449951,3139.350098,3111.350098,-64.10527,-128.205123,-156.205123


In [15]:
beta_1 = np.cov(filtered_df['Centered_y_t'], 
                filtered_df['Centered_y_t-1'])[0,1] / np.var(filtered_df['Centered_y_t-1'])

beta_2 = np.cov(filtered_df['Centered_y_t-2'], 
                filtered_df['Centered_y_t-1'])[0,1] / np.var(filtered_df['Centered_y_t-1'])

In [16]:
import numpy as np

x = filtered_df['Centered_y_t-1']
y = filtered_df['Centered_y_t']

slope_yt_on_yt1 = np.polyfit(x, y, 1)[0]

In [17]:
x = filtered_df['Centered_y_t-1']
y = filtered_df['Centered_y_t-2']

slope_yt2_on_yt1 = np.polyfit(x, y, 1)[0]

In [18]:
print(slope_yt_on_yt1)
print(slope_yt2_on_yt1)

1.0038546721962551
0.964807886303593


In [19]:
filtered_df['e_t'] = (
    filtered_df['Centered_y_t']
    - beta_1 * filtered_df['Centered_y_t-1']
)

filtered_df['e_t-2'] = (
    filtered_df['Centered_y_t-2']
    - beta_2 * filtered_df['Centered_y_t-1']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['e_t'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['e_t-2'] = (


In [20]:
filtered_df

Unnamed: 0,Date,Close,Close_t,Close_t-1,Close_t-2,Centered_y_t,Centered_y_t-1,Centered_y_t-2,e_t,e_t-2
4596,2021-01-27,3261.050049,3261.050049,3291.300049,3303.100098,-6.505172,23.744828,35.544877,-30.488667,12.494265
4597,2021-01-28,3196.550049,3196.550049,3261.050049,3291.300049,-71.005172,-6.505172,23.744828,-64.434615,30.059811
4598,2021-01-29,3111.350098,3111.350098,3196.550049,3261.050049,-156.205123,-71.005172,-6.505172,-84.486256,62.424057
4599,2021-02-01,3139.350098,3139.350098,3111.350098,3196.550049,-128.205123,-156.205123,-71.005172,29.570065,80.633058
4600,2021-02-02,3203.449951,3203.449951,3139.350098,3111.350098,-64.105270,-128.205123,-156.205123,65.388482,-31.748270
...,...,...,...,...,...,...,...,...,...,...
4754,2021-09-17,3827.850098,3827.850098,3903.300049,3954.550049,560.294877,635.744828,686.994828,-81.840017,69.836960
4755,2021-09-20,3823.500000,3823.500000,3827.850098,3903.300049,555.944779,560.294877,635.744828,-9.981793,91.831017
4756,2021-09-21,3862.949951,3862.949951,3823.500000,3827.850098,595.394730,555.944779,560.294877,33.861980,20.603983
4757,2021-09-22,3862.149902,3862.149902,3862.949951,3823.500000,594.594681,595.394730,555.944779,-6.784544,-22.042687


In [21]:
pacf_2 = np.corrcoef(
    filtered_df['e_t'].dropna(),
    filtered_df['e_t-2'].dropna()
)[0, 1]


<div style="background-color:#f6f8fa; padding:18px; border-radius:10px; border-left:6px solid #4c72ff">

## üìå Interpreting the Partial Autocorrelation Result

### ‚úÖ Computed Value
\[
\text{PACF}(2) = 0.03936931496406402
\]

---

### üîç What does this number represent?

This value is the **partial autocorrelation at lag 2**, denoted as:

\[
\boxed{\text{PACF}(2)}
\]

It measures the **direct relationship between \(y_t\) and \(y_{t-2}\)** **after removing** the linear effect of \(y_{t-1}\).

> **Interpretation:**  
> Once yesterday‚Äôs price effect is removed, **today‚Äôs value and the value two days ago are almost uncorrelated**.

---

### üìâ Why is this considered a *small* value?

- \(|\text{PACF}(2)| \approx 0.04\)
- This is **very close to zero**
- It implies that **lag-2 contributes almost no additional information** beyond lag-1

This behavior is **typical in financial time series**, where dependence is often dominated by the first lag.

---

### üß† Statistical intuition (rule of thumb)

For \(N\) observations, the approximate significance band is:

\[
\pm \frac{1.96}{\sqrt{N}}
\]

With ~60 observations:
- Threshold ‚âà ¬±0.25  
- Observed value = **0.039**

‚û°Ô∏è **Not statistically significant**

---

### üìä Practical conclusion

- Strong **AR(1)** behavior  
- Negligible **AR(2)** contribution  

This suggests the process is well-approximated by:

\[
y_t \approx \phi_1 y_{t-1} + \varepsilon_t
\]

rather than a higher-order autoregressive model.

---

### üß™ Why this matters

This confirms‚Äî*without fitting a full AR model*‚Äîthat:
- Higher-order lags add little predictive power
- Model complexity can be kept low
- PACF is a powerful diagnostic tool

</div>


In [22]:
pacf_2

np.float64(0.03936931496406402)