### Task 2: Change Point Modeling and Insight Generation

In [2]:
import pandas as pd
import numpy as np
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
import os
import sys 
sys.path.append(os.path.abspath("../Model"))
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Switch the working directory to the project root so the relative 'Data/...'
# paths used by the loading cells resolve. Guarded so that re-running this cell
# does not climb one directory higher each time (a bare os.chdir('..') is not
# idempotent and silently breaks later cells on a second run).
if not os.path.isdir('Data'):
    os.chdir('..')

In [4]:
from model_data import *

In [5]:
# Load the Brent oil price data; the frame displayed below has Date, Price and LogReturn columns.
df_price = load_and_preprocess('Data/BrentOilPrices.csv')
df_price

Unnamed: 0,Date,Price,LogReturn
1,1987-05-21,18.45,-0.009709
2,1987-05-22,18.55,0.005405
3,1987-05-25,18.60,0.002692
4,1987-05-26,18.63,0.001612
5,1987-05-27,18.60,-0.001612
...,...,...,...
9006,2022-11-08,96.85,-0.030706
9007,2022-11-09,93.05,-0.040026
9008,2022-11-10,94.25,0.012814
9009,2022-11-11,96.37,0.022244


In [6]:
# Load the key events table from Data/key_event.csv.
df_events = load_event("Data/key_event.csv")
df_events

Unnamed: 0,Date,Event_type,Severity,Description,Region
0,1990-08-02,Geopolitical,Very High,Iraq invades Kuwait (Start of Gulf War),Middle East
1,1991-01-17,Geopolitical,Very High,Operation Desert Storm begins,Middle East
2,1997-01-01,Economic,Medium,Asian Financial Crisis begins,Asia
3,1998-03-01,OPEC,High,OPEC production increases amid Asian crisis,Global
4,2001-09-11,Geopolitical,Very High,9/11 Terrorist Attacks,North America
5,2003-03-20,Geopolitical,Very High,US invades Iraq,Middle East
6,2005-08-29,Environmental,High,Hurricane Katrina disrupts US Gulf production,North America
7,2008-09-15,Economic,Very High,Lehman Brothers collapse (Global Financial Cri...,Global
8,2011-02-15,Geopolitical,High,Arab Spring uprisings disrupt oil supplies,Middle East
9,2011-03-11,Environmental,Very High,Fukushima nuclear disaster (Japan),Asia


In [7]:
# Inspect the event dates. In the output below several dates appear twice under
# different index labels (e.g. labels 17 and 27 are both 2020-01-03).
df_events['Date']

0    1990-08-02
1    1991-01-17
2    1997-01-01
3    1998-03-01
4    2001-09-11
5    2003-03-20
6    2005-08-29
7    2008-09-15
8    2011-02-15
9    2011-03-11
10   2014-06-01
11   2014-11-27
12   2015-07-14
13   2016-11-30
14   2017-09-01
15   2018-05-08
16   2019-09-14
17   2020-01-03
27   2020-01-03
18   2020-03-11
28   2020-03-11
29   2020-04-12
19   2020-04-20
30   2020-11-03
38   2021-02-15
31   2021-03-23
20   2021-03-23
32   2021-07-18
21   2022-02-24
33   2022-02-24
22   2022-03-08
34   2022-03-08
39   2022-06-02
23   2022-10-05
35   2022-10-05
24   2023-04-02
36   2023-04-02
40   2023-09-05
25   2023-10-07
37   2023-10-07
26   2024-01-01
Name: Date, dtype: datetime64[ns]

In [8]:
# Stand-in for model output: treat every trading day in df_price whose date
# coincides with a known event date as a candidate change point.
change_point_indices = df_events['Date']  # despite the name, these are event dates, not row positions
is_event_day = df_price['Date'].isin(df_events['Date'])
change_dates = df_price.loc[is_event_day, 'Date'].reset_index(drop=True)

change_dates

0    1990-08-02
1    1991-01-17
2    2001-09-11
3    2003-03-20
4    2005-08-29
5    2008-09-15
6    2011-02-15
7    2011-03-11
8    2015-07-14
9    2016-11-30
10   2017-09-01
11   2018-05-08
12   2020-01-03
13   2020-03-11
14   2020-04-20
15   2020-11-03
16   2021-02-15
17   2021-03-23
18   2022-02-24
19   2022-03-08
20   2022-10-05
Name: Date, dtype: datetime64[ns]

In [9]:
from datetime import timedelta

# Half-width of the matching window: an event is associated with a change point
# when its date lies within ±tolerance_days calendar days (inclusive) of it.
tolerance_days = 17
tolerance = timedelta(days=tolerance_days)  # hoisted: constant across the loop

# One record per (change point, matching event) pair; a change point with no
# event in its window contributes a single placeholder record.
associated_events = []

for cp_date in change_dates:
    window_start = cp_date - tolerance
    window_end = cp_date + tolerance
    matched = df_events[
        (df_events['Date'] >= window_start) &
        (df_events['Date'] <= window_end)
    ]
    if not matched.empty:
        for _, row in matched.iterrows():
            associated_events.append({
                "Change_Point_Date": cp_date.date(),
                "Event_Date": row["Date"].date(),
                "Event": row["Description"]
            })
    else:
        # Use the same "Event" key as matched records so the placeholder lands
        # in the same column, and report the tolerance actually applied.
        associated_events.append({
            "Change_Point_Date": cp_date.date(),
            "Event_Date": None,
            "Event": f"No matching event within ±{tolerance_days} days"
        })

# Convert to DataFrame
df_association = pd.DataFrame(associated_events)

print("Change Points and Associated Events:")
print(df_association)


Change Points and Associated Events:
   Change_Point_Date  Event_Date  \
0         1990-08-02  1990-08-02   
1         1991-01-17  1991-01-17   
2         2001-09-11  2001-09-11   
3         2003-03-20  2003-03-20   
4         2005-08-29  2005-08-29   
5         2008-09-15  2008-09-15   
6         2011-02-15  2011-02-15   
7         2011-03-11  2011-03-11   
8         2015-07-14  2015-07-14   
9         2016-11-30  2016-11-30   
10        2017-09-01  2017-09-01   
11        2018-05-08  2018-05-08   
12        2020-01-03  2020-01-03   
13        2020-01-03  2020-01-03   
14        2020-03-11  2020-03-11   
15        2020-03-11  2020-03-11   
16        2020-04-20  2020-04-12   
17        2020-04-20  2020-04-20   
18        2020-11-03  2020-11-03   
19        2021-02-15  2021-02-15   
20        2021-03-23  2021-03-23   
21        2021-03-23  2021-03-23   
22        2022-02-24  2022-02-24   
23        2022-02-24  2022-02-24   
24        2022-02-24  2022-03-08   
25        2022-02-24  2022-

In [10]:
# Pull the model inputs out of the price frame as plain NumPy arrays.
dates = df_price["Date"].to_numpy()
returns = df_price["LogReturn"].to_numpy()
returns

array([-0.00970881,  0.00540542,  0.00269179, ...,  0.01281384,
        0.02224412, -0.02927141], shape=(9010,))

In [None]:
# Build the model on the log-return array and keep the resulting trace for the cells below.
# NOTE(review): the function name suggests ADVI with a continuous change point (tau) — confirm in model_data.
model, trace = build_model_advi_continuous_tau(returns)


In [None]:
# Get the most probable change-point index and its date from the trace, then report both.
# NOTE(review): presumably the index is looked up in `dates` to produce change_date — confirm in model_data.
most_probable_tau, change_date = extract_change_point(trace, dates)
print(f"Most probable change point at index {most_probable_tau}, date: {change_date}")

In [None]:
# Plot using the price frame loaded above. The original passed `df`, a name this
# notebook never defines, which raises NameError on a fresh Restart & Run All.
# NOTE(review): assumes plot_results expects the price DataFrame as its first argument — confirm in model_data.
plot_results(df_price, change_date, trace)