# Setting up the environment (note: please use both the R and Python kernels for this script)

## Loading python packages

In [1]:
# Python-side imports, followed by the rpy2 setup that later `%%R` cells rely on.
from statsmodels.tsa.stattools import adfuller
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import openpyxl
from openpyxl import load_workbook
from openpyxl.drawing.image import Image
from PIL import Image as PilImage

# Requires the third-party package openpyxl_image_loader (install command kept for reference):
# !pip install openpyxl_image_loader
from openpyxl_image_loader import SheetImageLoader

# NOTE(review): pandas, numpy and matplotlib.pyplot are imported a second time below;
# the repeats are harmless but could be removed.
import PIL 
import pandas as pd # load and manipulate data for One-Hot encoding
import numpy as np # calculate the mean and std. dev. 
import matplotlib.pyplot as plt # to plot figures
import seaborn as sns

import os
# Machine-specific path to the local R installation, exported as R_HOME before rpy2 is imported.
# NOTE(review): presumably rpy2 reads R_HOME to locate R — confirm the path on each machine.
os.environ['R_HOME'] = r'C:\Program Files\R\R-4.4.1'
# r_home is not referenced again in the visible cells.
r_home = os.environ.get("R_HOME")

# Runs a pip install of rpy2 every time this cell executes.
!pip install rpy2 --quiet
import rpy2
import rpy2.robjects as ro
import rpy2.rinterface as ri
from rpy2.robjects import r

# Extra rpy2 items
# NOTE(review): the recorded output of this cell is
# `PackageNotInstalledError: The R package "rlang" is not installed.` — presumably raised by
# one of the rpy2 imports below; confirm which one and install rlang in the R library.

from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.vectors import IntVector, FloatVector
import rpy2.ipython.html
import rpy2.robjects.lib.ggplot2 as gp
from rpy2.ipython.ggplot import image_png
# Same module as `ri` above, imported under a second alias.
import rpy2.rinterface as rinterface

# Initialise the embedded R session and switch on pandas <-> R object conversion.
rinterface.initr()
pandas2ri.activate()

# Load the rpy2 IPython extension, which provides the `%%R` cell magic used later.
%load_ext rpy2.ipython


# Apply matplotlib's "fivethirtyeight" style sheet to subsequent figures.
plt.style.use("fivethirtyeight")



PackageNotInstalledError: The R package "rlang" is not installed.

## Loading R packages

In [None]:
%%R
# Install htmltools only when it is missing, so re-running the notebook does not
# reinstall the package (and require network access) on every execution.
if (!requireNamespace("htmltools", quietly = TRUE)) {
  install.packages("htmltools")
}

# Attach each required package once, silencing start-up banners.
# The order follows the original cell; the repeated Microsoft365R and openxlsx
# entries were dropped because attaching an already-attached package does nothing.
r_packages <- c(
  "Microsoft365R", "strucchange", "tseries", "forecast", "seasonal", "seasonalview",
  "dplyr", "openxlsx", "readr", "readxl", "writexl", "knitr"
)
for (pkg in r_packages) {
  suppressPackageStartupMessages(library(pkg, character.only = TRUE))
}

# Loading data 

## Minor clean up

In [None]:
# Load the modelling frame (sheet 'df_final') and the prediction frame (sheet 'df_pred').
# pd.read_excel already returns a DataFrame, so the extra pd.DataFrame(...) wrapper is not needed.
df = pd.read_excel(r'C:\Users\asifr\OneDrive - State of New Mexico\Documents\Project Oil Price\Outputs\pred_sead_n_plots.xlsx', sheet_name='df_final')
df_pred = pd.read_excel(r'C:\Users\asifr\OneDrive - State of New Mexico\Documents\Project Oil Price\Outputs\df_pred_final.xlsx', sheet_name='df_pred')
pd.options.display.max_columns = 500
# pd.options.display.max_rows = 500

# Keep only rows where index_china_gdp_sead is present. The explicit .copy() makes df an
# independent frame, so adding the 'sv' column below is not an assignment onto a slice.
df = df[df['index_china_gdp_sead'].notna()].copy()
# Empty placeholder label; the event-window cells further down assign their own 'sv' labels.
df['sv'] = ''

In [None]:
# Show df as the cell output (rows with index_china_gdp_sead present, plus the empty 'sv' column).
df

## Dropping non-regressors

In [None]:
# Columns removed from df in this step (the section heading calls them non-regressors).
non_regressors = [
    'nm_oil_price_sead',
    'wti_spot_price_sea',
    'brent_spot_price_sea',
    'index_china_gdp_sea',
    'index_china_gdp_sead',
    'index_us_gdp_sea',
    'us_unemploy_sea',
    'index_ip_mining_sea',
    'index_cpi_china_sea',
    'opec_sea',
    'non_opec_sea',
    'del_supply_nonopec_sea',
    'oecd_inventory_sea',
    'del_world_consump_sea',
]

# axis=1 removes columns (axis=0 would remove rows); inplace=True modifies df itself
# instead of returning a new frame.
df.drop(non_regressors, axis=1, inplace=True)

## Renaming variables for the ease of use

In [None]:
# Old column name -> new column name. "Unnamed: 0" becomes the Date column, the remaining
# "_sea" columns get the "_sead" suffix, and "index_china_gdp_sead_1" takes over the
# "index_china_gdp_sead" name.
column_renames = {
    "Unnamed: 0": "Date",
    "dubai_price_sea": "dubai_price_sead",
    "index_cboe_volt_sea": "index_cboe_volt_sead",
    "index_cpi_us_sea": "index_cpi_us_sead",
    "index_energy_sea": "index_energy_sead",
    "index_metal_sea": "index_metal_sead",
    "index_precious_sea": "index_precious_sead",
    "del_cap_per_sea": "del_cap_per_sead",
    "del_world_gdp_per_sea": "del_world_gdp_per_sead",
    "del_supply_saudi_sea": "del_supply_saudi_sead",
    "opec_spare_capacity_sea": "opec_spare_capacity_sead",
    "index_china_gdp_sead_1": "index_china_gdp_sead",
}

# Rename in place so df keeps its identity for the cells that follow.
df.rename(columns=column_renames, inplace=True)

# Creating structural break variables

## 9/11 attacks: sv_911

In [None]:
# Silence pandas' chained-assignment warning (global option, as in the original cell).
pd.set_option('mode.chained_assignment', None)

# Rows dated 2001-01-01 through 2002-12-30, both bounds inclusive.
in_window = df['Date'].between('2001-01-01', '2002-12-30')

# Label the window 'sv_911', parse Date as datetime and use it as the index.
sv_911 = (
    df.loc[in_window]
    .assign(sv='sv_911', Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

## Low spare capacity: sv_lpc

In [None]:
# Silence pandas' chained-assignment warning (global option, as in the original cell).
pd.set_option('mode.chained_assignment', None)

# Rows dated 2005-04-01 through 2006-10-01, both bounds inclusive.
in_window = df['Date'].between('2005-04-01', '2006-10-01')

# Label the window 'sv_lpc', parse Date as datetime and use it as the index.
sv_lpc = (
    df.loc[in_window]
    .assign(sv='sv_lpc', Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

## Global financial collapse: sv_gfc

In [None]:
# Silence pandas' chained-assignment warning (global option, as in the original cell).
pd.set_option('mode.chained_assignment', None)

# Rows dated 2007-01-01 through 2008-12-30, both bounds inclusive.
in_window = df['Date'].between('2007-01-01', '2008-12-30')

# Label the window 'sv_gfc', parse Date as datetime and use it as the index.
sv_gfc = (
    df.loc[in_window]
    .assign(sv='sv_gfc', Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

## OPEC production cut: sv_opec

In [None]:
# Silence pandas' chained-assignment warning (global option, as in the original cell).
pd.set_option('mode.chained_assignment', None)

# Rows dated 2015-01-01 through 2016-12-30, both bounds inclusive.
in_window = df['Date'].between('2015-01-01', '2016-12-30')

# Label the window 'sv_opec', parse Date as datetime and use it as the index.
sv_opec = (
    df.loc[in_window]
    .assign(sv='sv_opec', Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

## Global pandemic: sv_covid19

In [None]:
# Silence pandas' chained-assignment warning (global option, as in the original cell).
pd.set_option('mode.chained_assignment', None)

# Rows dated 2019-06-01 through 2020-12-30, both bounds inclusive.
in_window = df['Date'].between('2019-06-01', '2020-12-30')

# Label the window 'sv_covid19', parse Date as datetime and use it as the index.
sv_covid19 = (
    df.loc[in_window]
    .assign(sv='sv_covid19', Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

## Russia-Ukraine conflict: sv_con22

In [None]:
# Silence pandas' chained-assignment warning (global option, as in the original cell).
pd.set_option('mode.chained_assignment', None)

# Rows dated 2022-01-01 through 2022-12-30, both bounds inclusive.
in_window = df['Date'].between('2022-01-01', '2022-12-30')

# Label the window 'sv_con22', parse Date as datetime and use it as the index.
sv_con22 = (
    df.loc[in_window]
    .assign(sv='sv_con22', Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

# Adding the structural variables to the dataset

In [None]:
# Parse the Date column as datetime and promote it to the index in a single chain.
df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')


In [None]:
# (start, end) bounds of the event windows, matching the sv_* frames built above.
event_windows = [
    ('2001-01-01', '2002-12-30'),
    ('2005-04-01', '2006-10-01'),
    ('2007-01-01', '2008-12-30'),
    ('2015-01-01', '2016-12-30'),
    ('2019-06-01', '2020-12-30'),
    ('2022-01-01', '2022-12-30'),
]

# Remove every index label that falls on a day inside a window; errors='ignore' skips
# the days of the range that are not present in df's index.
for window_start, window_end in event_windows:
    df = df.drop(pd.date_range(window_start, window_end), errors='ignore')

In [None]:
# Stack the labelled event-window frames on top of the remaining rows (keeping the Date
# index), then order the combined frame by date.
event_frames = [sv_911, sv_lpc, sv_gfc, sv_opec, sv_covid19, sv_con22]
df = pd.concat(event_frames + [df], ignore_index=False).sort_index()

In [None]:
# Show the recombined, date-sorted frame as the cell output.
df

## Some last-minute clean-up

In [None]:
# One 0/1 indicator column per distinct 'sv' label.
sv_dummies = pd.get_dummies(df['sv']).astype(int)

# Append the indicators, give the indicator for the empty label ("") the name "remove",
# then discard it together with the original 'sv' column.
df = (
    pd.concat([df, sv_dummies], axis=1)
    .rename(columns={"": "remove"})
    .drop(['sv', 'remove'], axis=1)
)

In [None]:
# Raise the display limit to 500 rows, then show df_pred as the cell output.
pd.options.display.max_rows = 500
df_pred

# Exporting the dataset to an Excel file

In [None]:
# Write df to sheet 'df' of df_final.xlsx using the XlsxWriter engine.
output_path = r'C:\Users\asifr\OneDrive - State of New Mexico\Documents\Project Oil Price\Outputs\df_final.xlsx'

# Replace the index with plain date objects named 'Date'. Passing the index through
# pd.to_datetime first keeps this cell re-runnable: after one run the index already
# holds date objects, on which `.date` is not available.
df.index = pd.to_datetime(df.index).date
df.index.name = 'Date'

# The context manager closes the writer even if to_excel raises, so the file handle is
# not left open. The unused workbook/worksheet handles from the original were dropped.
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='df')

# Merging the dataframes 

In [None]:
%%R
# Read the exported modelling data and the prediction rows, stack them, and overwrite
# df_final.xlsx with the combined table on a sheet named "df".

# Disabled code kept from the original cell:
# odb <- get_business_onedrive()
# data_unadjusted <- odb$load_dataframe("Documents/Project Oil Price/dat.csv")

final_path <- "C:/Users/asifr/OneDrive - State of New Mexico/Documents/Project Oil Price/Outputs/df_final.xlsx"
pred_path <- "C:/Users/asifr/OneDrive - State of New Mexico/Documents/Project Oil Price/Outputs/df_pred_final.xlsx"

# Both files are read without a sheet argument.
df_final <- read_xlsx(final_path)
df_pred <- read_xlsx(pred_path)

# Append the prediction rows beneath the modelling rows.
df_final <- rbind(df_final, df_pred)

# openxlsx provides the workbook functions used below.
suppressPackageStartupMessages(library(openxlsx))

# Build a one-sheet workbook holding the combined data.
OUT <- createWorkbook()
addWorksheet(OUT, "df")
writeData(OUT, sheet = "df", x = df_final)

# Save over the file that was read as df_final above.
saveWorkbook(OUT, final_path, overwrite = TRUE)