In [None]:
import os
from datetime import datetime

import pandas as pd
import yfinance as yf

# Yahoo Finance symbols to download, mapped to the human-readable description
# that is embedded in each output file name.
tickers = {
    "^GSPC": "S&P 500",
    "^IXIC": "NASDAQ",
    "^DJI": "Dow Jones",
    "^NYFANG": "NYSE FANG+",
    "ARKK": "ARK Innovation ETF",
    "^VIX": "CBOE Volatility Index",
    "EEM": "iShares MSCI Emerging Markets ETF",
    "000001.SS": "Shanghai Composite Index",
    "DX-Y.NYB": "USD Index (DXY)",
    "EURUSD=X": "EUR to USD Exchange Rate"
}

# Date range passed to yf.download as start/end
start_date = "2014-11-14"
end_date = "2024-11-18"

# Directory the per-ticker CSV files are written to
RAW_DATA_DIR = "../data/raw"


def build_raw_file_name(ticker, description):
    """Return the CSV file name used for one ticker's raw download.

    Spaces and '/' in the description become '_' and '+' is dropped, so
    ("^NYFANG", "NYSE FANG+") maps to "^NYFANG_NYSE_FANG.csv".
    """
    safe_description = description.replace(' ', '_').replace('+', '').replace('/', '_')
    return f"{ticker}_{safe_description}.csv"


# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before the first download is written.
os.makedirs(RAW_DATA_DIR, exist_ok=True)

# Fetch each ticker and save it to its own CSV file
for ticker, description in tickers.items():
    print(f"Fetching data for {description} ({ticker})...")
    data = yf.download(ticker, start=start_date, end=end_date)
    if data.empty:
        print(f"No data found for {description} ({ticker})")
        continue

    # Newer yfinance releases can return (field, ticker) MultiIndex columns even
    # for a single symbol. Flatten to the field names so the CSV keeps a single
    # header row with plain "Date" / "Close" columns.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)

    # Save with both ticker and description in the filename
    file_name = build_raw_file_name(ticker, description)
    data.to_csv(f"{RAW_DATA_DIR}/{file_name}")
    print(f"Saved {description} data to {file_name}")

print("Data fetching complete.")

In [None]:
# import pandas as pd
# from statsmodels.tsa.arima.model import ARIMA

# # Helper function for ARIMA-based imputation
# def arima_impute(series):
#     if series.isnull().sum() > 0:  # Apply ARIMA only if there are still missing values
#         model = ARIMA(series, order=(1, 1, 1))  # Adjust (p, d, q) as needed
#         fitted_model = model.fit()
#         series = series.fillna(fitted_model.fittedvalues)
#     return series

# # Load the cleaned data for existing indices
# dji_data = pd.read_csv("^DJI_data.csv", parse_dates=["Date"], index_col="Date")
# gspc_data = pd.read_csv("^GSPC_data.csv", parse_dates=["Date"], index_col="Date")
# ixic_data = pd.read_csv("^IXIC_data.csv", parse_dates=["Date"], index_col="Date")
# btc_data = pd.read_csv("BTC-USD_data.csv", parse_dates=["Date"], index_col="Date")

# # Load the cleaned data for new indicators
# nyfang_data = pd.read_csv("NYSE_FANG.csv", parse_dates=["Date"], index_col="Date")
# arkk_data = pd.read_csv("ARK_Innovation_ETF.csv", parse_dates=["Date"], index_col="Date")
# vix_data = pd.read_csv("CBOE_Volatility_Index.csv", parse_dates=["Date"], index_col="Date")
# eem_data = pd.read_csv("iShares_MSCI_Emerging_Markets_ETF.csv", parse_dates=["Date"], index_col="Date")
# shanghai_data = pd.read_csv("Shanghai_Composite_Index.csv", parse_dates=["Date"], index_col="Date")
# dxy_data = pd.read_csv("USD_Index_(DXY).csv", parse_dates=["Date"], index_col="Date")
# eurusd_data = pd.read_csv("EUR_to_USD_Exchange_Rate.csv", parse_dates=["Date"], index_col="Date")

# # Retain only the Close prices and rename columns
# dji_data_cleaned = dji_data["Close"].to_frame().rename(columns={"Close": "DJI"})
# gspc_data_cleaned = gspc_data["Close"].to_frame().rename(columns={"Close": "GSPC"})
# ixic_data_cleaned = ixic_data["Close"].to_frame().rename(columns={"Close": "IXIC"})
# btc_data_cleaned = btc_data["Close"].to_frame().rename(columns={"Close": "BTC"})

# # Retain only the Close prices and rename columns
# nyfang_data_cleaned = nyfang_data["Close"].to_frame().rename(columns={"Close": "NYSE FANG+"})
# arkk_data_cleaned = arkk_data["Close"].to_frame().rename(columns={"Close": "ARK Innovation ETF"})
# vix_data_cleaned = vix_data["Close"].to_frame().rename(columns={"Close": "CBOE Volatility Index"})
# eem_data_cleaned = eem_data["Close"].to_frame().rename(columns={"Close": "iShares MSCI Emerging Markets ETF"})
# shanghai_data_cleaned = shanghai_data["Close"].to_frame().rename(columns={"Close": "Shanghai Composite Index"})
# dxy_data_cleaned = dxy_data["Close"].to_frame().rename(columns={"Close": "USD Index (DXY)"})
# eurusd_data_cleaned = eurusd_data["Close"].to_frame().rename(columns={"Close": "EUR to USD Exchange Rate"})

# # Combine all datasets using an outer join on Date
# combined_data = pd.concat([
#     dji_data_cleaned, gspc_data_cleaned, ixic_data_cleaned, btc_data_cleaned,
#     nyfang_data_cleaned, arkk_data_cleaned, vix_data_cleaned,
#     eem_data_cleaned, shanghai_data_cleaned, dxy_data_cleaned, eurusd_data_cleaned
# ], axis=1, join="outer")

# # Forward fill missing values
# combined_data.ffill(inplace=True)

# # Save the combined DataFrame to a CSV file
# combined_data.to_csv("macro_financial_factors.csv")

# print("Data successfully combined and saved to 'macro_financial_factors.csv'")

Data successfully combined and saved to 'full_combined_data.csv'


In [12]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

# Helper function for ARIMA-based imputation
def arima_impute(series):
    """Fill missing values in ``series`` from an ARIMA(1, 1, 1) fit.

    The series is returned untouched when it has no missing values or when it
    has five or fewer non-null observations. Otherwise an ARIMA model is fitted
    to the series and each missing entry is replaced by the corresponding entry
    of the fitted model's ``fittedvalues``.
    """
    missing_count = series.isnull().sum()
    valid_count = series.notnull().sum()

    # Nothing to impute, or too few valid points to fit a model on
    if missing_count == 0 or valid_count <= 5:
        return series

    fitted_model = ARIMA(series, order=(1, 1, 1)).fit()  # Adjust (p, d, q) as needed
    return series.fillna(fitted_model.fittedvalues)

# Map each output column name to the raw CSV written by the download cell.
# File names follow that cell's "<ticker>_<description>.csv" rule, in which
# spaces in the description become '_' and '+' is dropped.
datasets = {
    "DJI": "^DJI_Dow_Jones.csv",
    "GSPC": "^GSPC_S&P_500.csv",
    "IXIC": "^IXIC_NASDAQ.csv",
    "NYSE FANG+": "^NYFANG_NYSE_FANG.csv",  # '+' is stripped when the file is saved
    "ARK Innovation ETF": "ARKK_ARK_Innovation_ETF.csv",
    "CBOE Volatility Index": "^VIX_CBOE_Volatility_Index.csv",
    "iShares MSCI Emerging Markets ETF": "EEM_iShares_MSCI_Emerging_Markets_ETF.csv",
    "Shanghai Composite Index": "000001.SS_Shanghai_Composite_Index.csv",
    "USD Index (DXY)": "DX-Y.NYB_USD_Index_(DXY).csv",
    "EUR to USD Exchange Rate": "EURUSD=X_EUR_to_USD_Exchange_Rate.csv",
}

# Load every raw CSV (indexed by its parsed 'Date' column) and keep only the
# 'Close' prices as a one-column frame named after the dataset key
dataframes = [
    pd.read_csv(
        f"../data/raw/{filepath}", parse_dates=["Date"], index_col="Date"
    )["Close"].to_frame(name=name)
    for name, filepath in datasets.items()
]

# Combine all datasets column-wise using an outer join on the Date index
combined_data = pd.concat(dataframes, axis=1, join="outer")

# Step 1: Reindex onto every calendar day between the first and last observation
full_date_range = pd.date_range(start=combined_data.index.min(), end=combined_data.index.max())
combined_data = combined_data.reindex(full_date_range)

# Step 2: Forward fill
# DataFrame.ffill() replaces fillna(method="ffill"), which pandas has deprecated
# and which emits a FutureWarning.
combined_data = combined_data.ffill()

# Step 3: Linear interpolation
# NOTE(review): the forward fill above already fills every gap that follows a
# valid observation, so this step (and the ARIMA step below) only ever sees
# leading NaNs, i.e. dates before a column's first value. If interpolation is
# meant to handle interior gaps it has to run before the forward fill --
# confirm the intended order.
combined_data = combined_data.interpolate(method="linear")

# Step 4: ARIMA-based imputation for whatever is still missing, column by column
for column in combined_data.columns:
    combined_data[column] = arima_impute(combined_data[column])

# Step 5: Final fallback -- fill any remaining gaps with the column mean
combined_data = combined_data.fillna(combined_data.mean())

# reindex() dropped the index name; restore it so the CSV header reads "Date"
combined_data.index.name = "Date"

# Save the combined DataFrame to a CSV file
combined_data.to_csv("../data/processed/macro_financial_factors.csv")

print("Data successfully combined and saved to 'macro_financial_factors.csv'")

Data successfully combined and saved to 'macro_financial_factors.csv'


  combined_data = combined_data.fillna(method="ffill")
