In [4]:
import pandas as pd
import numpy as np
import plotly.express as px
from dateutil import parser

In [5]:
# Load the raw gold series and reduce it to one row per business month-end.
gold_raw = pd.read_csv('./raw_data/gold.csv')

# NOTE(review): errors='coerce' is applied only after dateutil has parsed the
# value, so an unparseable string presumably raises inside parser.parse instead
# of becoming NaT -- confirm whether that is the intended behaviour.
gold_df = (
    gold_raw
    # Dates are parsed one value at a time as day-first.
    .assign(DateTime=lambda frame: frame['DateTime'].apply(
        lambda x: pd.to_datetime(parser.parse(str(x), dayfirst=True), errors='coerce')
    ))
    # Keep observations dated 1915-01-01 or later.
    .loc[lambda frame: frame['DateTime'] >= pd.to_datetime('1915-01-01')]
    .set_index('DateTime')
    # Pass the business-month-end offset object rather than the string alias
    # 'BM': the alias is deprecated and emitted a warning on every run.
    .resample(pd.offsets.BMonthEnd())
    # Take the last row in each business-month bin (empty bins yield NaN).
    .last()
    .reset_index()
)
gold_df


  gold_df = gold_df.resample('BM').last()


Unnamed: 0,DateTime,Gold
0,1915-02-26,
1,1915-03-31,
2,1915-04-30,
3,1915-05-31,
4,1915-06-30,
...,...,...
1311,2024-05-31,2322.899902
1312,2024-06-28,2327.699951
1313,2024-07-31,2426.500000
1314,2024-08-30,2493.800049


In [7]:
# Load the raw bond index and reduce it to one row per business month-end.
bonds_raw = pd.read_csv('./raw_data/bonds.csv')

# NOTE(review): errors='coerce' is applied only after dateutil has parsed the
# value, so an unparseable string presumably raises inside parser.parse instead
# of becoming NaT -- confirm whether that is the intended behaviour.
bonds_df = (
    bonds_raw
    # Dates are parsed one value at a time as day-first.
    .assign(DateTime=lambda frame: frame['DateTime'].apply(
        lambda x: pd.to_datetime(parser.parse(str(x), dayfirst=True), errors='coerce')
    ))
    # Keep observations dated 1915-01-01 or later.
    .loc[lambda frame: frame['DateTime'] >= pd.to_datetime('1915-01-01')]
    # Shorten the value column name for the merged table.
    .rename(columns={'Total Return Bond Index': 'Bonds'})
    .set_index('DateTime')
    # Pass the business-month-end offset object rather than the string alias
    # 'BM': the alias is deprecated and emitted a warning on every run.
    .resample(pd.offsets.BMonthEnd())
    # Take the last row in each business-month bin (empty bins yield NaN).
    .last()
    .reset_index()
)
bonds_df


  bonds_df = bonds_df.resample('BM').last()


Unnamed: 0,DateTime,Bonds
0,1915-01-29,13.652601
1,1915-02-26,
2,1915-03-31,
3,1915-04-30,
4,1915-05-31,
...,...,...
1312,2024-05-31,3219.530000
1313,2024-06-28,3239.270000
1314,2024-07-31,3316.660000
1315,2024-08-30,3367.370000


In [9]:
# Load the raw PPI series and reduce it to one row per business month-end.
commodities_raw = pd.read_csv('./raw_data/commodities.csv')

# NOTE(review): errors='coerce' is applied only after dateutil has parsed the
# value, so an unparseable string presumably raises inside parser.parse instead
# of becoming NaT -- confirm whether that is the intended behaviour.
commodities_df = (
    commodities_raw
    # Dates are parsed one value at a time as day-first.
    .assign(DateTime=lambda frame: frame['DateTime'].apply(
        lambda x: pd.to_datetime(parser.parse(str(x), dayfirst=True), errors='coerce')
    ))
    # Keep observations dated 1915-01-01 or later.
    .loc[lambda frame: frame['DateTime'] >= pd.to_datetime('1915-01-01')]
    # Shorten the value column name for the merged table.
    .rename(columns={'PPI (Producer Price Index)': 'PPI'})
    .set_index('DateTime')
    # Pass the business-month-end offset object rather than the string alias
    # 'BM': the alias is deprecated and emitted a warning on every run.
    .resample(pd.offsets.BMonthEnd())
    # Take the last row in each business-month bin (empty bins yield NaN).
    .last()
    .reset_index()
)
commodities_df


  commodities_df = commodities_df.resample('BM').last()


Unnamed: 0,DateTime,PPI
0,1915-01-29,11.800
1,1915-02-26,11.800
2,1915-03-31,11.800
3,1915-04-30,11.800
4,1915-05-31,11.900
...,...,...
1312,2024-05-31,255.453
1313,2024-06-28,256.015
1314,2024-07-31,257.485
1315,2024-08-30,255.613


In [10]:
# Read the preprocessed S&P 500 file, keep only the date and index-level
# columns, and convert the date column to datetime.
sp500_df = (
    pd.read_csv('./processed_data/sp500_and_inflation_preprocessed.csv')
    .loc[:, ['DateTime', 'S&P 500']]
    .assign(DateTime=lambda frame: pd.to_datetime(frame['DateTime']))
)
sp500_df


Unnamed: 0,DateTime,S&P 500
0,1915-01-29,7.48
1,1915-02-26,7.38
2,1915-03-31,7.57
3,1915-04-30,8.14
4,1915-05-31,7.95
...,...,...
1312,2024-05-31,5277.51
1313,2024-06-28,5460.48
1314,2024-07-31,5522.30
1315,2024-08-30,5648.40


In [11]:
# Align the four series on their DateTime index and place them side by side;
# dates missing from a given series show up as NaN in that column.
date_indexed_frames = [
    frame.set_index('DateTime')
    for frame in (gold_df, bonds_df, commodities_df, sp500_df)
]
merged_df = pd.concat(date_indexed_frames, axis=1).reset_index()

merged_df

Unnamed: 0,DateTime,Gold,Bonds,PPI,S&P 500
0,1915-01-29,,13.652601,11.800,7.48
1,1915-02-26,,,11.800,7.38
2,1915-03-31,,,11.800,7.57
3,1915-04-30,,,11.800,8.14
4,1915-05-31,,,11.900,7.95
...,...,...,...,...,...
1312,2024-05-31,2322.899902,3219.530000,255.453,5277.51
1313,2024-06-28,2327.699951,3239.270000,256.015,5460.48
1314,2024-07-31,2426.500000,3316.660000,257.485,5522.30
1315,2024-08-30,2493.800049,3367.370000,255.613,5648.40


In [12]:
# Fraction of missing entries per column: the mean of a boolean mask is the
# share of True values.
missing_mask = merged_df.isnull()
nan_proportions = missing_mask.mean()
nan_proportions

DateTime    0.000000
Gold        0.444951
Bonds       0.517084
PPI         0.000759
S&P 500     0.000000
dtype: float64

In [13]:
# Persist the merged table as CSV without writing the positional index.
merged_df.to_csv(
    path_or_buf='./processed_data/asset_classes_preprocessed.csv',
    index=False,
)