In [199]:
import pandas as pd
from pathlib import Path
import hvplot.pandas

Fixing up S&P 500 CSV

In [200]:
# Load the raw S&P 500 CSV, using its unnamed first column (which pandas reads
# as 'Unnamed: 0') as a parsed datetime index, then drop rows with missing values.
# `infer_datetime_format` is intentionally omitted: it is deprecated since
# pandas 2.0, where consistent-format inference is the default behaviour.
sandp500_df = pd.read_csv(
    "../Resources/S&P500.csv",
    index_col='Unnamed: 0',
    parse_dates=True,
).dropna()

# Label the index 'Date'. The date column is the index rather than a regular
# column, so DataFrame.rename(columns=...) cannot reach it; rename_axis renames
# the index itself.
sandp500_df = sandp500_df.rename_axis('Date')

# displaying first 10 rows of the dataframe
sandp500_df.head(10)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
2023-01-13,3960.6,4003.95,3947.67,3999.09,3999.09,2305897000
2023-01-09,3910.82,4003.95,3877.29,3999.09,3999.09,20846120000
2023-01-02,3853.29,3906.19,3794.33,3895.08,3895.08,16190230000
2022-12-26,3843.34,3858.19,3780.78,3839.5,3839.5,12097370000
2022-12-19,3853.79,3889.82,3764.49,3844.82,3844.82,18506410000
2022-12-12,3939.29,4100.96,3827.91,3852.36,3852.36,25443390000
2022-12-05,4052.02,4052.45,3918.39,3934.38,3934.38,20662410000
2022-11-28,4005.36,4100.51,3937.65,4071.7,4071.7,22280580000
2022-11-21,3956.23,4034.02,3933.34,4026.12,4026.12,12724860000
2022-11-14,3977.97,4028.84,3906.54,3965.34,3965.34,21831700000


In [201]:
# Convert every column of the S&P 500 frame to a numeric dtype.
#   * thousands separators (",") are stripped from string cells
#   * a cell that consists solely of a dash is treated as a missing value
# The dash pattern is anchored to the whole cell so that the minus sign of a
# negative number (e.g. "-1.5") is preserved instead of being stripped.

for column in sandp500_df.columns:
    cleaned = sandp500_df[column].replace(",", "", regex=True)
    cleaned = cleaned.replace(r"^\s*-\s*$", float("nan"), regex=True)
    sandp500_df[column] = pd.to_numeric(cleaned)

# displaying the converted data from the s&p 500 csv file
sandp500_df.head(10)


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
2023-01-13,3960.6,4003.95,3947.67,3999.09,3999.09,2305897000.0
2023-01-09,3910.82,4003.95,3877.29,3999.09,3999.09,20846120000.0
2023-01-02,3853.29,3906.19,3794.33,3895.08,3895.08,16190230000.0
2022-12-26,3843.34,3858.19,3780.78,3839.5,3839.5,12097370000.0
2022-12-19,3853.79,3889.82,3764.49,3844.82,3844.82,18506410000.0
2022-12-12,3939.29,4100.96,3827.91,3852.36,3852.36,25443390000.0
2022-12-05,4052.02,4052.45,3918.39,3934.38,3934.38,20662410000.0
2022-11-28,4005.36,4100.51,3937.65,4071.7,4071.7,22280580000.0
2022-11-21,3956.23,4034.02,3933.34,4026.12,4026.12,12724860000.0
2022-11-14,3977.97,4028.84,3906.54,3965.34,3965.34,21831700000.0


In [202]:
# Plot the converted S&P 500 'Close' series.
# NOTE(review): x=0 is kept exactly as written; confirm it resolves to the
# date index as intended.
sandp500_plot_options = dict(
    # axes data
    x=0,
    y='Close',
    # axes labels
    xlabel='Date',
    ylabel='Close',
    # chart title
    title='S&P 500 Close',
)

sandp500_df.hvplot(**sandp500_plot_options)

In [203]:
# Persist the converted S&P 500 frame (index included) to the clean-data folder.
sandp500_clean_csv = "../Resources/clean-data/s-and-p-500_clean.csv"
sandp500_df.to_csv(sandp500_clean_csv)


TNX treasury data cleaning

In [204]:
# Load the raw TNX treasury yield CSV with its 'Date' column parsed as a
# datetime index, then drop every row that contains a missing value.
# `infer_datetime_format` is intentionally omitted: it is deprecated since
# pandas 2.0, where consistent-format inference is the default behaviour.
tnx_df = pd.read_csv(
    "../Resources/^TNX Treasury Yield 10 Year.csv",
    index_col='Date',
    parse_dates=True,
).dropna()

# displaying first 10 rows of the dataframe
tnx_df.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1998-12-21,4.591,4.65,4.591,4.626,4.626,0.0
1998-12-22,4.702,4.702,4.658,4.69,4.69,0.0
1998-12-23,4.71,4.793,4.698,4.781,4.781,0.0
1998-12-24,4.789,4.878,4.785,4.846,4.846,0.0
1998-12-28,4.818,4.822,4.761,4.769,4.769,0.0
1998-12-29,4.777,4.793,4.693,4.701,4.701,0.0
1998-12-30,4.682,4.685,4.634,4.642,4.642,0.0
1998-12-31,4.634,4.678,4.618,4.638,4.638,0.0
1999-01-04,4.638,4.721,4.634,4.677,4.677,0.0
1999-01-05,4.725,4.745,4.709,4.729,4.729,0.0


In [205]:

# Plot the TNX 'Close' values against the 'Date' index.
tnx_plot_options = dict(
    # axes data
    x='Date',
    y='Close',
    # axes labels
    xlabel='Date',
    ylabel='Close',
    # chart title
    title='TNX Treasury Close',
)

tnx_df.hvplot(**tnx_plot_options)

In [206]:
# Persist the TNX frame (index included) to the clean-data folder.
tnx_clean_csv = "../Resources/clean-data/tnx_clean.csv"
tnx_df.to_csv(tnx_clean_csv)

cleaning VIX data and saving to csv

In [207]:
# Load the raw VIX volatility index CSV with its 'Date' column parsed as a
# datetime index, then drop every row that contains a missing value.
# `infer_datetime_format` is intentionally omitted: it is deprecated since
# pandas 2.0, where consistent-format inference is the default behaviour.
vix_df = pd.read_csv(
    "../Resources/^VIX CBOE Volatility Index.csv",
    index_col='Date',
    parse_dates=True,
).dropna()

# displaying first 10 rows of the dataframe
vix_df.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1998-12-21,24.639999,24.639999,23.09,23.860001,23.860001,0
1998-12-22,24.049999,24.450001,22.75,22.780001,22.780001,0
1998-12-23,21.889999,22.059999,20.08,20.209999,20.209999,0
1998-12-24,21.0,21.780001,20.66,21.48,21.48,0
1998-12-28,22.92,23.969999,22.91,23.5,23.5,0
1998-12-29,23.68,24.0,22.110001,22.18,22.18,0
1998-12-30,22.030001,23.559999,22.030001,23.34,23.34,0
1998-12-31,23.74,24.76,23.67,24.42,24.42,0
1999-01-04,25.379999,26.959999,24.74,26.17,26.17,0
1999-01-05,25.92,25.98,24.360001,24.459999,24.459999,0


In [208]:
# Plot the VIX 'Close' values against the 'Date' index.
vix_plot_options = dict(
    # axes data
    x='Date',
    y='Close',
    # axes labels
    xlabel='Date',
    ylabel='Close',
    # chart title
    title='VIX CBOE Close',
)

vix_df.hvplot(**vix_plot_options)

In [209]:
# Write the VIX frame (index included) to the clean-data folder.
# This must export vix_df: exporting tnx_df here would fill vix_clean.csv
# with treasury-yield data instead of VIX data.

vix_df.to_csv("../Resources/clean-data/vix_clean.csv")

Loading crude oil data (only NaN rows are dropped; no CSV is saved in this section)

In [210]:
# Load the raw WTI crude oil futures CSV with its 'Date' column parsed as a
# datetime index, then drop every row that contains a missing value.
# `infer_datetime_format` is intentionally omitted: it is deprecated since
# pandas 2.0, where consistent-format inference is the default behaviour.
# NOTE: the 'Vol.' (e.g. "712.88K") and 'Change %' (e.g. "-1.94%") columns are
# still text after this cell, and this section does not write a cleaned CSV.
oil_df = pd.read_csv(
    "../Resources/Crude Oil WTI Futures.csv",
    index_col='Date',
    parse_dates=True,
).dropna()

# displaying first 10 rows of the dataframe
oil_df.head(10)

Unnamed: 0_level_0,Price,Open,High,Low,Vol.,Change %
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-27,53.14,53.7,53.71,52.13,712.88K,-1.94%
2020-01-24,54.19,55.69,55.95,53.85,586.20K,-2.52%
2020-01-23,55.59,56.11,56.27,54.77,704.00K,-2.03%
2020-01-22,56.74,58.26,58.38,56.03,620.12K,-2.74%
2020-01-21,58.34,59.17,59.73,57.68,50.18K,-0.66%
2020-01-17,58.54,58.59,58.98,58.27,122.56K,0.03%
2020-01-16,58.52,58.1,58.87,57.56,182.87K,1.23%
2020-01-15,57.81,58.2,58.36,57.36,433.17K,-0.72%
2020-01-14,58.23,58.03,58.72,57.72,507.71K,0.26%
2020-01-13,58.08,59.04,59.27,57.91,584.00K,-1.63%


DX-Y data cleaning and saving to csv

In [211]:
# Load the raw US Dollar Index CSV with its 'Date' column parsed as a
# datetime index, then drop every row that contains a missing value.
# `infer_datetime_format` is intentionally omitted: it is deprecated since
# pandas 2.0, where consistent-format inference is the default behaviour.
usd_index_df = pd.read_csv(
    "../Resources/DX-Y.NYB US Dollar Index.csv",
    index_col='Date',
    parse_dates=True,
).dropna()

# displaying first 10 rows of the dataframe
usd_index_df.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1998-12-21,93.889999,94.529999,93.739998,94.440002,94.440002,0.0
1998-12-22,94.410004,94.809998,94.309998,94.5,94.5,0.0
1998-12-23,94.550003,94.580002,94.330002,94.519997,94.519997,0.0
1998-12-24,94.459999,94.910004,94.419998,94.889999,94.889999,0.0
1998-12-25,94.459999,94.910004,94.419998,94.889999,94.889999,0.0
1998-12-28,94.839996,95.010002,94.650002,94.660004,94.660004,0.0
1998-12-29,94.669998,94.75,94.220001,94.400002,94.400002,0.0
1998-12-30,94.459999,94.830002,94.25,94.690002,94.690002,0.0
1998-12-31,94.75,94.769997,94.019997,94.169998,94.169998,0.0
1999-01-01,94.75,94.769997,94.019997,94.169998,94.169998,0.0


In [212]:
# Plot the USD index 'Close' values against the 'Date' index.
usd_index_plot_options = dict(
    # axes data
    x='Date',
    y='Close',
    # axes labels
    xlabel='Date',
    ylabel='Close',
    # chart title
    title='USD index Close',
)

usd_index_df.hvplot(**usd_index_plot_options)

In [213]:
# Write the USD index frame (index included) to the clean-data folder.
# This must export usd_index_df: exporting tnx_df here would fill
# usd_index_clean.csv with treasury-yield data instead of dollar-index data.

usd_index_df.to_csv("../Resources/clean-data/usd_index_clean.csv")

Cleaning and saving real estate etf data as csv

In [214]:
# Load the raw IYR real estate ETF CSV with its 'Date' column parsed as a
# datetime index, then drop every row that contains a missing value.
# `infer_datetime_format` is intentionally omitted: it is deprecated since
# pandas 2.0, where consistent-format inference is the default behaviour.
real_estate_etf_df = pd.read_csv(
    "../Resources/IYR US Real Estate ETF.csv",
    index_col='Date',
    parse_dates=True,
).dropna()

# displaying first 10 rows of the dataframe
real_estate_etf_df.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-06-19,35.03125,35.03125,35.03125,35.03125,13.924595,200
2000-06-20,35.125,35.179688,35.125,35.179688,13.983594,400
2000-06-21,35.390625,35.390625,35.351563,35.351563,14.051915,20600
2000-06-22,35.367188,35.367188,35.367188,35.367188,14.058119,400
2000-06-23,35.367188,35.367188,35.367188,35.367188,14.058119,0
2000-06-26,35.125,35.125,34.875,34.875,13.862484,2000
2000-06-27,35.359375,35.375,35.359375,35.375,14.061225,2600
2000-06-28,35.375,35.375,35.375,35.375,14.061225,0
2000-06-29,35.375,35.375,35.375,35.375,14.061225,0
2000-06-30,35.796875,35.8125,35.375,35.375,14.061225,2800


In [215]:
# Plot the real estate ETF 'Close' values against the 'Date' index.
real_estate_plot_options = dict(
    # axes data
    x='Date',
    y='Close',
    # axes labels
    xlabel='Date',
    ylabel='Close',
    # chart title
    title='Real Estate ETF Close',
)

real_estate_etf_df.hvplot(**real_estate_plot_options)

In [216]:
# Persist the real estate ETF frame (index included) to the clean-data folder.
real_estate_clean_csv = "../Resources/clean-data/real_estate_etf_clean.csv"
real_estate_etf_df.to_csv(real_estate_clean_csv)

cleaning LBMA gold data and saving as csv

In [217]:
# Load the raw LBMA gold CSV with its 'Date' column parsed as a datetime
# index, drop every row that contains a missing value, and rename the
# 'USD (AM)' / 'USD (PM)' columns to 'Open' / 'Close' to match the other frames.
# `infer_datetime_format` is intentionally omitted: it is deprecated since
# pandas 2.0, where consistent-format inference is the default behaviour.
lmba_gold_df = (
    pd.read_csv(
        "../Resources/LBMA-GOLD.csv",
        index_col='Date',
        parse_dates=True,
    )
    .dropna()
    .rename(columns={'USD (AM)':'Open', 'USD (PM)':'Close'})
)

# displaying first 10 rows of the dataframe
lmba_gold_df.head(10)

Unnamed: 0_level_0,Open,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-23,1927.2,1914.85
2023-01-20,1928.75,1924.9
2023-01-19,1907.5,1918.6
2023-01-18,1911.55,1920.7
2023-01-17,1904.95,1913.8
2023-01-16,1915.1,1917.0
2023-01-13,1904.05,1907.15
2023-01-12,1883.1,1882.55
2023-01-11,1884.25,1872.35
2023-01-10,1875.2,1878.65


In [218]:
# Plot the LBMA gold 'Close' values against the 'Date' index.
gold_plot_options = dict(
    # axes data
    x='Date',
    y='Close',
    # axes labels
    xlabel='Date',
    ylabel='Close',
    # chart title
    title='LMBA GOLD Close',
)

lmba_gold_df.hvplot(**gold_plot_options)

In [219]:
# Persist the gold frame (index included) to the clean-data folder.
gold_clean_csv = "../Resources/clean-data/lmba_gold_clean.csv"
lmba_gold_df.to_csv(gold_clean_csv)

cleaning XLE energy sector etf data and saving to csv

In [220]:
# Load the raw XLE energy sector ETF CSV with its 'Date' column parsed as a
# datetime index, then drop every row that contains a missing value.
# `infer_datetime_format` is intentionally omitted: it is deprecated since
# pandas 2.0, where consistent-format inference is the default behaviour.
energy_df = pd.read_csv(
    "../Resources/XLE Energy Sector ETF.csv",
    index_col='Date',
    parse_dates=True,
).dropna()

# displaying first 10 rows of the dataframe
energy_df.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1998-12-23,23.34375,23.75,23.3125,23.75,13.048624,67800
1998-12-24,23.8125,23.8125,23.59375,23.625,12.979946,12300
1998-12-28,23.75,23.75,23.34375,23.5,12.911271,13500
1998-12-29,23.5625,23.734375,23.34375,23.734375,13.040041,22000
1998-12-30,23.71875,23.71875,23.359375,23.375,12.842599,21800
1998-12-31,23.34375,23.34375,23.21875,23.34375,12.825418,13000
1999-01-04,23.28125,23.75,23.28125,23.328125,12.816839,28600
1999-01-05,23.46875,23.46875,23.171875,23.203125,12.748167,42700
1999-01-06,23.46875,24.0,23.3125,23.96875,13.168812,56400
1999-01-07,23.96875,23.984375,23.578125,23.859375,13.108723,72600


In [221]:
# Plot the energy sector 'Close' values against the 'Date' index.
energy_plot_options = dict(
    # axes data
    x='Date',
    y='Close',
    # axes labels
    xlabel='Date',
    ylabel='Close',
    # chart title
    title='Energy Sector Close',
)

energy_df.hvplot(**energy_plot_options)

In [222]:
# Persist the energy sector frame (index included) to the clean-data folder.
energy_clean_csv = "../Resources/clean-data/energy_sector_clean.csv"
energy_df.to_csv(energy_clean_csv)

Cleaning financial sector data and saving to csv

In [223]:
# Load the raw XLF financial sector ETF CSV with its 'Date' column parsed as a
# datetime index, then drop every row that contains a missing value.
# `infer_datetime_format` is intentionally omitted: it is deprecated since
# pandas 2.0, where consistent-format inference is the default behaviour.
finance_sector_df = pd.read_csv(
    "../Resources/XLF Financial Sector ETF.csv",
    index_col='Date',
    parse_dates=True,
).dropna()

# displaying first 10 rows of the dataframe
finance_sector_df.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1998-12-23,18.861698,19.2171,18.861698,19.2171,11.948964,78784
1998-12-24,19.2171,19.344028,19.153635,19.344028,12.027889,43824
1998-12-28,19.331335,19.331335,19.039398,19.090172,11.87004,51948
1998-12-29,19.115557,19.293259,18.887083,19.293259,11.996318,100819
1998-12-30,19.191713,19.318644,19.090172,19.2171,11.948964,154614
1998-12-31,19.2171,19.242485,19.039398,19.039398,11.838473,65366
1999-01-04,18.810926,19.356722,18.810926,19.039398,11.838473,250632
1999-01-05,18.963242,19.242485,18.963242,19.2171,11.948964,47763
1999-01-06,19.572502,19.851748,19.445574,19.826361,12.327795,129871
1999-01-07,19.623274,20.130991,19.572502,20.130991,12.517207,89986


In [224]:
# Plot the financial sector 'Close' values against the 'Date' index.
finance_plot_options = dict(
    # axes data
    x='Date',
    y='Close',
    # axes labels
    xlabel='Date',
    ylabel='Close',
    # chart title
    title='Financial Sector Close',
)

finance_sector_df.hvplot(**finance_plot_options)

In [225]:
# Persist the financial sector frame (index included) to the clean-data folder.
finance_clean_csv = "../Resources/clean-data/financial_sector_clean.csv"
finance_sector_df.to_csv(finance_clean_csv)

cleaning technology sector data and saving to csv

In [226]:
# Load the raw XLK technology sector ETF CSV with its 'Date' column parsed as
# a datetime index, then drop every row that contains a missing value.
# `infer_datetime_format` is intentionally omitted: it is deprecated since
# pandas 2.0, where consistent-format inference is the default behaviour.
tech_sector_df = pd.read_csv(
    "../Resources/XLK Technology Sector ETF.csv",
    index_col='Date',
    parse_dates=True,
).dropna()

# displaying first 10 rows of the dataframe
tech_sector_df.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1998-12-23,32.75,32.8125,32.15625,32.8125,24.982464,338300
1998-12-24,32.8125,32.8125,32.5625,32.6875,24.8873,243700
1998-12-28,33.1875,33.1875,32.59375,32.78125,24.958675,309300
1998-12-29,32.9375,33.015625,32.5,32.875,25.030058,123800
1998-12-30,32.6875,33.140625,32.578125,32.75,24.934885,95600
1998-12-31,32.75,32.90625,32.34375,32.625,24.839712,77200
1999-01-04,32.65625,33.5625,32.59375,33.0,25.125221,650600
1999-01-05,33.0625,34.03125,33.0625,33.84375,25.767639,295200
1999-01-06,34.6875,34.9375,34.40625,34.84375,26.529005,624700
1999-01-07,34.5,35.03125,34.28125,34.734375,26.445732,534600


In [227]:
# Plot the technology sector 'Close' values against the 'Date' index.
tech_plot_options = dict(
    # axes data
    x='Date',
    y='Close',
    # axes labels
    xlabel='Date',
    ylabel='Close',
    # chart title
    title='Technology Sector Close',
)

tech_sector_df.hvplot(**tech_plot_options)

In [228]:
# Persist the technology sector frame (index included) to the clean-data folder.
tech_clean_csv = "../Resources/clean-data/technology_sector_clean.csv"
tech_sector_df.to_csv(tech_clean_csv)

Utilities sector data cleaned and saved to csv

In [229]:
# Load the raw XLU utilities sector ETF CSV with its 'Date' column parsed as
# a datetime index, then drop every row that contains a missing value.
# `infer_datetime_format` is intentionally omitted: it is deprecated since
# pandas 2.0, where consistent-format inference is the default behaviour.
utilities_sector_df = pd.read_csv(
    "../Resources/XLU Utilities Sector ETF.csv",
    index_col='Date',
    parse_dates=True,
).dropna()

# displaying first 10 rows of the dataframe
utilities_sector_df.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1998-12-23,29.796875,29.796875,29.640625,29.703125,12.827038,24100
1998-12-24,30.21875,30.3125,30.15625,30.25,13.063194,23800
1998-12-28,30.1875,30.25,30.046875,30.09375,12.995724,3100
1998-12-29,30.21875,30.671875,30.21875,30.59375,13.211655,12000
1998-12-30,30.75,30.828125,30.34375,30.34375,13.103686,70400
1998-12-31,30.234375,30.234375,30.03125,30.234375,13.056453,21600
1999-01-04,30.015625,30.25,29.625,29.75,12.847287,109300
1999-01-05,29.984375,30.109375,29.859375,30.109375,13.00247,111200
1999-01-06,30.203125,30.515625,30.203125,30.421875,13.137423,41000
1999-01-07,30.421875,30.734375,30.25,30.3125,13.090192,222300


In [230]:
# Plot the utilities sector 'Close' values against the 'Date' index.
utilities_plot_options = dict(
    # axes data
    x='Date',
    y='Close',
    # axes labels
    xlabel='Date',
    ylabel='Close',
    # chart title
    title='Utility Sector Close',
)

utilities_sector_df.hvplot(**utilities_plot_options)

In [None]:
# Persist the utilities sector frame (index included) to the clean-data folder.
utilities_clean_csv = "../Resources/clean-data/utilities_sector_clean.csv"
utilities_sector_df.to_csv(utilities_clean_csv)