In [42]:
import pandas as pd
from pathlib import Path
import hvplot.pandas

Fixing up S&P 500 CSV

In [43]:
# Load the S&P 500 price history. The date column arrives labelled
# 'Unnamed: 0'; it is parsed as datetimes and used as the index, and any row
# containing a NaN is dropped.
# infer_datetime_format is intentionally not passed: it is deprecated since
# pandas 2.0 and no longer needed for consistent date parsing.
sandp500_df = (
    pd.read_csv("../Resources/S&P500.csv", index_col='Unnamed: 0', parse_dates=True)
    .dropna()
    # The dates are the index, not a column, so rename(columns=...) cannot
    # reach them; rename_axis relabels the index itself as 'Date'.
    .rename_axis('Date')
)

# displaying first 10 rows of the dataframe
sandp500_df.head(10)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
2023-01-13,3960.6,4003.95,3947.67,3999.09,3999.09,2305897000
2023-01-09,3910.82,4003.95,3877.29,3999.09,3999.09,20846120000
2023-01-02,3853.29,3906.19,3794.33,3895.08,3895.08,16190230000
2022-12-26,3843.34,3858.19,3780.78,3839.5,3839.5,12097370000
2022-12-19,3853.79,3889.82,3764.49,3844.82,3844.82,18506410000
2022-12-12,3939.29,4100.96,3827.91,3852.36,3852.36,25443390000
2022-12-05,4052.02,4052.45,3918.39,3934.38,3934.38,20662410000
2022-11-28,4005.36,4100.51,3937.65,4071.7,4071.7,22280580000
2022-11-21,3956.23,4034.02,3933.34,4026.12,4026.12,12724860000
2022-11-14,3977.97,4028.84,3906.54,3965.34,3965.34,21831700000


In [44]:
# Strip thousands separators, blank out dash-only cells, and convert every
# column of the dataframe to a numeric dtype.

for column in sandp500_df.columns:
    cleaned = sandp500_df[column].replace(",", "", regex=True)
    # Only a cell that consists solely of "-" (presumably a missing-value
    # placeholder) is blanked. Removing every hyphen would also strip the
    # sign from negative numbers and from exponents such as "1e-5".
    cleaned = cleaned.replace(r"^\s*-\s*$", "", regex=True)
    sandp500_df[column] = pd.to_numeric(cleaned)

# displaying the converted data from the s&p 500 csv file
sandp500_df.head(10)


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
2023-01-13,3960.6,4003.95,3947.67,3999.09,3999.09,2305897000.0
2023-01-09,3910.82,4003.95,3877.29,3999.09,3999.09,20846120000.0
2023-01-02,3853.29,3906.19,3794.33,3895.08,3895.08,16190230000.0
2022-12-26,3843.34,3858.19,3780.78,3839.5,3839.5,12097370000.0
2022-12-19,3853.79,3889.82,3764.49,3844.82,3844.82,18506410000.0
2022-12-12,3939.29,4100.96,3827.91,3852.36,3852.36,25443390000.0
2022-12-05,4052.02,4052.45,3918.39,3934.38,3934.38,20662410000.0
2022-11-28,4005.36,4100.51,3937.65,4071.7,4071.7,22280580000.0
2022-11-21,3956.23,4034.02,3933.34,4026.12,4026.12,12724860000.0
2022-11-14,3977.97,4028.84,3906.54,3965.34,3965.34,21831700000.0


In [45]:
# showing the plottable, converted s&p 500 data

sandp500_df.hvplot(
    # setting axes data: the dates live in the index, so the x axis is
    # addressed by the index name (0 is not a label in this dataframe)
    x=sandp500_df.index.name, y='Close',
    # setting axes labels
    xlabel='Date', ylabel='Close',
    # title
    title='S&P 500 Close',
)

In [46]:
# writing the cleaned s&p 500 dataframe to csv

sandp500_clean_path = Path("../Resources/clean-data/s-and-p-500_clean.csv")
# to_csv does not create missing folders, so make sure the target one exists
sandp500_clean_path.parent.mkdir(parents=True, exist_ok=True)
sandp500_df.to_csv(sandp500_clean_path)


TNX treasury data cleaning

In [47]:
# Load the TNX 10-year treasury yield history with the 'Date' column parsed
# as datetimes and used as the index, then drop any row containing a NaN.
# infer_datetime_format is intentionally not passed: it is deprecated since
# pandas 2.0 and no longer needed for consistent date parsing.
tnx_df = (
    pd.read_csv("../Resources/^TNX Treasury Yield 10 Year.csv", index_col='Date', parse_dates=True)
    .dropna()
)

# displaying first 10 rows of the dataframe
tnx_df.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1998-12-21,4.591,4.65,4.591,4.626,4.626,0.0
1998-12-22,4.702,4.702,4.658,4.69,4.69,0.0
1998-12-23,4.71,4.793,4.698,4.781,4.781,0.0
1998-12-24,4.789,4.878,4.785,4.846,4.846,0.0
1998-12-28,4.818,4.822,4.761,4.769,4.769,0.0
1998-12-29,4.777,4.793,4.693,4.701,4.701,0.0
1998-12-30,4.682,4.685,4.634,4.642,4.642,0.0
1998-12-31,4.634,4.678,4.618,4.638,4.638,0.0
1999-01-04,4.638,4.721,4.634,4.677,4.677,0.0
1999-01-05,4.725,4.745,4.709,4.729,4.729,0.0


In [48]:

# Plot the TNX close against the date index.
# The options are collected first so the axes, labels and title can be read
# (and tweaked) in one place before the chart is drawn.
tnx_close_plot_opts = dict(
    # axes data
    x='Date',
    y='Close',
    # axes labels
    xlabel='Date',
    ylabel='Close',
    # chart title
    title='TNX Treasury Close',
)
tnx_df.hvplot(**tnx_close_plot_opts)

In [None]:
# writing the cleaned tnx dataframe to csv

tnx_clean_path = Path("../Resources/clean-data/tnx_clean.csv")
# to_csv does not create missing folders, so make sure the target one exists
tnx_clean_path.parent.mkdir(parents=True, exist_ok=True)
tnx_df.to_csv(tnx_clean_path)