# Average Coders

### Gathering and Cleaning the Data

In [1]:
import pandas as pd


Interest Rates

In [2]:
# Load the short-term interest rate series from its CSV file into a dataframe.
# The 3-month US Treasury Yield stands in for short-term interest rates.
short_term_csv = "Interest_Rates/US_Treasury/GS3M.csv"
# Map the DATE / VALUE headers to the column names used in this notebook.
short_term_df = pd.read_csv(short_term_csv).rename(
    columns={"DATE": "date", "VALUE": "st_interest"}
)
print(f'The number of rows of data for 3-month Treasury is {len(short_term_df.index)}')
short_term_df.head(5)

The number of rows of data for 3-month Treasury is 444


Unnamed: 0,date,st_interest
0,1982-01-01,12.92
1,1982-02-01,14.28
2,1982-03-01,13.31
3,1982-04-01,13.34
4,1982-05-01,12.71


In [3]:
# The 2-year US Treasury Yield stands in for intermediate-term interest rates.
interm_term_csv = "Interest_Rates/US_Treasury/GS2.csv"
# Map the DATE / VALUE headers to the column names used in this notebook.
interm_term_df = pd.read_csv(interm_term_csv).rename(
    columns={"DATE": "date", "VALUE": "it_interest"}
)
print(f'The number of rows of data for 2-year Treasury is {len(interm_term_df.index)}')
interm_term_df.head(5)

The number of rows of data for 2-year Treasury is 511


Unnamed: 0,date,it_interest
0,1976-06-01,7.06
1,1976-07-01,6.85
2,1976-08-01,6.63
3,1976-09-01,6.42
4,1976-10-01,5.98


In [4]:
# The 10-year US Treasury Yield stands in for long-term interest rates.
long_term_csv = "Interest_Rates/US_Treasury/GS10.csv"
# Map the DATE / VALUE headers to the column names used in this notebook.
long_term_df = pd.read_csv(long_term_csv).rename(
    columns={"DATE": "date", "VALUE": "lt_interest"}
)
print(f'The number of rows of data for 10-year Treasury is {len(long_term_df.index)}')
long_term_df.head(5)

The number of rows of data for 10-year Treasury is 789


Unnamed: 0,date,lt_interest
0,1953-04-01,2.83
1,1953-05-01,3.05
2,1953-06-01,3.11
3,1953-07-01,2.93
4,1953-08-01,2.95


In [5]:
# Combine the short-, intermediate- and long-term Treasury yields into one dataframe.
# Rows are matched on "date"; merge defaults to an inner join, so only dates
# present in every input are kept.
merged_si = short_term_df.merge(interm_term_df, on="date")
interest_rates_df = merged_si.merge(long_term_df, on="date")
print(f'The number of rows of data for all interest rates is {len(interest_rates_df.index)}')
interest_rates_df.head()

The number of rows of data for all interest rates is 444


Unnamed: 0,date,st_interest,it_interest,lt_interest
0,1982-01-01,12.92,14.57,14.59
1,1982-02-01,14.28,14.82,14.43
2,1982-03-01,13.31,14.19,13.86
3,1982-04-01,13.34,14.2,13.87
4,1982-05-01,12.71,13.78,13.62


Inflation

In [6]:
# Load the monthly CPI inflation data from its CSV file.
inflation_csv = "CPI_Inflation/monthly_cpi.csv"
inflation_df = pd.read_csv(inflation_csv)
print(f'The number of rows of data for CPI Inflation is {len(inflation_df.index)}')
inflation_df.head(5)

The number of rows of data for CPI Inflation is 869


Unnamed: 0,date,cpi_index,pct_chg_cpi
0,1947-01-01,21.48,
1,1947-02-01,21.62,
2,1947-03-01,22.0,
3,1947-04-01,22.0,
4,1947-05-01,21.95,


In [8]:
# Join the interest rates and the inflation data into one dataframe,
# matching rows on "date" (inner join by default).
int_and_infl_df = interest_rates_df.merge(inflation_df, on="date")
print(f'The number of rows of data for interest rates and inflation is {len(int_and_infl_df.index)}')
int_and_infl_df.head()

The number of rows of data for interest rates and inflation is 444


Unnamed: 0,date,st_interest,it_interest,lt_interest,cpi_index,pct_chg_cpi
0,1982-01-01,12.92,14.57,14.59,94.4,8.26
1,1982-02-01,14.28,14.82,14.43,94.7,7.61
2,1982-03-01,13.31,14.19,13.86,94.7,6.88
3,1982-04-01,13.34,14.2,13.87,95.0,6.62
4,1982-05-01,12.71,13.78,13.62,95.9,6.91


Oil Prices

In [13]:
# Load the monthly oil price data from its CSV file.
oil_csv = "Oil_Prices/monthly_oil_prices.csv"
oil_df = pd.read_csv(oil_csv)
print(f'The number of rows of data for Oil Prices is {len(oil_df.index)}')
oil_df.head(5)

The number of rows of data for Oil Prices is 401


Unnamed: 0,date,oil_price,pct_chg_oil_price
0,1986-01-01,22.93,
1,1986-02-01,15.46,
2,1986-03-01,12.61,
3,1986-04-01,12.84,
4,1986-05-01,15.38,


In [11]:
# Add the oil prices to the main dataframe, matching rows on "date"
# (inner join by default, so dates missing from either side are dropped).
ii_and_oil_df = int_and_infl_df.merge(oil_df, on="date")
print(f'The number of rows of data for the main dataframe is now {len(ii_and_oil_df.index)}')
ii_and_oil_df.head()

The number of rows of data for the main dataframe is now 396


Unnamed: 0,date,st_interest,it_interest,lt_interest,cpi_index,pct_chg_cpi,oil_price,pct_chg_oil_price
0,1986-01-01,7.3,8.14,9.19,109.9,3.97,22.93,
1,1986-02-01,7.29,7.97,8.7,109.7,3.2,15.46,
2,1986-03-01,6.76,7.21,7.78,109.1,2.15,12.61,
3,1986-04-01,6.24,6.7,7.3,108.7,1.59,12.84,
4,1986-05-01,6.33,7.07,7.71,109.0,1.68,15.38,


Mortgage Originations Data

In [15]:
# Load the mortgage originations data from its CSV file.
mortgage_csv = "Mortgage_data/mortgage_data.csv"
mortgage_df = pd.read_csv(mortgage_csv)
print(f'The number of rows of data for Mortgage Data is {len(mortgage_df.index)}')
mortgage_df.head(5)

The number of rows of data for Mortgage Data is 358


Unnamed: 0,date,mtg_purchase,mtg_refi,pct_chg_mtg_purchase,pct_cht_mtg_refi
0,1990-01-01,31.0,7.0,,
1,1990-02-01,31.0,7.0,,
2,1990-03-01,31.0,7.0,,
3,1990-04-01,36.0,5.0,,
4,1990-05-01,36.0,5.0,,


In [16]:
# Add the mortgage data to the main dataframe, matching rows on "date"
# (inner join by default, so dates missing from either side are dropped).
main_and_mortgage_df = ii_and_oil_df.merge(mortgage_df, on="date")
print(f'The number of rows of data for the main dataframe is now {len(main_and_mortgage_df.index)}')
main_and_mortgage_df.head()

The number of rows of data for the main dataframe is now 348


Unnamed: 0,date,st_interest,it_interest,lt_interest,cpi_index,pct_chg_cpi,oil_price,pct_chg_oil_price,mtg_purchase,mtg_refi,pct_chg_mtg_purchase,pct_cht_mtg_refi
0,1990-01-01,7.9,8.09,8.21,127.5,5.2,22.86,26.86,31.0,7.0,,
1,1990-02-01,8.0,8.37,8.47,128.0,5.26,22.11,23.24,31.0,7.0,,
2,1990-03-01,8.17,8.63,8.59,128.6,5.24,20.39,4.67,31.0,7.0,,
3,1990-04-01,8.04,8.72,8.79,128.9,4.71,18.43,-12.53,36.0,5.0,,
4,1990-05-01,8.01,8.64,8.76,129.1,4.37,18.2,-9.54,36.0,5.0,,


US Employment

In [17]:
# Load the monthly US employment data from its CSV file.
employment_csv = "US_Employment/monthly_employment.csv"
employment_df = pd.read_csv(employment_csv)
print(f'The number of rows of data for Employment Data is {len(employment_df.index)}')
employment_df.head(5)

The number of rows of data for Employment Data is 965


Unnamed: 0,date,employment,pct_chg_employment
0,1939-01-01,29923,
1,1939-02-01,30100,
2,1939-03-01,30280,
3,1939-04-01,30094,
4,1939-05-01,30299,


In [18]:
# Add the employment data to the main dataframe, matching rows on "date"
# (inner join by default, so dates missing from either side are dropped).
main_and_employment_df = main_and_mortgage_df.merge(employment_df, on="date")
print(f'The number of rows of data for the main dataframe is now {len(main_and_employment_df.index)}')
main_and_employment_df.head()

The number of rows of data for the main dataframe is now 348


Unnamed: 0,date,st_interest,it_interest,lt_interest,cpi_index,pct_chg_cpi,oil_price,pct_chg_oil_price,mtg_purchase,mtg_refi,pct_chg_mtg_purchase,pct_cht_mtg_refi,employment,pct_chg_employment
0,1990-01-01,7.9,8.09,8.21,127.5,5.2,22.86,26.86,31.0,7.0,,,109197,1.9
1,1990-02-01,8.0,8.37,8.47,128.0,5.26,22.11,23.24,31.0,7.0,,,109435,1.87
2,1990-03-01,8.17,8.63,8.59,128.6,5.24,20.39,4.67,31.0,7.0,,,109644,1.88
3,1990-04-01,8.04,8.72,8.79,128.9,4.71,18.43,-12.53,36.0,5.0,,,109688,1.76
4,1990-05-01,8.01,8.64,8.76,129.1,4.37,18.2,-9.54,36.0,5.0,,,109839,1.78


Average Hourly Earnings

In [19]:
# Load the average hourly earnings data from its CSV file.
earnings_csv = "Avg_Hourly_Earnings/avg_hourly_earnings.csv"
earnings_df = pd.read_csv(earnings_csv)
print(f'The number of rows of data for Avg Hourly Earnings Data is {len(earnings_df.index)}')
earnings_df.head(5)

The number of rows of data for Avg Hourly Earnings Data is 665


Unnamed: 0,date,avg_earnings,pct_chg_avg_earnings
0,1964-01-01,2.5,
1,1964-02-01,2.5,
2,1964-03-01,2.51,
3,1964-04-01,2.52,
4,1964-05-01,2.52,


In [20]:
# Add the average hourly earnings data to the main dataframe, matching rows on "date"
# (inner join by default, so dates missing from either side are dropped).
main_and_earnings_df = main_and_employment_df.merge(earnings_df, on="date")
print(f'The number of rows of data for the main dataframe is now {len(main_and_earnings_df.index)}')
main_and_earnings_df.head()

The number of rows of data for the main dataframe is now 348


Unnamed: 0,date,st_interest,it_interest,lt_interest,cpi_index,pct_chg_cpi,oil_price,pct_chg_oil_price,mtg_purchase,mtg_refi,pct_chg_mtg_purchase,pct_cht_mtg_refi,employment,pct_chg_employment,avg_earnings,pct_chg_avg_earnings
0,1990-01-01,7.9,8.09,8.21,127.5,5.2,22.86,26.86,31.0,7.0,,,109197,1.9,10.02,3.83
1,1990-02-01,8.0,8.37,8.47,128.0,5.26,22.11,23.24,31.0,7.0,,,109435,1.87,10.07,4.03
2,1990-03-01,8.17,8.63,8.59,128.6,5.24,20.39,4.67,31.0,7.0,,,109644,1.88,10.11,4.23
3,1990-04-01,8.04,8.72,8.79,128.9,4.71,18.43,-12.53,36.0,5.0,,,109688,1.76,10.12,3.79
4,1990-05-01,8.01,8.64,8.76,129.1,4.37,18.2,-9.54,36.0,5.0,,,109839,1.78,10.16,4.31


# Dave, insert your API code here. Add it to "main_and_earnings_df". When done, name the last dataframe "main_???_df". In the cell below, ??? is "with_API".

In [25]:
# No API data is added in this cell: the API-stage name refers to the same
# dataframe object as the earnings-stage one (an alias, not a copy).
main_with_API_df = main_and_earnings_df

# Keep only the rows in which every column has a value; a single NaN drops the row.
main_df = main_with_API_df.dropna(axis=0, how="any")
print(f'The number of rows of data for the clean main dataframe is now {len(main_df.index)}')
main_df.head()

The number of rows of data for the clean main dataframe is now 336


Unnamed: 0,date,st_interest,it_interest,lt_interest,cpi_index,pct_chg_cpi,oil_price,pct_chg_oil_price,mtg_purchase,mtg_refi,pct_chg_mtg_purchase,pct_cht_mtg_refi,employment,pct_chg_employment,avg_earnings,pct_chg_avg_earnings
12,1991-01-01,6.41,7.13,8.09,134.7,5.65,25.23,10.37,25.0,11.0,-21.28,45.45,109056,-0.13,10.38,3.59
13,1991-02-01,6.12,6.87,7.85,134.8,5.31,20.48,-7.37,25.0,11.0,-21.28,45.45,108735,-0.64,10.39,3.18
14,1991-03-01,6.09,7.1,8.11,134.8,4.82,19.9,-2.4,25.0,11.0,-21.28,45.45,108576,-0.97,10.41,2.97
15,1991-04-01,5.83,6.95,8.04,135.1,4.81,20.83,13.02,33.0,12.0,-6.54,118.75,108367,-1.2,10.46,3.36
16,1991-05-01,5.63,6.78,8.07,135.6,5.03,21.23,16.65,33.0,12.0,-6.54,118.75,108251,-1.45,10.49,3.25


In [27]:
# Save the cleaned dataframe as a CSV file, leaving out the row index.
output_filename = "main_dataframe.csv"
main_df.to_csv(path_or_buf=output_filename, index=False)