# Imports

In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine, inspect

# Read Excel file and find worksheet names

In [2]:
# Open the budget workbook once so its worksheets can be listed here and parsed later
budget_workbook_path = 'Fed_Budget_Deficit_Surplus_OMB_ERM.xlsx'
bud_xls_file = pd.ExcelFile(budget_workbook_path)
bud_xls_file.sheet_names

['Table']

# Dataframe formatting and cleaning (drop rows, rename columns)

In [3]:
# Parse the 'Table' worksheet of the opened workbook into a dataframe and display it
df = bud_xls_file.parse(sheet_name='Table')
df

Unnamed: 0,"Table 1.3 - SUMMARY OF RECEIPTS, OUTLAYS, AND SURPLUSES OR DEFICITS ( - ) IN CURRENT DOLLARS, CONSTANT (FY 2012) DOLLARS, AND AS PERCENTAGES OF GDP: 1940 - 2024",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,(dollar amounts in billions),,,,,,,,,,
1,Fiscal Year,In Current Dollars,,,In Constant (FY 2012) Dollars,,,Addendum: Composite Deflator,As Percentages of GDP,,
2,,Receipts,Outlays,Surplus or Deficit (-),Receipts,Outlays,Surplus or Deficit (-),,Receipts,Outlays,Surplus or Deficit (-)
3,1940,6.5,9.5,-2.9,100.3,145,-44.7,0.0653,6.7,9.6,-3
4,1941,8.7,13.7,-4.9,120.3,188.6,-68.2,0.0724,7.5,11.7,-4.3
...,...,...,...,...,...,...,...,...,...,...,...
84,2020 estimate,3644.8,4745.6,-1100.8,3209.3,4178.5,-969.3,1.1357,16.3,21.2,-4.9
85,2021 estimate,3876.9,4945.2,-1068.3,3339.8,4260.2,-920.3,1.1608,16.5,21,-4.5
86,2022 estimate,4128.6,5177.5,-1048.8,3479.1,4362.9,-883.8,1.1867,16.7,20.9,-4.2
87,2023 estimate,4421.5,5330.1,-908.6,3644.8,4393.8,-749,1.2131,17,20.5,-3.5


In [4]:
# Identify the columns in the dataframe: the first label is the sheet's title text and the
# rest are "Unnamed: N" placeholders, so none of them describe the data yet.
df.columns

Index(['Table 1.3 - SUMMARY OF RECEIPTS, OUTLAYS, AND SURPLUSES OR DEFICITS ( - ) IN CURRENT DOLLARS, CONSTANT (FY 2012) DOLLARS, AND AS PERCENTAGES OF GDP:  1940 - 2024',
       'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10'],
      dtype='object')

In [5]:
# Map the raw headers (the sheet title plus the "Unnamed: N" placeholders) onto
# descriptive snake_case names. "Unnamed: 7" to "Unnamed: 10" are not in the
# mapping, so they keep their placeholder names.
column_names = {
    'Table 1.3 - SUMMARY OF RECEIPTS, OUTLAYS, AND SURPLUSES OR DEFICITS ( - ) IN CURRENT DOLLARS, CONSTANT (FY 2012) DOLLARS, AND AS PERCENTAGES OF GDP:  1940 - 2024': 'tax_year',
    'Unnamed: 1': 'current_$_receipts',
    'Unnamed: 2': 'current_$_outlays',
    'Unnamed: 3': 'current_$_surplus_deficit',
    'Unnamed: 4': 'constant_$_receipts',
    'Unnamed: 5': 'constant_$_outlays',
    'Unnamed: 6': 'constant_$_surplus_deficit',
}
df = df.rename(columns=column_names)

In [6]:
# Confirm the renames took effect and the structure looks correct. Rows 0-2 still hold the
# spreadsheet's header text, and "Unnamed: 7" to "Unnamed: 10" keep their placeholder names.
df

Unnamed: 0,tax_year,current_$_receipts,current_$_outlays,current_$_surplus_deficit,constant_$_receipts,constant_$_outlays,constant_$_surplus_deficit,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,(dollar amounts in billions),,,,,,,,,,
1,Fiscal Year,In Current Dollars,,,In Constant (FY 2012) Dollars,,,Addendum: Composite Deflator,As Percentages of GDP,,
2,,Receipts,Outlays,Surplus or Deficit (-),Receipts,Outlays,Surplus or Deficit (-),,Receipts,Outlays,Surplus or Deficit (-)
3,1940,6.5,9.5,-2.9,100.3,145,-44.7,0.0653,6.7,9.6,-3
4,1941,8.7,13.7,-4.9,120.3,188.6,-68.2,0.0724,7.5,11.7,-4.3
...,...,...,...,...,...,...,...,...,...,...,...
84,2020 estimate,3644.8,4745.6,-1100.8,3209.3,4178.5,-969.3,1.1357,16.3,21.2,-4.9
85,2021 estimate,3876.9,4945.2,-1068.3,3339.8,4260.2,-920.3,1.1608,16.5,21,-4.5
86,2022 estimate,4128.6,5177.5,-1048.8,3479.1,4362.9,-883.8,1.1867,16.7,20.9,-4.2
87,2023 estimate,4421.5,5330.1,-908.6,3644.8,4393.8,-749,1.2131,17,20.5,-3.5


In [7]:
# Copy the fiscal-year label plus the current- and constant-dollar series into a
# separate dataframe; the remaining "Unnamed: 7" to "Unnamed: 10" columns are left behind.
wanted_columns = [
    'tax_year',
    'current_$_receipts',
    'current_$_outlays',
    'current_$_surplus_deficit',
    'constant_$_receipts',
    'constant_$_outlays',
    'constant_$_surplus_deficit',
]
budget_df = df.loc[:, wanted_columns].copy()

In [8]:
# Confirm the new dataframe holds only the seven selected columns
# (the header-text rows 0-2 are still present at this point).
budget_df

Unnamed: 0,tax_year,current_$_receipts,current_$_outlays,current_$_surplus_deficit,constant_$_receipts,constant_$_outlays,constant_$_surplus_deficit
0,(dollar amounts in billions),,,,,,
1,Fiscal Year,In Current Dollars,,,In Constant (FY 2012) Dollars,,
2,,Receipts,Outlays,Surplus or Deficit (-),Receipts,Outlays,Surplus or Deficit (-)
3,1940,6.5,9.5,-2.9,100.3,145,-44.7
4,1941,8.7,13.7,-4.9,120.3,188.6,-68.2
...,...,...,...,...,...,...,...
84,2020 estimate,3644.8,4745.6,-1100.8,3209.3,4178.5,-969.3
85,2021 estimate,3876.9,4945.2,-1068.3,3339.8,4260.2,-920.3
86,2022 estimate,4128.6,5177.5,-1048.8,3479.1,4362.9,-883.8
87,2023 estimate,4421.5,5330.1,-908.6,3644.8,4393.8,-749


In [9]:
# Drop the rows that aren't needed, by index label, in a single call:
#   0-2   -> spreadsheet header/label rows
#   40    -> NOTE(review): not visible in the previews; presumably a non-year row - confirm
#   83-88 -> rows 84-87 show as "20xx estimate" in the preview; 83 and 88 are
#            presumably estimates as well - confirm
rows_to_drop = [0, 1, 2, 40, 83, 84, 85, 86, 87, 88]
budget_df = budget_df.drop(rows_to_drop)

In [10]:
# Check the column dtypes before conversion (every column is still `object` here)
budget_df.dtypes

tax_year                      object
current_$_receipts            object
current_$_outlays             object
current_$_surplus_deficit     object
constant_$_receipts           object
constant_$_outlays            object
constant_$_surplus_deficit    object
dtype: object

In [11]:
# Convert each column's dtype from object to numeric, one column at a time
numeric_columns = [
    'tax_year',
    'current_$_receipts',
    'current_$_outlays',
    'current_$_surplus_deficit',
    'constant_$_receipts',
    'constant_$_outlays',
    'constant_$_surplus_deficit',
]
for column in numeric_columns:
    budget_df[column] = pd.to_numeric(budget_df[column])

In [12]:
# Re-check the dtypes after conversion: tax_year is int64 and the dollar columns are float64
budget_df.dtypes

tax_year                        int64
current_$_receipts            float64
current_$_outlays             float64
current_$_surplus_deficit     float64
constant_$_receipts           float64
constant_$_outlays            float64
constant_$_surplus_deficit    float64
dtype: object

# Final Dataframe

In [13]:
# Display the cleaned dataframe with numeric dtypes
budget_df

Unnamed: 0,tax_year,current_$_receipts,current_$_outlays,current_$_surplus_deficit,constant_$_receipts,constant_$_outlays,constant_$_surplus_deficit
3,1940,6.5,9.5,-2.9,100.3,145.0,-44.7
4,1941,8.7,13.7,-4.9,120.3,188.6,-68.2
5,1942,14.6,35.1,-20.5,177.8,426.9,-249.1
6,1943,24.0,78.6,-54.6,263.2,861.3,-598.2
7,1944,43.7,91.3,-47.6,522.7,1090.8,-568.2
...,...,...,...,...,...,...,...
78,2014,3021.5,3506.3,-484.8,2932.9,3403.5,-470.6
79,2015,3249.9,3691.8,-442.0,3137.6,3564.2,-426.7
80,2016,3268.0,3852.6,-584.7,3133.2,3693.8,-560.5
81,2017,3316.2,3981.6,-665.4,3126.4,3753.8,-627.4


# Create / Connect to local database

In [14]:
# Build the SQLAlchemy engine for the PostgreSQL database.
# The "user:password@host:port/database" part is read from the TAX_BUDGET_DB_CONNECTION
# environment variable when it is set, so real credentials do not have to be committed
# with the notebook. When it is unset, the local development default is used.
rds_connection_string = os.environ.get(
    'TAX_BUDGET_DB_CONNECTION',
    'postgres:postgres@localhost:5432/TaxAndBudget_db',
)
engine = create_engine(f'postgresql://{rds_connection_string}')

# Check for Tables

In [15]:
 # List the tables in the connected database. Engine.table_names() was deprecated in
 # SQLAlchemy 1.4 and removed in 2.0; the Inspector API returns the same list of names.
 inspect(engine).get_table_names()

['budget']

# Use pandas to load converted DataFrame into database

In [None]:
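# Load the cleaned data into the 'budget' table. Leave this commented out after the first
# load (do NOT run again!): if_exists='append' inserts the same rows again on every run.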
# budget_df.to_sql(name='budget', con=engine, if_exists='append', index=False)