In [613]:
# Dependencies and Setup
import pandas as pd  
import numpy as np

In [614]:
# Specify the path to your Excel file
household_income_data = "Resources/5204055011do001.xlsx"  

In [615]:
# Define the row indices to skip
rows_to_skip = list(range(0, 6)) + [7] + list(range(9, 26)) + list(range(27, 40)) + list(range(42, 127))  

In [616]:
# Read ABS income data file and store into Pandas DataFrames
dfs = pd.read_excel(household_income_data, sheet_name=None, skiprows=rows_to_skip)  

In [617]:
# Create an empty list to store DataFrames with tab names as a column
dfs_with_tab_name = [] 

In [618]:
# Flag to skip the first and last tab
skip_first_tab = True
skip_last_tab = False

In [619]:
# Iterate over each sheet in the dictionary
for sheet_name, df in dfs.items():
    if skip_first_tab:
        skip_first_tab = False
        continue  # Skip processing the first tab
        
    if skip_last_tab:
        break  # Exit the loop if the last tab is reached
        
    # Determine the column indices dynamically based on the actual number of columns
    num_cols = min(7, len(df.columns))
    
    # Ensure we don't exceed the number of columns in the DataFrame
    columns_to_read = list(range(num_cols))
    
    # Select only the specified columns
    df_selected = df.iloc[:, columns_to_read].copy()
    
    # Add a new column with the tab name to the selected DataFrame using loc
    df_selected.loc[:, 'Year/s'] = sheet_name
    
    # Append the modified DataFrame to the list
    dfs_with_tab_name.append(df_selected)     

In [620]:
# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(dfs_with_tab_name, ignore_index=True)  

In [621]:
# Define a dictionary to map old values to new values
value_mapping = {'Table 1.1': '2003-2004', 'Table 1.2': '2005-2006', 'Table 1.3': '2007-2008', 'Table 1.4': '2009-2010', 'Table 1.5': '2011-2012', 'Table 1.6': '2013-2014', 'Table 1.7': '2015-2016', 'Table 1.8': '2017-2018', 'Table 1.9': '2019-2020', 'Table 1.10': '2020-2021', 'Table 1.11': '2021-2022', 'Explanatory Notes': '', 'tab_name': 'Year/s'}

In [622]:
# Replace values in column 'tab_name' using the value_mapping dictionary
combined_df['Year/s'] = combined_df['Year/s'].replace(value_mapping)

In [623]:
# Remove rows where column 'Year/s' has value '2020-2021' or '2021-2022'
values_to_remove = ['2020-2021', '2021-2022']
cleaned_df = combined_df[~combined_df['Year/s'].isin(values_to_remove)]

In [624]:
# Convert values to numeric data type
cleaned_df.loc[:, 'Wages and salaries'] = cleaned_df['Wages and salaries'].astype(float)
cleaned_df.loc[:, 'Income from  unincorporated business (a)'] = cleaned_df['Income from  unincorporated business (a)'].astype(float)
cleaned_df.loc[:, 'Property income and superannuation'] = cleaned_df['Property income and superannuation'].astype(float)
cleaned_df.loc[:, 'Government pensions and allowances'] = cleaned_df['Government pensions and allowances'].astype(float)
cleaned_df.loc[:, 'Other'] = cleaned_df['Other'].astype(float)
cleaned_df.loc[:, 'All households (b)'] = cleaned_df['All households (b)'].astype(float)
cleaned_df.loc[:, 'Year/s'] = cleaned_df['Year/s'].astype(str)

In [625]:
# Specify the file path for the CSV file
csv_file_path = 'ABS_income_data.csv'  

In [626]:
# Save the DataFrame to a CSV file
cleaned_df.to_csv(csv_file_path, index=False) 

In [627]:
print(f"DataFrame successfully saved to '{csv_file_path}'.")

DataFrame successfully saved to 'ABS_income_data.csv'.


In [628]:
# Create a pivot table with 'Category' as rows, 'Year' as columns, and 'Value' as values
pivot_df = cleaned_df.pivot_table(columns='Year/s', values='All households (b)', aggfunc='sum')


In [629]:
# rename the index as "Household income"
pivot_df = cleaned_df.pivot_table(index='Unnamed: 0', columns='Year/s', values='All households (b)', aggfunc='sum')
pivot_df = pivot_df.rename_axis('Household income')

In [630]:
# Define a function to format values as currency with 2 decimal places
def format_currency(value):
    return "${:,.2f}".format(value)

In [631]:
# Apply the currency formatting with 2 decimal places to the pivot table values
formatted_pivot_df = pivot_df.map(format_currency)

In [632]:
# Display the formatted pivot table
formatted_pivot_df
##########NEED TO FIX FORMATTING OF FIRST ROW############################

Year/s,2003-2004,2005-2006,2007-2008,2009-2010,2011-2012,2013-2014,2015-2016,2017-2018,2019-2020
Household income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Estimated number of households in population,"$7,954,585.00","$8,160,856.00","$8,327,818.00","$8,664,857.00","$8,912,566.00","$9,048,583.00","$9,246,191.00","$9,554,316.00","$10,016,972.00"
Gross disposable income,"$536,356.00","$608,301.00","$725,929.00","$827,246.00","$939,989.00","$1,022,855.00","$1,095,050.00","$1,165,497.00","$1,282,944.00"
Total gross income,"$711,537.00","$819,132.00","$985,903.00","$1,083,258.00","$1,248,033.00","$1,336,473.00","$1,433,237.00","$1,530,133.00","$1,656,039.00"
Total income payable,"$175,181.00","$210,831.00","$259,974.00","$256,011.00","$308,044.00","$313,618.00","$338,187.00","$364,636.00","$373,095.00"
