In [1]:
from functools import reduce

import numpy as np
import pandas as pd


## Organisation of Data
A function, `organise_data`, is created to organise each dataset into a common format with the columns Category, Subcategory, Year and Percentage. The categories and their subcategories are:

- **Age Group:** 18-24 years, 25-34 years, 35-44 years, 45-54 years, 55-64 years, 65+ years
- **Gender:** Female, Male
- **Suburb:** Carlton 3053, City of Melbourne, Docklands 3008, East Melbourne 3002, Kensington / Flemington 3031, Melbourne 3000, North Melbourne 3051 / West Melbourne 3003, Parkville 3052, South Wharf / Southbank 3006, South Yarra 3141 / Melbourne/St Kilda Road 3004

In [2]:
def organise_data(data, verbose=True):
  """Reshape a raw survey extract into Category / Subcategory / Year / Percentage rows.

  Each row is labelled from its 'RESPONDENT GROUP' value: 'Age Group' when the
  value contains 'years', 'Gender' when it is 'Male' or 'Female', and 'Suburb'
  otherwise. Rows are ordered Age Group, Suburb, Gender, keeping the input
  order within each category, and the index is renumbered from 1.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns 'RESPONDENT GROUP', 'YEAR' and 'RESULT'; any
      other columns are discarded. The frame passed in is not modified.
  verbose : bool, default True
      Print the organised frame before returning it.

  Returns
  -------
  pandas.DataFrame
      Columns 'Category' (ordered categorical), 'Subcategory', 'Year' and
      'Percentage', with a 1-based integer index.
  """
  category_order = ['Age Group', 'Suburb', 'Gender']
  respondent_group = data['RESPONDENT GROUP']

  # na=False: a missing respondent group must not be treated as matching 'years'
  # (np.where would otherwise see NaN as truthy and label it 'Age Group').
  is_age_group = respondent_group.str.contains('years', na=False)
  is_gender = respondent_group.isin(['Male', 'Female'])
  labels = np.where(is_age_group, 'Age Group',
                    np.where(is_gender, 'Gender', 'Suburb'))

  organised = (
      data
      # assign() works on a copy, so neither the caller's frame nor a slice of it is written to
      .assign(CATEGORIES=pd.Categorical(labels, categories=category_order, ordered=True))
      # Selecting the four wanted columns makes an explicit drop of the others unnecessary
      .loc[:, ['CATEGORIES', 'RESPONDENT GROUP', 'YEAR', 'RESULT']]
      # mergesort is stable: rows keep their input order inside each category
      .sort_values(by='CATEGORIES', kind='mergesort')
      .reset_index(drop=True)
      .rename(columns={
          'CATEGORIES': 'Category',
          'RESPONDENT GROUP': 'Subcategory',
          'YEAR': 'Year',
          'RESULT': 'Percentage'
      })
  )

  # Number the rows from 1 instead of 0
  organised.index = organised.index + 1

  if verbose:
    print(organised)

  return organised

## Load Datasets and Organise Data

Mental Health dataset is loaded for the year 2023.

In [3]:
# Read the 2023 mental-health extract and reshape it into the common layout
file = 'mental_health_2023.csv'
data = pd.read_csv(filepath_or_buffer=file)
mental_data_2023 = organise_data(data=data)

     Category                                      Subcategory  Year  \
1   Age Group                                      25-34 years  2023   
2   Age Group                                      55-64 years  2023   
3   Age Group                                      18-24 years  2023   
4   Age Group                                        65+ years  2023   
5   Age Group                                      45-54 years  2023   
6   Age Group                                      35-44 years  2023   
7      Suburb       North Melbourne 3051 / West Melbourne 3003  2023   
8      Suburb                                   Parkville 3052  2023   
9      Suburb                     South Wharf / Southbank 3006  2023   
10     Suburb                     Kensington / Flemington 3031  2023   
11     Suburb                                City of Melbourne  2023   
12     Suburb                                   Melbourne 3000  2023   
13     Suburb                                     Carlton 3053  

The mental health data for 2022 is loaded next. The 2022 file is not pre-filtered, so a filter on the DESCRIPTION column is applied to extract the mental health rows from the full dataset.

In [4]:
# Keep only the rows whose DESCRIPTION is "Self reported mental health",
# then reshape them into the common layout.
mental_data_2022 = (
    pd.read_csv('mental_health_2022.csv')
    .loc[lambda frame: frame['DESCRIPTION'] == "Self reported mental health"]
)
mental_data_2022 = organise_data(mental_data_2022)
    

     Category                                      Subcategory  Year  \
1   Age Group                                      25-34 years  2022   
2   Age Group                                        65+ years  2022   
3   Age Group                                      55-64 years  2022   
4   Age Group                                      18-24 years  2022   
5   Age Group                                      45-54 years  2022   
6   Age Group                                      35-44 years  2022   
7      Suburb                              East Melbourne 3002  2022   
8      Suburb  South Yarra 3141 / Melbourne/St Kilda Road 3004  2022   
9      Suburb       North Melbourne 3051 / West Melbourne 3003  2022   
10     Suburb                                City of Melbourne  2022   
11     Suburb                     Kensington / Flemington 3031  2022   
12     Suburb                                   Melbourne 3000  2022   
13     Suburb                                   Parkville 3052  

The 2021 data is loaded and the overall health indicator is extracted. Note that for the years 2018–2021 only overall health is recorded.

In [5]:
# Keep only the rows whose DESCRIPTION is "Self reported health" (overall health),
# then reshape them into the common layout.
data_2021 = (
    pd.read_csv('mental_health_2021.csv')
    .loc[lambda frame: frame['DESCRIPTION'] == "Self reported health"]
)
data_2021 = organise_data(data_2021)

     Category                                      Subcategory  Year  \
1   Age Group                                      35-44 years  2021   
2   Age Group                                      55-64 years  2021   
3   Age Group                                        65+ years  2021   
4   Age Group                                      18-24 years  2021   
5   Age Group                                      45-54 years  2021   
6   Age Group                                      25-34 years  2021   
7      Suburb  South Yarra 3141 / Melbourne/St Kilda Road 3004  2021   
8      Suburb                     South Wharf / Southbank 3006  2021   
9      Suburb                                   Parkville 3052  2021   
10     Suburb                     Kensington / Flemington 3031  2021   
11     Suburb                              East Melbourne 3002  2021   
12     Suburb                                   Docklands 3008  2021   
13     Suburb                                City of Melbourne  

Data for the year 2020 is extracted.

In [6]:
# Keep only the rows whose DESCRIPTION is "Self reported health" (overall health),
# then reshape them into the common layout.
data_2020 = (
    pd.read_csv('mental_health_2020.csv')
    .loc[lambda frame: frame['DESCRIPTION'] == "Self reported health"]
)
data_2020 = organise_data(data_2020)

     Category                                      Subcategory  Year  \
1   Age Group                                      35-44 years  2020   
2   Age Group                                      25-34 years  2020   
3   Age Group                                      45-54 years  2020   
4   Age Group                                      55-64 years  2020   
5   Age Group                                      18-24 years  2020   
6   Age Group                                        65+ years  2020   
7      Suburb                                   Parkville 3052  2020   
8      Suburb                                   Melbourne 3000  2020   
9      Suburb                              East Melbourne 3002  2020   
10     Suburb                                     Carlton 3053  2020   
11     Suburb       North Melbourne 3051 / West Melbourne 3003  2020   
12     Suburb  South Yarra 3141 / Melbourne/St Kilda Road 3004  2020   
13     Suburb                                   Docklands 3008  

Data for the year 2019 is extracted.

In [7]:
# Keep only the rows whose DESCRIPTION is "Self reported health" (overall health),
# then reshape them into the common layout.
data_2019 = (
    pd.read_csv('mental_health_2019.csv')
    .loc[lambda frame: frame['DESCRIPTION'] == "Self reported health"]
)
data_2019 = organise_data(data_2019)

     Category                                      Subcategory  Year  \
1   Age Group                                      55-64 years  2019   
2   Age Group                                      35-44 years  2019   
3   Age Group                                      25-34 years  2019   
4   Age Group                                      18-24 years  2019   
5   Age Group                                        65+ years  2019   
6   Age Group                                      45-54 years  2019   
7      Suburb                     South Wharf / Southbank 3006  2019   
8      Suburb       North Melbourne 3051 / West Melbourne 3003  2019   
9      Suburb                                   Melbourne 3000  2019   
10     Suburb                                City of Melbourne  2019   
11     Suburb  South Yarra 3141 / Melbourne/St Kilda Road 3004  2019   
12     Suburb                                   Parkville 3052  2019   
13     Suburb                     Kensington / Flemington 3031  

Data for the year 2018 is extracted.

In [8]:
# Keep only the rows whose DESCRIPTION is "Self reported health" (overall health),
# then reshape them into the common layout.
data_2018 = (
    pd.read_csv('mental_health_2018.csv')
    .loc[lambda frame: frame['DESCRIPTION'] == "Self reported health"]
)
data_2018 = organise_data(data_2018)

     Category                                        Subcategory  Year  \
1   Age Group                                        35-44 years  2018   
2   Age Group                                        55-64 years  2018   
3   Age Group                                          65+ years  2018   
4   Age Group                                        18-24 years  2018   
5   Age Group                                        25-34 years  2018   
6   Age Group                                        45-54 years  2018   
7      Suburb                         Kensington/ Flemingon 3031  2018   
8      Suburb                                     Melbourne 3000  2018   
9      Suburb                                  City of Melbourne  2018   
10     Suburb                                       Carlton 3053  2018   
11     Suburb                                     Docklands 3008  2018   
12     Suburb  South Yarra 3141 / Melbourne (St Kilda Road) 3004  2018   
13     Suburb                         

Extract the physical data for 2023 and organise it accordingly. We need to extract all features of the 2022 and 2023 datasets to get an overall health rating.

In [9]:
# Read the 2023 physical-health file and reshape it into the common layout in one step
physical_2023 = organise_data(pd.read_csv('physical_health_2023.csv'))

     Category                                      Subcategory  Year  \
1   Age Group                                      18-24 years  2023   
2   Age Group                                      45-54 years  2023   
3   Age Group                                      35-44 years  2023   
4   Age Group                                      25-34 years  2023   
5   Age Group                                        65+ years  2023   
6   Age Group                                      55-64 years  2023   
7      Suburb                                City of Melbourne  2023   
8      Suburb                                   Docklands 3008  2023   
9      Suburb       North Melbourne 3051 / West Melbourne 3003  2023   
10     Suburb                                   Parkville 3052  2023   
11     Suburb                                   Melbourne 3000  2023   
12     Suburb                     South Wharf / Southbank 3006  2023   
13     Suburb                              East Melbourne 3002  

Extract physical health feature from 2022 dataset using filter

In [10]:
# The physical-health rows come from the same 2022 file as the mental-health rows:
# keep those whose DESCRIPTION is "Self reported physical health", then reshape.
physical_2022 = (
    pd.read_csv('mental_health_2022.csv')
    .loc[lambda frame: frame['DESCRIPTION'] == "Self reported physical health"]
)
physical_2022 = organise_data(physical_2022)

     Category                                      Subcategory  Year  \
1   Age Group                                      18-24 years  2022   
2   Age Group                                      35-44 years  2022   
3   Age Group                                      45-54 years  2022   
4   Age Group                                        65+ years  2022   
5   Age Group                                      55-64 years  2022   
6   Age Group                                      25-34 years  2022   
7      Suburb                                City of Melbourne  2022   
8      Suburb  South Yarra 3141 / Melbourne/St Kilda Road 3004  2022   
9      Suburb                     South Wharf / Southbank 3006  2022   
10     Suburb                                   Parkville 3052  2022   
11     Suburb                              East Melbourne 3002  2022   
12     Suburb                     Kensington / Flemington 3031  2022   
13     Suburb                                   Docklands 3008  

Extract smoking and vaping data for 2022.

In [11]:
# Keep the rows of the 2022 file whose DESCRIPTION is "Smoking behaviour", then reshape.
smoking_2022 = (
    pd.read_csv('mental_health_2022.csv')
    .loc[lambda frame: frame['DESCRIPTION'] == "Smoking behaviour"]
)
smoking_2022 = organise_data(smoking_2022)

     Category                                      Subcategory  Year  \
1   Age Group                                      35-44 years  2022   
2   Age Group                                        65+ years  2022   
3   Age Group                                      25-34 years  2022   
4   Age Group                                      55-64 years  2022   
5   Age Group                                      18-24 years  2022   
6   Age Group                                      45-54 years  2022   
7      Suburb       North Melbourne 3051 / West Melbourne 3003  2022   
8      Suburb                                   Parkville 3052  2022   
9      Suburb  South Yarra 3141 / Melbourne/St Kilda Road 3004  2022   
10     Suburb                                     Carlton 3053  2022   
11     Suburb                                   Docklands 3008  2022   
12     Suburb                     Kensington / Flemington 3031  2022   
13     Suburb                                   Melbourne 3000  

In [12]:
# Keep the rows of the 2022 file whose DESCRIPTION is "Vaping behaviour", then reshape.
vaping_2022 = (
    pd.read_csv('mental_health_2022.csv')
    .loc[lambda frame: frame['DESCRIPTION'] == "Vaping behaviour"]
)
vaping_2022 = organise_data(vaping_2022)

     Category                                      Subcategory  Year  \
1   Age Group                                        65+ years  2022   
2   Age Group                                      18-24 years  2022   
3   Age Group                                      25-34 years  2022   
4   Age Group                                      45-54 years  2022   
5   Age Group                                      55-64 years  2022   
6   Age Group                                      35-44 years  2022   
7      Suburb                              East Melbourne 3002  2022   
8      Suburb                                   Docklands 3008  2022   
9      Suburb                                     Carlton 3053  2022   
10     Suburb                                   Parkville 3052  2022   
11     Suburb                     South Wharf / Southbank 3006  2022   
12     Suburb       North Melbourne 3051 / West Melbourne 3003  2022   
13     Suburb  South Yarra 3141 / Melbourne/St Kilda Road 3004  

In [13]:
# Give each indicator's value column its own name so the four 2022 frames
# can sit side by side after merging.
mental_data_2022 = mental_data_2022.rename(columns={"Percentage": "Percentage_mental"})
physical_2022 = physical_2022.rename(columns={"Percentage": "Percentage_physical"})
smoking_2022 = smoking_2022.rename(columns={"Percentage": "Percentage_smoking"})
vaping_2022 = vaping_2022.rename(columns={"Percentage": "Percentage_vaping"})

# Frames to merge, and the value columns they contribute (defined once, used twice below)
dataframes = [mental_data_2022, physical_2022, smoking_2022, vaping_2022]
indicator_columns = ['Percentage_mental', 'Percentage_physical', 'Percentage_smoking', 'Percentage_vaping']

# Merge all frames on ['Category', 'Subcategory', 'Year'].
# how='inner' (the pd.merge default, now spelled out): a subcategory that is
# missing from any one frame is absent from the merged result.
merged_df_2022 = reduce(
    lambda left, right: pd.merge(left, right, on=['Category', 'Subcategory', 'Year'], how='inner'),
    dataframes
)

# Replace the four indicator columns with their row-wise mean, stored as 'percentage'
merged_df_2022 = (
    merged_df_2022
    .assign(percentage=merged_df_2022[indicator_columns].mean(axis=1))
    .drop(columns=indicator_columns)
)

# Display the updated DataFrame
print(merged_df_2022)


     Category                                      Subcategory  Year  \
0   Age Group                                      25-34 years  2022   
1   Age Group                                        65+ years  2022   
2   Age Group                                      55-64 years  2022   
3   Age Group                                      18-24 years  2022   
4   Age Group                                      45-54 years  2022   
5   Age Group                                      35-44 years  2022   
6      Suburb                              East Melbourne 3002  2022   
7      Suburb  South Yarra 3141 / Melbourne/St Kilda Road 3004  2022   
8      Suburb       North Melbourne 3051 / West Melbourne 3003  2022   
9      Suburb                                City of Melbourne  2022   
10     Suburb                     Kensington / Flemington 3031  2022   
11     Suburb                                   Melbourne 3000  2022   
12     Suburb                                   Parkville 3052  

Extract smoking and vaping for 2023 data sets.

In [14]:
# Read the 2023 smoking file and reshape it into the common layout in one step
smoking_2023 = organise_data(pd.read_csv('smoking_2023 (2).csv'))

     Category                                      Subcategory  Year  \
1   Age Group                                      25-34 years  2023   
2   Age Group                                      45-54 years  2023   
3   Age Group                                        65+ years  2023   
4   Age Group                                      55-64 years  2023   
5   Age Group                                      35-44 years  2023   
6   Age Group                                      18-24 years  2023   
7      Suburb                                   Parkville 3052  2023   
8      Suburb                              East Melbourne 3002  2023   
9      Suburb                                City of Melbourne  2023   
10     Suburb                                     Carlton 3053  2023   
11     Suburb                     South Wharf / Southbank 3006  2023   
12     Suburb                                   Melbourne 3000  2023   
13     Suburb                                   Docklands 3008  

In [15]:
# Read the 2023 vaping file and reshape it into the common layout in one step
vaping_2023 = organise_data(pd.read_csv('vaping_2023.csv'))

     Category                                      Subcategory  Year  \
1   Age Group                                      35-44 years  2023   
2   Age Group                                      45-54 years  2023   
3   Age Group                                      55-64 years  2023   
4   Age Group                                      18-24 years  2023   
5   Age Group                                        65+ years  2023   
6   Age Group                                      25-34 years  2023   
7      Suburb                              East Melbourne 3002  2023   
8      Suburb                                   Docklands 3008  2023   
9      Suburb                                City of Melbourne  2023   
10     Suburb       North Melbourne 3051 / West Melbourne 3003  2023   
11     Suburb                                   Melbourne 3000  2023   
12     Suburb                     South Wharf / Southbank 3006  2023   
13     Suburb                     Kensington / Flemington 3031  

In [16]:
# Give each indicator's value column its own name so the four 2023 frames
# can sit side by side after merging.
mental_data_2023 = mental_data_2023.rename(columns={"Percentage": "Percentage_mental"})
physical_2023 = physical_2023.rename(columns={"Percentage": "Percentage_physical"})
smoking_2023 = smoking_2023.rename(columns={"Percentage": "Percentage_smoking"})
vaping_2023 = vaping_2023.rename(columns={"Percentage": "Percentage_vaping"})

# Frames to merge, and the value columns they contribute (defined once, used twice below)
dataframes_23 = [mental_data_2023, physical_2023, smoking_2023, vaping_2023]
indicator_columns = ['Percentage_mental', 'Percentage_physical', 'Percentage_smoking', 'Percentage_vaping']

# Merge all frames on ['Category', 'Subcategory', 'Year'].
# how='inner' (the pd.merge default, now spelled out): a subcategory that is
# missing from any one frame is absent from the merged result.
merged_df_2023 = reduce(
    lambda left, right: pd.merge(left, right, on=['Category', 'Subcategory', 'Year'], how='inner'),
    dataframes_23
)

# Replace the four indicator columns with their row-wise mean, stored as 'percentage'
merged_df_2023 = (
    merged_df_2023
    .assign(percentage=merged_df_2023[indicator_columns].mean(axis=1))
    .drop(columns=indicator_columns)
)

# Display the updated DataFrame
print(merged_df_2023)


     Category                                      Subcategory  Year  \
0   Age Group                                      25-34 years  2023   
1   Age Group                                      55-64 years  2023   
2   Age Group                                      18-24 years  2023   
3   Age Group                                        65+ years  2023   
4   Age Group                                      45-54 years  2023   
5   Age Group                                      35-44 years  2023   
6      Suburb       North Melbourne 3051 / West Melbourne 3003  2023   
7      Suburb                                   Parkville 3052  2023   
8      Suburb                     South Wharf / Southbank 3006  2023   
9      Suburb                     Kensington / Flemington 3031  2023   
10     Suburb                                City of Melbourne  2023   
11     Suburb                                   Melbourne 3000  2023   
12     Suburb                                     Carlton 3053  

In [18]:
# Stack the yearly frames into one long table to get the overall health indicator.
# NOTE(review): the printed 2018 output shows suburb labels spelled differently from
# later years (e.g. 'Kensington/ Flemingon 3031', 'Southbank/ South Wharf 3006'), so
# rows for the same suburb will not share a Subcategory value across years - confirm
# whether the labels should be normalised before comparing years.
dfs = [merged_df_2023, merged_df_2022, data_2021, data_2020, data_2019, data_2018]
result = pd.concat(dfs, axis=0, ignore_index=True)

# The merged 2022/2023 frames carry 'percentage' while the 2018-2021 frames carry
# 'Percentage': fill the gaps in the former from the latter, then drop the spare column.
result = (
    result
    .assign(percentage=result['percentage'].fillna(result['Percentage']))
    .drop(columns=['Percentage'])
)

# Display the updated DataFrame
print(result)


      Category                                 Subcategory  Year  percentage
0    Age Group                                 25-34 years  2023      32.475
1    Age Group                                 55-64 years  2023      26.400
2    Age Group                                 18-24 years  2023      31.100
3    Age Group                                   65+ years  2023      29.975
4    Age Group                                 45-54 years  2023      29.400
..         ...                                         ...   ...         ...
103     Suburb  North Melbourne 3051 / West Melbourne 3003  2018      56.000
104     Suburb                              Parkville 3052  2018      44.400
105     Suburb                 Southbank/ South Wharf 3006  2018      44.600
106     Gender                                      Female  2018      48.700
107     Gender                                        Male  2018      53.800

[108 rows x 4 columns]


In [19]:
# Write the combined table to disk; index=False leaves the row index out of the file
result.to_csv(path_or_buf='self_reported_health.csv', index=False)