In [19]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.offline as offline
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [20]:
# Number of CSV rows parsed per chunk by the chunked pd.read_csv() call below.
# NOTE: no empty `processed_data` placeholder is created here; the DataFrame is
# built once, from the list of filtered chunks, after the read loop finishes.
chunksize = 10_000

def process_chunk(chunk, start_year=2018, end_year=2022):
    """Return the rows of ``chunk`` whose REF_DATE lies in a year window.

    Parameters
    ----------
    chunk : pandas.DataFrame
        One chunk of the CSV; must contain a ``REF_DATE`` column that can be
        compared against the year bounds.
    start_year, end_year : int, optional
        Inclusive bounds of the window. The defaults (2018 and 2022) reproduce
        the original hard-coded filter.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels. ``chunk`` itself
        is not modified.
    """
    # Series.between is inclusive on both ends by default, matching >= / <=.
    in_window = chunk['REF_DATE'].between(start_year, end_year)
    return chunk[in_window]

# Stream the CSV `chunksize` rows at a time, filtering each piece as it arrives
# so that only the rows kept by process_chunk() are held in memory.
chunks = [
    process_chunk(raw_chunk)
    for raw_chunk in pd.read_csv('Wages_Occupation.csv', chunksize=chunksize)
]

# Combine the filtered pieces into one DataFrame with a fresh 0..n-1 index
processed_data = pd.concat(chunks, ignore_index=True)

'''I created a list called chunks to store the processed chunks and then used pd.concat() to combine them into a single DataFrame.
This approach avoids the deprecation warnings, and reading the file in chunks makes the large dataset (around 1 GB) easier to load.'''



In [21]:
# Keep only the rows describing full-time employees' median hourly wage rate
is_full_time = processed_data['Type of work'] == 'Full-time employees'
is_median_hourly = processed_data['Wages'] == 'Median hourly wage rate'
full_time_data = processed_data[is_full_time & is_median_hourly]

# Mean of VALUE per occupation, flattened back into a two-column DataFrame
average_wages = (
    full_time_data
    .groupby('National Occupational Classification (NOC)')['VALUE']
    .mean()
    .reset_index()
)

# Order occupations from the lowest to the highest average wage (ascending)
sorted_average_wages = average_wages.sort_values(by='VALUE', ascending=True)

In [22]:
# Creating a new column for wage ranges
def wage_range(value):
    """Map a wage value to its band label.

    Bands use inclusive upper bounds: up to 20 is 'Low', above 20 up to 30 is
    'Medium', above 30 up to 40 is 'High', and anything else is 'Very High'.
    A NaN fails every comparison and therefore also lands in 'Very High'.
    """
    upper_bounds = ((20, 'Low'), (30, 'Medium'), (40, 'High'))
    # Bounds are checked in increasing order, so the first match is the band.
    for limit, label in upper_bounds:
        if value <= limit:
            return label
    return 'Very High'

# Tag every occupation with its wage band (adds/overwrites the 'Wage Range' column)
sorted_average_wages['Wage Range'] = sorted_average_wages['VALUE'].map(wage_range)

# Nested treemap: wage band as the outer level, occupations nested inside it.
# Tile size and tile colour are both driven by the VALUE column.
treemap_options = {
    'path': ['Wage Range', 'National Occupational Classification (NOC)'],
    'values': 'VALUE',
    'color': 'VALUE',
    'color_continuous_scale': 'Blues',
    'title': 'Average Wages Across Different Occupations in Canada',
    'labels': {'National Occupational Classification (NOC)': 'Occupation'},
    'hover_data': ['VALUE'],
}
fig = px.treemap(sorted_average_wages, **treemap_options)

# Tile text shows label and value; the hover box shows the value to 2 decimals
fig.update_traces(textinfo='label+value', hovertemplate='<b>%{label}:</b> %{value:.2f}')
# Centre the title horizontally and enlarge it
fig.update_layout(title={'x': 0.5, 'y': 0.95, 'font': {'size': 25}})

# Render inline, then write a standalone HTML copy (auto_open=True also opens it)
fig.show()
offline.plot(fig, filename='fig1.1.html', auto_open=True)

'fig1.1.html'

In [24]:
# Split the occupations into one DataFrame per wage band
band_labels = sorted_average_wages['Wage Range']
low_wages = sorted_average_wages.loc[band_labels == 'Low']
medium_wages = sorted_average_wages.loc[band_labels == 'Medium']
high_wages = sorted_average_wages.loc[band_labels == 'High']
very_high_wages = sorted_average_wages.loc[band_labels == 'Very High']

def custom_color_scale(wages, min_val, max_val):
    """Linearly rescale ``wages`` from [min_val, max_val] onto [0.2, 1].

    Parameters
    ----------
    wages : array-like
        Values to rescale.
    min_val, max_val : float
        Values that map to 0.2 and 1 respectively. Inputs outside this range
        are clamped to the nearest end by ``np.interp``.

    Returns
    -------
    numpy.ndarray
        Rescaled values with the same shape as ``wages``.
    """
    if min_val == max_val:
        # np.interp needs increasing x-coordinates and does not check for it;
        # with a zero-width range its result is not well defined. Map every
        # value to the top of the scale explicitly instead.
        return np.ones(np.shape(wages), dtype=float)
    return np.interp(wages, (min_val, max_val), (0.2, 1))

# One stacked subplot per wage band, listed top to bottom
wage_bands = [
    ("Low", low_wages),
    ("Medium", medium_wages),
    ("High", high_wages),
    ("Very High", very_high_wages),
]

fig = make_subplots(
    rows=len(wage_bands),
    cols=1,
    subplot_titles=tuple(label for label, _ in wage_bands),
    vertical_spacing=0.12,
)

# Add one horizontal bar chart per band, shaded relative to that band's own
# minimum and maximum wage.
for row, (label, band) in enumerate(wage_bands, start=1):
    band_values = band['VALUE']
    fig.add_trace(
        go.Bar(
            y=band['National Occupational Classification (NOC)'],
            x=band_values,
            orientation='h',
            name=label,
            hovertemplate='Average Wage: %{x:.2f}<extra></extra>',
            marker=dict(
                color=custom_color_scale(band_values, band_values.min(), band_values.max()),
                colorscale='Blues',
                # The colours are already scaled into [0.2, 1]. Pin the colour
                # domain to [0, 1]; otherwise Plotly re-normalises to the data's
                # own min/max and the 0.2 floor (which keeps the lowest bar from
                # being drawn near-white) is lost.
                cmin=0,
                cmax=1,
                showscale=False,
            ),
        ),
        row=row,
        col=1,
    )
    # Axis titles for this subplot
    fig.update_xaxes(title_text="Average Wage", row=row, col=1)
    fig.update_yaxes(title_text="Occupation", row=row, col=1)

# Customizing the layout
fig.update_layout(
    height=1200,
    width=1000,
    title_text="Average Wages Across Different Occupations in Canada (By Wage Range)",
    margin=dict(l=300, r=0, t=100, b=0),  # wide left margin leaves room for occupation names
)

# make_subplots stores the subplot titles as layout annotations; shrink their font
for annotation in fig['layout']['annotations']:
    annotation['font'] = dict(size=12)

# Render inline, then write a standalone HTML copy (auto_open=True also opens it)
fig.show()
offline.plot(fig, filename='fig1.2.html', auto_open=True)

'fig1.2.html'