In [1]:
# Import dependencies
import pandas as pd
import os

In [2]:
# Below are the ABSOLUTE paths to the csv files (since they are too big for git without large file handling)
    # PATHS WILL BE DIFFERENT FOR EACH COMPUTER

census_file_cd_csd = "C:/Users/kronh/OneDrive/Coding_bootcamp/project_1_datasources/2021_census_data_cd_cds/98-401-X2021005_English_CSV_data.csv"

employment_nums = "C:/Users/kronh/OneDrive/Coding_bootcamp/project_1_datasources/employment_by_occupation/14100389.csv"

wages = "C:/Users/kronh/OneDrive/Coding_bootcamp/project_1_datasources/wage_data_2021_by_region_and_job.csv"

### Cleaning of census data

In [60]:
# Reads census data by division and subdivision into a DataFrame
census_cd_csd_df = pd.read_csv(census_file_cd_csd, encoding='ISO-8859-1')

# Grabs data from Ontario only - regions with Alt Geo Codes starting with 35
census_cd_csd_df = census_cd_csd_df[census_cd_csd_df["ALT_GEO_CODE"].astype(str).str[:2] == '35']

In [61]:
### Creates a list of dictionaries with division codes tied to division names

# Makes a list for dictionaries of division codes and names in Ontario
divisions = []
# Loops through all the rows in the dataframe at the census division level
for index, row in census_cd_csd_df[census_cd_csd_df['GEO_LEVEL'] == 'Census division'].iterrows():
    # Gets the code and name for the census division
    code = row['ALT_GEO_CODE']
    name = row['GEO_NAME']
    # Creates a dictionary to hold the code and name
    dic = {'code': code, 'name': name}
    # Adds the dictionary to the list divisions if it's not already there
    if dic not in divisions:
        divisions.append(dic)

In [62]:
# Grabs the subset of data at the census subdivision level 
census_csd_df = census_cd_csd_df.loc[census_cd_csd_df['GEO_LEVEL'] == 'Census subdivision'].copy().reset_index(drop=True)

# Keeps a subset of columns
census_csd_df = census_csd_df[['ALT_GEO_CODE', 'GEO_NAME', 'CHARACTERISTIC_ID', 'CHARACTERISTIC_NAME', 'C1_COUNT_TOTAL']]

In [65]:
# Adds a column for census division code to the subdivision data, populates it, and resets the index
census_csd_df['Census division code'] = census_csd_df["ALT_GEO_CODE"].astype(str).str[:4]

# Renames the Geo name column
census_csd_df = census_csd_df.rename(columns={'GEO_NAME': "Census subdivision name", "ALT_GEO_CODE":"Census subdivision code"})

In [None]:
### Filter out unnecessary characteristics and put desired ones into columns

# Makes an empty list to hold dictionaries of values
characteristics_by_csd = []





In [66]:
### Adds a column for census division name to the subdivision data and populates it

# Adds an empty column for census division name
census_csd_df['Census division name'] = ''

# Populates the census division name based on the census division code column 
for index, row in census_csd_df.iterrows():
    #(Todo: rewrite with find + lambda, time permitting)
    # Iterates through the dictionaries in the divisions list
    for item in divisions:
        # Checks to see which division code appears in the row 
        if int(row['Census division code']) == int(item['code']):
            # Sets the division name equal to the name in the dictionary with the proper division code
            census_csd_df.at[index,'Census division name'] = item['name']

Unnamed: 0,Census subdivision code,Census subdivision name,CHARACTERISTIC_ID,CHARACTERISTIC_NAME,C1_COUNT_TOTAL,SYMBOL,Census division code,Census division name
0,3501005,"South Glengarry, Township (TP)",1,"Population, 2021",13330.0,,3501,"Stormont, Dundas and Glengarry, United countie..."
1,3501005,"South Glengarry, Township (TP)",2,"Population, 2016",13150.0,,3501,"Stormont, Dundas and Glengarry, United countie..."
2,3501005,"South Glengarry, Township (TP)",3,"Population percentage change, 2016 to 2021",1.4,,3501,"Stormont, Dundas and Glengarry, United countie..."
3,3501005,"South Glengarry, Township (TP)",4,Total private dwellings,5848.0,,3501,"Stormont, Dundas and Glengarry, United countie..."
4,3501005,"South Glengarry, Township (TP)",5,Private dwellings occupied by usual residents,5431.0,,3501,"Stormont, Dundas and Glengarry, United countie..."


In [74]:

### Adds columns for economic region name and code to the subdivision data and populates them

# Points to a csv with the census divisions in each economic region
er_breakdown = 'Resources/economic_regions_breakdown.csv'

# Puts the economic region breakdown into a DataFrame
er_cd_df = pd.read_csv(er_breakdown)

# Makes empty columns for economic region data in the census dataframe
census_csd_df[['Economic region code', 'Economic region name']] = ["", ""]

## Populates economic region columns in the census dataframe
# Iterates over the rows in the census dataframe
for index, row in census_csd_df.iterrows():
    # Iterates over the rows in the economic region breakdown dataframe
    for index_2, row_2 in er_cd_df.iterrows():
        # Checks to see if the census division codes on the two rows are equal 
        if str(row['Census division code']).strip() == str(row_2['CD']).strip():
            # Sets economic region code equal to the code in the ER breakdown dataframe
            census_csd_df.at[index,'Economic region code'] = row_2['ERC']
            # Sets economic region name equal to the name in the ER breakdown dataframe
            census_csd_df.at[index,'Economic region code'] = row_2['ER']

census_csd_df.head()

KeyboardInterrupt: 

### Cleaning of wages by job and economic region

In [48]:
# Reads the wages csv into a DataFrame
wages_df = pd.read_csv(wages)

# Filters wages DataFrame for only Ontario data 
wages_df = wages_df.loc[wages_df['PROV'] == "ON"]
# Removes data for Ontaio as a whole 
wages_df = wages_df.loc[wages_df['ER_Code_Code_RE'] != 'ER35']

# Keeps a subset of the columns and resets the index
wages_df = wages_df[['NOC_Title', "ER_Code_Code_RE", "ER_Name_Nom_RE", "Low_Wage_Salaire_Minium", "Median_Wage_Salaire_Median", "High_Wage_Salaire_Maximal"]].reset_index(drop=True)

# Renames columns 
wages_df = wages_df.rename(columns={'NOC_Title':'National Occupational Classification (NOC)',"ER_Code_Code_RE":"Economic region code", "ER_Name_Nom_RE": "Economic region name", "Low_Wage_Salaire_Minium": "Minimum wage", "Median_Wage_Salaire_Median":"Median wage", "High_Wage_Salaire_Maximal": "Maximum wage"})

# Removes 'ER' from Economic Region code values
wages_df["Economic region code"] = wages_df["Economic region code"].str[2:]

wages_df.head()

Unnamed: 0,National Occupational Classification (NOC),Economic region code,Economic region name,Minimum wage,Median wage,Maximum wage
0,Legislators,3510,Ottawa,,,
1,Legislators,3515,Kingston--Pembroke,,,
2,Legislators,3520,Muskoka--Kawarthas,,,
3,Legislators,3530,Toronto,,,
4,Legislators,3540,Kitchener--Waterloo--Barrie,,,


### Cleaning of number employed by job and economic region

In [49]:
# Reads employment data into a dataframe
employment_df = pd.read_csv(employment_nums)

# Removes data for Canada as a whole
employment_df = employment_df.loc[employment_df['GEO'] != 'Canada']

# Splits the Geo line into Economic Region and Province
#   Province-level data doesn't have a split, so the povince is assigned to the economic region and province is empty. This would be a problem, except that I only WANT data at the economic region level
employment_df[['Economic Region', 'Province']] = employment_df["GEO"].str.split(", ", expand = True)

# Keeps only data for Ontario at an economic region level 
employment_df = employment_df.loc[employment_df['Province'] == 'Ontario']

# Adds an empty column for Economic Region Code
employment_df['Economic region code'] = ''

### Populates the Economic region code column

# Makes unique Economic region names and codes into a separate dataframe; removes duplicates and resets the index
er_df = er_cd_df.copy()[['ERC', 'ER']].drop_duplicates(keep='first').reset_index(drop=True)

# Iterates over rows in employment dataframe column
for index, row in employment_df.iterrows():
    # Iterates through the rows in the economic region codes dataframe
    for index_2, row_2 in er_df.iterrows():
        # Checks to see which division code appears in the row of the empployment dataframe
        if str(row['Economic Region']).strip() == str(row_2['ER']).strip():
            # Sets the division name equal to the name in the dictionary with the proper division code
            employment_df.at[index,'Economic region code'] = row_2['ERC']
    
# Splits the NOC column into name and number
employment_df[['National Occupational Classification (NOC)',"NOC code"]] = employment_df['National Occupational Classification (NOC)'].str.split("[", expand=True)

# Removes space at the end of NOC column
employment_df['National Occupational Classification (NOC)'] = employment_df['National Occupational Classification (NOC)'].str.strip()

# Removes end bracket from NOC code column
employment_df["NOC code"] = employment_df["NOC code"].str[:-1]

# Filters columns to keep
employment_df = employment_df[['National Occupational Classification (NOC)', "NOC code", 'Economic region code', 'Economic Region', 'VALUE']]

# Renames some columns
employment_df = employment_df.rename(columns={'Economic Region': "Economic region name", "VALUE":"Number Employed (thousands)"})

# Resets the index
employment_df = employment_df.reset_index(drop=True)

# Shows the head of the dataframe
employment_df.head()

Unnamed: 0,National Occupational Classification (NOC),NOC code,Economic region code,Economic region name,Number Employed (thousands)
0,"Total employed, all occupations",,3510,Ottawa,640.5
1,Management occupations,0,3510,Ottawa,74.0
2,Senior management occupations,00,3510,Ottawa,6.0
3,Specialized middle management occupations,01-05,3510,Ottawa,30.2
4,Middle management occupations in retail and wh...,06,3510,Ottawa,22.2


### Merge of employment and wage DataFrames