In [2]:
# Import dependencies
import pandas as pd
import os

In [3]:
# Locations of the source CSV files. They are too big for git without large file
# handling, so they live outside the repository and the folder differs per computer.
#
# Set the PROJECT_1_DATA_DIR environment variable to the folder that holds your copy
# of the data sources. When it is not set, the original author's folder is used, so
# existing setups keep working unchanged.
DATA_DIR = os.environ.get(
    "PROJECT_1_DATA_DIR",
    "C:/Users/kronh/OneDrive/Coding_bootcamp/project_1_datasources",
).rstrip("/\\")  # tolerate a trailing slash/backslash in the configured folder

# 2021 census profile by economic region
census_file_er = f"{DATA_DIR}/2021_census_data_economic_regions/98-401-X2021008_English_CSV_data.csv"

# 2021 census profile by census division / census subdivision
census_file_cd_csd = f"{DATA_DIR}/2021_census_data_cd_cds/98-401-X2021005_English_CSV_data.csv"

# Employment by occupation
employment_nums = f"{DATA_DIR}/employment_by_occupation/14100389.csv"

# 2021 wage data by region and job
wages = f"{DATA_DIR}/wage_data_2021_by_region_and_job.csv"

In [64]:
# Builds the Ontario economic-region census table in a single pipeline
census_er_df = (
    pd.read_csv(census_file_er, encoding='ISO-8859-1')
    # Rows reported at the economic region level only
    .loc[lambda df: df['GEO_LEVEL'].eq('Economic region')]
    # Ontario only: its regions have Alt Geo Codes starting with 35
    .loc[lambda df: df['ALT_GEO_CODE'].astype(str).str[:2].eq('35')]
    # Geographic information, characteristic information, and total people
    .loc[:, ['ALT_GEO_CODE', 'GEO_NAME', 'CHARACTERISTIC_ID', 'CHARACTERISTIC_NAME', 'C1_COUNT_TOTAL']]
)

# TODO: figure out which characteristics we want and keep only those.
# The index is intentionally left as-is for now (no reset_index), so row labels
# still refer to positions in the original CSV.

In [65]:
# Previews the first five rows of the economic-region census table
census_er_df.head(n=5)

Unnamed: 0,ALT_GEO_CODE,GEO_NAME,CHARACTERISTIC_ID,CHARACTERISTIC_NAME,C1_COUNT_TOTAL
102609,3510,Ottawa,1,"Population, 2021",1407555.0
102610,3510,Ottawa,2,"Population, 2016",1306249.0
102611,3510,Ottawa,3,"Population percentage change, 2016 to 2021",7.8
102612,3510,Ottawa,4,Total private dwellings,602529.0
102613,3510,Ottawa,5,Private dwellings occupied by usual residents,570809.0


In [66]:
# Builds the Ontario census division / census subdivision table in a single pipeline
census_cd_csd_df = (
    pd.read_csv(census_file_cd_csd, encoding='ISO-8859-1')
    # Rows reported at the census division or census subdivision level only
    .loc[lambda df: df['GEO_LEVEL'].isin(['Census division', 'Census subdivision'])]
    # NOTE(review): the index is reset here, BEFORE the Ontario filter below, so the
    # final frame still carries a non-zero-based index. Confirm whether the reset
    # was meant to come last.
    .reset_index(drop=True)
    # Ontario only: its divisions and subdivisions have Alt Geo Codes starting with 35
    .loc[lambda df: df['ALT_GEO_CODE'].astype(str).str[:2].eq('35')]
)



In [67]:
# Previews the first five rows of the census division / subdivision table
census_cd_csd_df.head(n=5)

Unnamed: 0,CENSUS_YEAR,DGUID,ALT_GEO_CODE,GEO_LEVEL,GEO_NAME,TNR_SF,TNR_LF,DATA_QUALITY_FLAG,CHARACTERISTIC_ID,CHARACTERISTIC_NAME,...,C2_COUNT_MEN+,SYMBOL.1,C3_COUNT_WOMEN+,SYMBOL.2,C10_RATE_TOTAL,SYMBOL.3,C11_RATE_MEN+,SYMBOL.4,C12_RATE_WOMEN+,SYMBOL.5
5940798,2021,2021A00033501,3501,Census division,"Stormont, Dundas and Glengarry, United countie...",1.5,2.1,20000,1,"Population, 2021",...,,...,,...,,...,,...,,...
5940799,2021,2021A00033501,3501,Census division,"Stormont, Dundas and Glengarry, United countie...",1.5,2.1,20000,2,"Population, 2016",...,,...,,...,,...,,...,,...
5940800,2021,2021A00033501,3501,Census division,"Stormont, Dundas and Glengarry, United countie...",1.5,2.1,20000,3,"Population percentage change, 2016 to 2021",...,,...,,...,1.0,,,...,,...
5940801,2021,2021A00033501,3501,Census division,"Stormont, Dundas and Glengarry, United countie...",1.5,2.1,20000,4,Total private dwellings,...,,...,,...,,...,,...,,...
5940802,2021,2021A00033501,3501,Census division,"Stormont, Dundas and Glengarry, United countie...",1.5,2.1,20000,5,Private dwellings occupied by usual residents,...,,...,,...,,...,,...,,...


In [31]:
# Builds the Ontario wage table (one row per occupation and economic region)
wages_df = (
    pd.read_csv(wages)
    # Ontario rows only
    .loc[lambda df: df['PROV'].eq("ON")]
    # Drops the rows for Ontario as a whole (region code 'ER35')
    .loc[lambda df: df['ER_Code_Code_RE'].ne('ER35')]
    # Occupation, region identifiers, and the low / median / high wage columns
    .loc[:, [
        'NOC_Title',
        "PROV",
        "ER_Code_Code_RE",
        "ER_Name_Nom_RE",
        "Low_Wage_Salaire_Minium",
        "Median_Wage_Salaire_Median",
        "High_Wage_Salaire_Maximal",
    ]]
    # Fresh 0-based index; if the saved preview below shows other row labels,
    # it predates this step and the cell needs to be re-run.
    .reset_index(drop=True)
)

wages_df.head()

Unnamed: 0,NOC_Title,PROV,ER_Code_Code_RE,ER_Name_Nom_RE,Low_Wage_Salaire_Minium,Median_Wage_Salaire_Median,High_Wage_Salaire_Maximal
38,Legislators,ON,ER3510,Ottawa,,,
39,Legislators,ON,ER3515,Kingston--Pembroke,,,
40,Legislators,ON,ER3520,Muskoka--Kawarthas,,,
41,Legislators,ON,ER3530,Toronto,,,
42,Legislators,ON,ER3540,Kitchener--Waterloo--Barrie,,,


In [28]:
# Reads employment-by-occupation data into a DataFrame
employment_df = pd.read_csv(employment_nums)

# Removes data for Canada as a whole.
# .copy() gives an independent frame, so adding columns below does not write to a
# filtered slice of the raw data (which triggers pandas' SettingWithCopyWarning).
employment_df = employment_df.loc[employment_df['GEO'] != 'Canada'].copy()

# Splits GEO ("<economic region>, <province>") into Economic Region and Province.
#   The split is made on the LAST ", " only (rsplit with n=1), so it can never yield
#   more than two columns, even if a region name itself contains a comma.
#   Province-level rows have no ", ", so the province name lands in 'Economic Region'
#   and 'Province' is empty. That is fine here because only economic-region-level
#   rows are kept by the next filter.
employment_df[['Economic Region', 'Province']] = employment_df['GEO'].str.rsplit(", ", n=1, expand=True)

# Keeps only data for Ontario at an economic region level
employment_df = employment_df.loc[employment_df['Province'] == 'Ontario']

# Filters columns to keep
employment_df = employment_df[['National Occupational Classification (NOC)', 'Economic Region', 'VALUE', 'SCALAR_FACTOR']]

# Resets the index
employment_df = employment_df.reset_index(drop=True)

# Shows the head of the DataFrame (bare expression for rich display, like the other cells)
employment_df.head(5)

          National Occupational Classification (NOC) Economic Region  VALUE  \
0                    Total employed, all occupations          Ottawa  640.5   
1                         Management occupations [0]          Ottawa   74.0   
2                 Senior management occupations [00]          Ottawa    6.0   
3  Specialized middle management occupations [01-05]          Ottawa   30.2   
4  Middle management occupations in retail and wh...          Ottawa   22.2   

  SCALAR_FACTOR  
0     thousands  
1     thousands  
2     thousands  
3     thousands  
4     thousands  
