In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import statsmodels.api as sm
from prophet import Prophet
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [2]:
# Display option: never truncate the column list when a frame is rendered.
pd.set_option('display.max_columns', None)

# Read the NPRI export, decoding the file as 'latin-1'.
df = pd.read_csv(filepath_or_buffer='NPRI_2000-2022.csv', encoding='latin-1')

# Show the first rows as a quick sanity check of the load.
df.head()

  df = pd.read_csv('NPRI_2000-2022.csv', encoding='latin-1')


Unnamed: 0,Reporting_Year / Année,NPRI_ID / No_INRP,Number of employees,Company_Name / Dénomination_sociale_de_l'entreprise,Facility_Name / Installation,NAICS / Code_SCIAN,NAICS Title / Titre Code_SCIAN,PROVINCE,City,Latitude,Longitude,CAS_Number / No_CAS,Substance Name (English) / Nom de substance (Anglais),Units / Unités,Estimation_Method / Méthode_destimation,Release to Air - Fugitive,Release to Air - Other Non-Point,Release to Air - Road dust,Release to Air - Spills,Release to Air - Stack / Point,Release to Air - Storage / Handling,Releases to Land - Leaks,Releases to Land - Other,Releases to Land - Spills,Releases to Water Bodies - Direct Discharges,Releases to Water Bodies - Leaks,Releases to Water Bodies - Spills,Sum of release to all media (<1tonne)
0,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,10049-04-4,Chlorine dioxide,tonnes,M - Monitoring or Direct Measurement,,,,,5.2,,,,,,,,
1,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,67-56-1,Methanol,tonnes,E - Emission Factor,,,,,113.0,,,,,,,,
2,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,67-66-3,Chloroform,tonnes,M - Monitoring or Direct Measurement,,,,,,,,,,0.0,,,
3,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,75-07-0,Acetaldehyde,tonnes,E - Emission Factor,,,,,7.67,,,,,,,,
4,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,7647-01-0,Hydrochloric acid,tonnes,M - Monitoring or Direct Measurement,,,,,0.665,,,,,,,,


In [3]:
# Dimensions of the raw frame as a (rows, columns) tuple
df.shape

(737516, 28)

In [4]:
# List every column label in the raw data, exactly as read from the file
df.columns

Index(['Reporting_Year / Année', 'NPRI_ID / No_INRP', 'Number of employees',
       'Company_Name / Dénomination_sociale_de_l'entreprise',
       'Facility_Name / Installation', 'NAICS / Code_SCIAN',
       'NAICS Title / Titre Code_SCIAN', 'PROVINCE', 'City', 'Latitude',
       'Longitude', 'CAS_Number / No_CAS',
       'Substance Name (English) / Nom de substance (Anglais)',
       'Units / Unités', 'Estimation_Method / Méthode_destimation',
       'Release to Air - Fugitive', 'Release to Air - Other Non-Point ',
       'Release to Air - Road dust  ', 'Release to Air - Spills ',
       'Release to Air - Stack / Point  ',
       'Release to Air - Storage / Handling ', 'Releases to Land - Leaks',
       'Releases to Land - Other ', 'Releases to Land - Spills ',
       'Releases to Water Bodies - Direct Discharges ',
       'Releases to Water Bodies - Leaks',
       'Releases to Water Bodies - Spills ',
       'Sum of release to all media (<1tonne)'],
      dtype='object')

In [5]:
# Count rows that are exact copies of an earlier row, then report the total.
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 0


In [9]:
# Summary statistics for every column (numeric and non-numeric alike).
# Left as the cell's last expression so Jupyter renders it as a table;
# print() falls back to the plain-text repr, which wraps a wide frame into
# several backslash-continued chunks that are hard to read.
df.describe(include='all')

        Reporting_Year / Année  NPRI_ID / No_INRP  Number of employees  \
count            737516.000000      737516.000000             736900.0   
unique                     NaN                NaN               2610.0   
top                        NaN                NaN                  1.0   
freq                       NaN                NaN             136154.0   
mean               2012.267601       11248.268386                  NaN   
std                   6.400156        9352.868266                  NaN   
min                2000.000000           1.000000                  NaN   
25%                2007.000000        3775.000000                  NaN   
50%                2012.000000        7114.000000                  NaN   
75%                2018.000000       18093.000000                  NaN   
max                2022.000000      306556.000000                  NaN   

       Company_Name / Dénomination_sociale_de_l'entreprise  \
count                                            

In [10]:
# Show the dtype pandas assigned to each column.
column_dtypes = df.dtypes
print(column_dtypes)

Reporting_Year / Année                                     int64
NPRI_ID / No_INRP                                          int64
Number of employees                                       object
Company_Name / Dénomination_sociale_de_l'entreprise       object
Facility_Name / Installation                              object
NAICS / Code_SCIAN                                         int64
NAICS Title / Titre Code_SCIAN                            object
PROVINCE                                                  object
City                                                      object
Latitude                                                 float64
Longitude                                                float64
CAS_Number / No_CAS                                       object
Substance Name (English) / Nom de substance (Anglais)     object
Units / Unités                                            object
Estimation_Method / Méthode_destimation                  object
Release to Air - Fugitive

In [6]:
# Count NaNs per column, then display only the columns that actually have gaps.
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]

Unnamed: 0,0
Number of employees,616
Facility_Name / Installation,1176
City,90764
Latitude,4082
Longitude,4082
Release to Air - Fugitive,612484
Release to Air - Other Non-Point,706636
Release to Air - Road dust,707655
Release to Air - Spills,728335
Release to Air - Stack / Point,224234


**Making a copy and splitting data before analysis**

In [12]:
# Work on a separate copy so the raw `df` is not modified by the cleaning steps below
df_analysis = df.copy()

In [13]:
# Normalise column labels: lowercase them and turn every space into an underscore.
# Labels that end in spaces in the raw file therefore end in underscores here.
df_analysis.columns = [label.lower().replace(' ', '_') for label in df_analysis.columns]

In [14]:
# Preview the first rows to confirm the renamed column labels
df_analysis.head()

Unnamed: 0,reporting_year_/_année,npri_id_/_no_inrp,number_of_employees,company_name_/_dénomination_sociale_de_l'entreprise,facility_name_/_installation,naics_/_code_scian,naics_title_/_titre_code_scian,province,city,latitude,longitude,cas_number_/_no_cas,substance_name_(english)_/_nom_de_substance_(anglais),units_/_unités,estimation_method_/_méthode_destimation,release_to_air_-_fugitive,release_to_air_-_other_non-point_,release_to_air_-_road_dust__,release_to_air_-_spills_,release_to_air_-_stack_/_point__,release_to_air_-_storage_/_handling_,releases_to_land_-_leaks,releases_to_land_-_other_,releases_to_land_-_spills_,releases_to_water_bodies_-_direct_discharges_,releases_to_water_bodies_-_leaks,releases_to_water_bodies_-_spills_,sum_of_release_to_all_media_(<1tonne)
0,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,10049-04-4,Chlorine dioxide,tonnes,M - Monitoring or Direct Measurement,,,,,5.2,,,,,,,,
1,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,67-56-1,Methanol,tonnes,E - Emission Factor,,,,,113.0,,,,,,,,
2,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,67-66-3,Chloroform,tonnes,M - Monitoring or Direct Measurement,,,,,,,,,,0.0,,,
3,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,75-07-0,Acetaldehyde,tonnes,E - Emission Factor,,,,,7.67,,,,,,,,
4,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,7647-01-0,Hydrochloric acid,tonnes,M - Monitoring or Direct Measurement,,,,,0.665,,,,,,,,


In [20]:
# Converting data types and handling missing values.
#
# A blanket fillna(0) would write the number 0 into text columns (city,
# facility name) and into latitude/longitude, where 0 is a valid coordinate
# rather than a marker for "missing". Each group of columns is instead filled
# with a value that suits it.
release_cols = [
    col for col in df_analysis.columns
    if col.startswith(('release', 'sum_of_release'))
]
# number_of_employees is read as object but is converted to int below, so it
# is excluded from the text columns.
text_cols = (
    df_analysis.select_dtypes(include=['object'])
    .columns.drop('number_of_employees', errors='ignore')
)

fill_values = {
    **dict.fromkeys(release_cols, 0),       # no quantity reported -> 0
    **dict.fromkeys(text_cols, 'Unknown'),  # keep text columns purely textual
    'number_of_employees': 0,
}
# Columns absent from the mapping (latitude, longitude) keep their NaNs.
df_analysis = df_analysis.fillna(fill_values)

# Convert specific columns to int now that their NaNs are filled
df_analysis['reporting_year_/_année'] = df_analysis['reporting_year_/_année'].astype(int)
df_analysis['number_of_employees'] = df_analysis['number_of_employees'].astype(int)


df_analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 737516 entries, 0 to 737515
Data columns (total 28 columns):
 #   Column                                                 Non-Null Count   Dtype  
---  ------                                                 --------------   -----  
 0   reporting_year_/_année                                 737516 non-null  int64  
 1   npri_id_/_no_inrp                                      737516 non-null  int64  
 2   number_of_employees                                    737516 non-null  int64  
 3   company_name_/_dénomination_sociale_de_l'entreprise    737516 non-null  object 
 4   facility_name_/_installation                           737516 non-null  object 
 5   naics_/_code_scian                                     737516 non-null  int64  
 6   naics_title_/_titre_code_scian                         737516 non-null  object 
 7   province                                               737516 non-null  object 
 8   city                              

In [21]:
# Re-type every object-dtype column as 'category' in a single astype call.
object_columns = df_analysis.select_dtypes(include=['object']).columns
df_analysis = df_analysis.astype({name: 'category' for name in object_columns})

df_analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 737516 entries, 0 to 737515
Data columns (total 28 columns):
 #   Column                                                 Non-Null Count   Dtype   
---  ------                                                 --------------   -----   
 0   reporting_year_/_année                                 737516 non-null  int64   
 1   npri_id_/_no_inrp                                      737516 non-null  int64   
 2   number_of_employees                                    737516 non-null  int64   
 3   company_name_/_dénomination_sociale_de_l'entreprise    737516 non-null  category
 4   facility_name_/_installation                           737516 non-null  category
 5   naics_/_code_scian                                     737516 non-null  int64   
 6   naics_title_/_titre_code_scian                         737516 non-null  category
 7   province                                               737516 non-null  category
 8   city                    

In [22]:
# Column labels after the lowercase/underscore renaming (used to build the target list)
df_analysis.columns

Index(['reporting_year_/_année', 'npri_id_/_no_inrp', 'number_of_employees',
       'company_name_/_dénomination_sociale_de_l'entreprise',
       'facility_name_/_installation', 'naics_/_code_scian',
       'naics_title_/_titre_code_scian', 'province', 'city', 'latitude',
       'longitude', 'cas_number_/_no_cas',
       'substance_name_(english)_/_nom_de_substance_(anglais)',
       'units_/_unités', 'estimation_method_/_méthode_destimation',
       'release_to_air_-_fugitive', 'release_to_air_-_other_non-point_',
       'release_to_air_-_road_dust__', 'release_to_air_-_spills_',
       'release_to_air_-_stack_/_point__',
       'release_to_air_-_storage_/_handling_', 'releases_to_land_-_leaks',
       'releases_to_land_-_other_', 'releases_to_land_-_spills_',
       'releases_to_water_bodies_-_direct_discharges_',
       'releases_to_water_bodies_-_leaks',
       'releases_to_water_bodies_-_spills_',
       'sum_of_release_to_all_media_(<1tonne)'],
      dtype='object')

In [23]:
# Target columns: reported release quantities, grouped by receiving medium.
# The concatenation order matches the column order of the frame.
_air_releases = [
    'release_to_air_-_fugitive',
    'release_to_air_-_other_non-point_',
    'release_to_air_-_road_dust__',
    'release_to_air_-_spills_',
    'release_to_air_-_stack_/_point__',
    'release_to_air_-_storage_/_handling_',
]
_land_releases = [
    'releases_to_land_-_leaks',
    'releases_to_land_-_other_',
    'releases_to_land_-_spills_',
]
_water_releases = [
    'releases_to_water_bodies_-_direct_discharges_',
    'releases_to_water_bodies_-_leaks',
    'releases_to_water_bodies_-_spills_',
]
releases = (
    _air_releases
    + _land_releases
    + _water_releases
    + ['sum_of_release_to_all_media_(<1tonne)']
)


In [None]:
# Split the cleaned frame into features (X) and the release-quantity targets (y).
# No in-notebook `pip install` here: scikit-learn is already imported in the
# first cell, so it has to be installed before the notebook starts, and
# train_test_split is imported there with the other dependencies.
X = df_analysis.drop(columns=releases)
y = df_analysis[releases]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs. Adjust both as needed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [30]:
# Preview the first 10 feature rows (release columns removed)
X.head(10)

Unnamed: 0,reporting_year_/_année,npri_id_/_no_inrp,number_of_employees,company_name_/_dénomination_sociale_de_l'entreprise,facility_name_/_installation,naics_/_code_scian,naics_title_/_titre_code_scian,province,city,latitude,longitude,cas_number_/_no_cas,substance_name_(english)_/_nom_de_substance_(anglais),units_/_unités,estimation_method_/_méthode_destimation
0,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,10049-04-4,Chlorine dioxide,tonnes,M - Monitoring or Direct Measurement
1,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,67-56-1,Methanol,tonnes,E - Emission Factor
2,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,67-66-3,Chloroform,tonnes,M - Monitoring or Direct Measurement
3,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,75-07-0,Acetaldehyde,tonnes,E - Emission Factor
4,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,7647-01-0,Hydrochloric acid,tonnes,M - Monitoring or Direct Measurement
5,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,7664-38-2,Phosphoric acid,tonnes,M - Monitoring or Direct Measurement
6,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,7664-93-9,Sulphuric acid,tonnes,M - Monitoring or Direct Measurement
7,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,7782-50-5,Chlorine,tonnes,M - Monitoring or Direct Measurement
8,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,NA - 09,Manganese (and its compounds),tonnes,M - Monitoring or Direct Measurement
9,2000,1,440,Alberta-Pacific Forest Industries Inc.,(blank),322112,Chemical pulp mills,AB,County of Athabasca,54.923116,-112.861867,NA - 14,Zinc (and its compounds),tonnes,M - Monitoring or Direct Measurement


In [33]:
# Summary statistics for each release (target) column
y.describe()

Unnamed: 0,release_to_air_-_fugitive,release_to_air_-_other_non-point_,release_to_air_-_road_dust__,release_to_air_-_spills_,release_to_air_-_stack_/_point__,release_to_air_-_storage_/_handling_,releases_to_land_-_leaks,releases_to_land_-_other_,releases_to_land_-_spills_,releases_to_water_bodies_-_direct_discharges_,releases_to_water_bodies_-_leaks,releases_to_water_bodies_-_spills_,sum_of_release_to_all_media_(<1tonne)
count,737516.0,737516.0,737516.0,737516.0,737516.0,737516.0,737516.0,737516.0,737516.0,737516.0,737516.0,737516.0,737516.0
mean,10.367992,3.829248,4.746724,0.021693,100.359815,2.54515,0.00044,3.962627,0.030854,5.701059,0.000714,0.730422,0.0096
std,329.193552,333.808062,173.785236,2.48689,1682.190688,72.269703,0.091653,282.47634,7.279564,156.478695,0.170531,345.372258,0.680793
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.55805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,19.584025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,58433.9,91138.2,39992.96105,1554.4,235907.0,17883.11,39.058,152270.0,4500.1,39991.0,82.4,259050.0,582.0


In [32]:
# Preview the first 100 target rows; the rendered table elides the middle rows
y.head(100)

Unnamed: 0,release_to_air_-_fugitive,release_to_air_-_other_non-point_,release_to_air_-_road_dust__,release_to_air_-_spills_,release_to_air_-_stack_/_point__,release_to_air_-_storage_/_handling_,releases_to_land_-_leaks,releases_to_land_-_other_,releases_to_land_-_spills_,releases_to_water_bodies_-_direct_discharges_,releases_to_water_bodies_-_leaks,releases_to_water_bodies_-_spills_,sum_of_release_to_all_media_(<1tonne)
0,0.000,0.0,0.0,0.0,5.200,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000,0.0,0.0,0.0,113.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000,0.0,0.0,0.0,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000,0.0,0.0,0.0,7.670,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000,0.0,0.0,0.0,0.665,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000,0.0,0.0,0.0,0.001,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.000,0.0,0.0,0.0,0.097,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.000,0.0,0.0,0.0,0.018,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.000,0.0,0.0,0.0,0.000,0.102,0.0,0.0,0.0,0.0,0.0,0.0,0.0
