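# California Air Resources Board (CARB)

This notebook scrapes the CARB MRR data page for links to the yearly GHG emissions Excel workbooks, downloads them, and extracts the rows whose city is Long Beach.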

In [1]:
from io import BytesIO
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Parameters

In [2]:
# Page that lists the yearly emissions workbooks.
URL = "https://ww2.arb.ca.gov/mrr-data"

# Browser-like request headers (values mirror a desktop Chrome 114 session).
# Left out on purpose:
#   * "Authority", "Method", "Path", "Scheme" - these are HTTP/2 pseudo-headers as
#     shown in a browser's network panel; sent as regular header fields they carry
#     no meaning.
#   * "If-None-Match" - a pinned ETag turns the request into a conditional one, so
#     the server may answer 304 Not Modified with an empty body and nothing to scrape.
#   * "Accept-Encoding" - advertising "br" obliges the client to decode Brotli, which
#     requests only does when a Brotli package is installed; without the key,
#     requests sends a default that matches what it can actually decode.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "max-age=0",
    "Sec-Ch-Ua": '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": '"Windows"',
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "cross-site",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
}


### Tests

In [3]:
# Download the page that lists the workbooks. The timeout keeps a stalled
# connection from hanging the kernel indefinitely.
r = requests.get(URL, headers=headers, timeout=30)
# Stop on an HTTP error status instead of scraping an error page for links.
r.raise_for_status()

In [4]:
soup = BeautifulSoup(r.content, "html.parser")

# Collect the link text and absolute URL of every anchor that points at an Excel file.
href_dicts = []
# href=True skips anchors that have no href attribute; for those, .get("href")
# returns None and the substring test below would raise a TypeError.
hrefs = soup.find_all("a", href=True)
for href in hrefs:
    url = href["href"]

    if "xls" in url:
        href_dicts.append(
            {
                "name": href.text,
                # urljoin resolves protocol-relative ("//host/..."), site-relative and
                # already-absolute hrefs alike; a bare "https:" prefix is only
                # correct for the protocol-relative form.
                "url": urljoin(URL, url),
            }
        )
    

In [5]:
# One row per scraped link. Naming the columns explicitly keeps "name" and "url"
# present even when no link was found, so later lookups of urls_df["url"] do not
# fail with a KeyError on an empty scrape.
urls_df = pd.DataFrame(href_dicts, columns=["name", "url"])

### Access one of the files

In [6]:
# Show the scraped links (one row per Excel link, with its "name" and "url").
urls_df

Unnamed: 0,name,url
0,2021 GHG Facility and Entity Emissions,https://ww2.arb.ca.gov/sites/default/files/cla...
1,2020 GHG Facility and Entity Emissions,https://ww2.arb.ca.gov/sites/default/files/cla...
2,2019 GHG Facility and Entity Emissions,https://ww2.arb.ca.gov/sites/default/files/cla...
3,2018 GHG Facility and Entity Emissions,https://ww2.arb.ca.gov/sites/default/files/cla...
4,2017 GHG Facility and Entity Emissions,https://ww2.arb.ca.gov/sites/default/files/cla...
5,2016 GHG Facility and Entity Emissions,https://ww2.arb.ca.gov/sites/default/files/cla...
6,2015 GHG Facility and Entity Emissions,https://ww2.arb.ca.gov/sites/default/files/cla...
7,2014 GHG Facility and Entity Emissions,https://ww2.arb.ca.gov/sites/default/files/cla...
8,2013 GHG Facility and Entity Emissions,https://ww2.arb.ca.gov/sites/default/files/cla...
9,2012 GHG Facility and Entity Emissions,https://ww2.arb.ca.gov/sites/default/files/cla...


In [7]:
# Substrings that disqualify a URL from being downloaded.
undesired_words = ["facility", "entity", "archive"]

# Keep only the scraped URLs that contain none of the undesired substrings.
xlsx_urls = [
    candidate
    for candidate in urls_df["url"].tolist()
    if all(marker not in candidate for marker in undesired_words)
]
xlsx_urls

['https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2021-ghg-emissions-2022-11-04.xlsx',
 'https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2020-ghg-emissions-2022-11-04.xlsx',
 'https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2019-ghg-emissions-2022-11-04.xlsx',
 'https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2018-ghg-emissions-2022-11-04.xlsx',
 'https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2017-ghg-emissions-2022-11-04.xlsx',
 'https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2016-ghg-emissions-2022-11-04.xlsx',
 'https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2015-ghg-emissions-2019-11-04.xlsx',
 'https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2014-ghg-emissions-2019-11-04.xlsx',


In [8]:
# Download every workbook, pull the Long Beach rows out of its "GHG Data" sheet and
# stack the per-workbook results into a single frame.
longbeach_frames = []
for xlsx_url in xlsx_urls:
    response = requests.get(xlsx_url, timeout=60)
    # Fail on an HTTP error status instead of handing an error page to the Excel parser.
    response.raise_for_status()
    xls_file = pd.ExcelFile(BytesIO(response.content))

    # Choose the sheet by name before parsing so that only this one sheet is read,
    # not every sheet of the workbook. If several names match, the last one wins.
    key_of_interest = None
    for key in xls_file.sheet_names:
        if 'GHG Data' in key:
            key_of_interest = key

    if key_of_interest is None:
        print(f"Skipping {xlsx_url}")
        continue

    ghg_data = xls_file.parse(key_of_interest)
    # Remove rows with several NaNs (rows need at least 10 non-NaN cells to stay)
    ghg_data = ghg_data.dropna(thresh=10)
    # First remaining row is the header; tolist() keeps that row's index label from
    # becoming the name of the column index.
    ghg_data.columns = ghg_data.iloc[0].tolist()
    # Keep only the Long Beach rows (the header row itself is filtered out here too)
    longbeach_df = ghg_data.loc[ghg_data.loc[:, 'City'] == 'Long Beach'].reset_index(drop=True)
    # Remove columns with NaNs only
    longbeach_df = longbeach_df.dropna(axis=1, how='all')
    # Remove \n in column names
    longbeach_df.columns = longbeach_df.columns.str.replace('\n', ' ')

    longbeach_frames.append(longbeach_df)

# Concatenate once at the end: growing a frame inside the loop re-copies all earlier
# rows on every iteration. pd.concat rejects an empty list, hence the fallback.
mega_df = pd.concat(longbeach_frames, ignore_index=True) if longbeach_frames else pd.DataFrame([])

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Skipping https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2010-ghg-emissions-2015-06-15.xlsx


In [9]:
# Show the Long Beach rows gathered from all downloaded workbooks.
mega_df

Unnamed: 0,ARB ID,Facility Name,Report Year,"Total CO2e (combustion, process, vented, and supplier)",AEL,Emitter CO2e from Non-Biogenic Sources and CH4 and N2O from Biogenic Fuels,Emitter CO2 from Biogenic Fuels,Fuel Supplier CO2e from Non-Biogenic Fuels and CH4 and N2O from Biogenic Fuels,Fuel Supplier CO2 from Biogenic Fuels,Electricity Importer CO2e,...,Total Non-Covered Emissions,Emissions Data,Product Data,Verification Body,City,State,Zip Code,North American Industry Classification System (NAICS) Code and Description,U.S.EPA/ARB Subparts,Industry Sector
0,101321,"AES Alamitos, LLC",2021,1477435.567312,No,1477435.567312,0,0,0,0,...,0,Positive,,Ashworth Leininger Group,Long Beach,CA,90803,221112 - Fossil Fuel Electric Power Generation,"C,D",In-State Electricity Generation
1,5003,Long Beach Gas & Oil Dept (LBGO),2021,511924.130656,No,0,0,511924.130656,0,0,...,91550.793474,Positive,,Locus Technologies,Long Beach,CA,90806,221210 - Natural Gas Distribution,NN,"Supplier of Natural Gas, NGL, or LPG"
2,100002,Calciner,2021,285809.907123,No,285809.907123,0,0,0,0,...,0.407612,Positive,Positive,"Tetra Tech, Inc. and Subsidiaries",Long Beach,CA,90813,324199 - All Other Petroleum and Coal Products...,"C,Y",Other Combustion Source
3,104072,California Resources Production Corporation - ...,2021,220714.719382,No,220714.719382,0,0,0,0,...,3123.234311,Positive,Positive,"Tetra Tech, Inc. and Subsidiaries",Long Beach,CA,90802,211111 - Crude Petroleum and Natural Gas Extra...,"C,W",Oil and Gas Production
4,101697,"Gold Bond Building Products, LLC - Long Beach",2021,36091.208986,No,36091.208986,0,0,0,0,...,0,Positive,Positive,SCS Engineers,Long Beach,CA,90813,327420 - Gypsum Product Manufacturing,C,Other Combustion Source
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,101697,"New NGC, Inc - Long Beach",2011,21179.8,No,21179.8,0,0,0,0,...,0,Positive,Positive,"GHG Climate Team, LLC",Long Beach,CA,90813,327420 - Gypsum Product Manufacturing,C,Other Combustion Source
88,100208,NRG Energy - Long Beach,2011,18800.2,No,18800.2,0,0,0,0,...,0,Positive,,"Trinity Consultants, Inc.",Long Beach,CA,90813,221112 - Fossil Fuel Electric Power Generation,D,In-State Electricity Generation
89,100994,Oxy - Thums Long Beach Company,2011,261596.75,No,261596.75,0,0,0,0,...,53559.682138,Positive,Positive,"Tetra Tech, Inc. and Subsidiaries",Long Beach,CA,90802,211111 - Crude Petroleum and Natural Gas Extra...,"C,W",Oil and Gas Production
90,104072,Oxy - Tidelands Oil Production Company - 760 L...,2011,41469,Yes,41639.58,0,0,0,0,...,784,Adverse,Positive,"Tetra Tech, Inc. and Subsidiaries",Long Beach,CA,90802,211111 - Crude Petroleum and Natural Gas Extra...,"C,W",Oil and Gas Production


### Testing the `CARBExtractor` class

In [10]:
from CARBExtractor import CARBExtractor

In [11]:
# Create an extractor, passing no constructor arguments.
carb_extractor = CARBExtractor()

In [12]:
# Request the data for 2019, then display the extractor's longbeach_df attribute.
# NOTE(review): presumably get_longbeach_data() is what fills longbeach_df - confirm in CARBExtractor.
carb_extractor.get_longbeach_data(year=2019)
carb_extractor.longbeach_df

  warn(msg)
  warn(msg)


Skipping https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2021-ghg-emissions-2022-11-04.xlsx


  warn(msg)
  warn(msg)


Skipping https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2020-ghg-emissions-2022-11-04.xlsx


  warn(msg)
  warn(msg)
  warn(msg)


Skipping https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2018-ghg-emissions-2022-11-04.xlsx


  warn(msg)
  warn(msg)


Skipping https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2017-ghg-emissions-2022-11-04.xlsx


  warn(msg)
  warn(msg)


Skipping https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2016-ghg-emissions-2022-11-04.xlsx


  warn(msg)


Skipping https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2015-ghg-emissions-2019-11-04.xlsx


  warn(msg)


Skipping https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2014-ghg-emissions-2019-11-04.xlsx


  warn(msg)
  warn(msg)


Skipping https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2013-ghg-emissions-2019-11-04.xlsx


  warn(msg)


Skipping https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2012-ghg-emissions-2019-11-04.xlsx


  warn(msg)
  warn(msg)


Skipping https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2011-ghg-emissions-2018-11-05.xlsx
Skipping https://ww2.arb.ca.gov/sites/default/files/classic/cc/reporting/ghg-rep/reported-data/2010-ghg-emissions-2015-06-15.xlsx


7,ARB ID,Facility Name,Report Year,"Total CO2e (combustion, process, vented, and supplier)",AEL,Emitter CO2e from Non-Biogenic Sources and CH4 and N2O from Biogenic Fuels,Emitter CO2 from Biogenic Fuels,Fuel Supplier CO2e from Non-Biogenic Fuels and CH4 and N2O from Biogenic Fuels,Fuel Supplier CO2 from Biogenic Fuels,Electricity Importer CO2e,...,Total Non-Covered Emissions,Emissions Data,Product Data,Verification Body,City,State,Zip Code,North American Industry Classification System (NAICS) Code and Description,U.S.EPA/ARB Subparts,Industry Sector
0,101321,"AES Alamitos, LLC",2019,531188.204322,No,531188.204322,0,0.0,0,0,...,0.0,Positive,,Ashworth Leininger Group,Long Beach,CA,90803,221112 - Fossil Fuel Electric Power Generation,"C,D",In-State Electricity Generation
1,100002,Calciner,2019,262853.498266,No,262853.498266,0,0.0,0,0,...,0.474151,Positive,Positive,SCS Engineers,Long Beach,CA,90813,324199 - All Other Petroleum and Coal Products...,"C,Y",Other Combustion Source
2,104072,California Resources Production Corporation - ...,2019,230275.751398,No,230275.751398,0,0.0,0,0,...,7509.47377,Positive,Positive,"NSF Certification, LLC",Long Beach,CA,90802,211111 - Crude Petroleum and Natural Gas Extra...,"C,W",Oil and Gas Production
3,5003,Long Beach Gas & Oil Dept (LBGO),2019,525804.073362,No,0.0,0,525804.073362,0,0,...,84234.249399,Positive,,Locus Technologies,Long Beach,CA,90806,221210 - Natural Gas Distribution,NN,"Supplier of Natural Gas, NGL, or LPG"
4,100187,Los Angeles Department of Water & Power-Haynes...,2019,1185578.559992,No,1185578.559992,0,0.0,0,0,...,639.860721,Positive,,"Lincus, Inc.",Long Beach,CA,90803,221112 - Fossil Fuel Electric Power Generation,"C,D",In-State Electricity Generation
5,101697,"New NGC, Inc - Long Beach",2019,35054.509766,No,35054.509766,0,0.0,0,0,...,0.0,Positive,Positive,SCS Engineers,Long Beach,CA,90813,327420 - Gypsum Product Manufacturing,C,Other Combustion Source
6,100208,NRG Energy - Long Beach,2019,13135.157372,No,13135.157372,0,0.0,0,0,...,0.0,Positive,,Ashworth Leininger Group,Long Beach,CA,90802,221112 - Fossil Fuel Electric Power Generation,D,In-State Electricity Generation
7,100063,Southeast Resource Recovery Facility (SERRF),2019,359127.135564,No,132356.135564,226771,0.0,0,0,...,226771.0,Positive,,"Trinity Consultants, Inc.",Long Beach,CA,90802,562213 - Solid Waste Combustors and Incinerators,C,In-State Electricity Generation
