In [1]:
import time
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.service import Service #Use webdriver.chrome.service for Chrome

In [2]:
# Browser driver setup.
# Firefox: put the geckodriver folder (containing geckodriver) in the current working directory.
# Chrome: put the chromedriver folder (containing chromedriver) in the current working directory
# and use 'chromedriver/chromedriver' as the service path.

# Page that the rest of the notebook loads and scrapes.
url = 'https://ww2.energy.ca.gov/almanac/renewables_data/solar/index_cms.php'

op = webdriver.FirefoxOptions()
ser = Service('geckodriver/geckodriver')  # path is relative to the working directory
driver = webdriver.Firefox(options=op, service=ser)

In [3]:
#uncomment and run to close window
#driver.close()

In [4]:
# Load the page at `url` in the browser session held by `driver`.
driver.get(url)

In [None]:
# Collect one page-source string per year, 2001-2021, in descending year order.
# Options 2 through 22 of the form's <select> are chosen one at a time; after each
# choice the form's <input> is clicked and the resulting page source is stored.
html_list = []
for option_idx in range(2, 23):
    year_option = driver.find_element('xpath', f'/html/body/div[3]/main/div/div/div[3]/form/select/option[{option_idx}]')
    year_option.click()
    submit_input = driver.find_element('xpath', '/html/body/div[3]/main/div/div/div[3]/form/input')
    submit_input.click()
    # Fixed one-second pause before reading the page (presumably to let it refresh).
    time.sleep(1)
    html_list.append(driver.page_source)

In [None]:
# Solar Thermal Production
# Take the first table (index 0) from every stored page and stack them into one frame.
# html_list[i] is treated as year 2021 - i for the progress message.
solar_therm_frames = []
for i, page_html in enumerate(html_list):
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in pandas >= 2.1.
    solar_therm_frames.append(pd.read_html(StringIO(page_html))[0])
    print(f'Year {2021 - i} completed.')

# Concatenate once at the end instead of re-copying the growing frame on every iteration.
solar_therm_all = pd.concat(solar_therm_frames)

In [None]:
# Display the stacked solar thermal table.
solar_therm_all

In [None]:
# Solar PV Production
# Take the second table (index 1) from every stored page and stack them into one frame.
# html_list[i] is treated as year 2021 - i for the progress message.
solar_pv_frames = []
for i, page_html in enumerate(html_list):
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in pandas >= 2.1.
    solar_pv_frames.append(pd.read_html(StringIO(page_html))[1])
    print(f'Year {2021 - i} completed.')

# Concatenate once at the end instead of re-copying the growing frame on every iteration.
solar_pv_all = pd.concat(solar_pv_frames)

In [None]:
# Display the stacked solar PV table.
solar_pv_all

In [None]:
# Solar Imports Production (PV + thermal)
# Take the third table (index 2) from every stored page and stack them into one frame.
# html_list[i] is treated as year 2021 - i for the progress message.
solar_imports_frames = []
for i, page_html in enumerate(html_list):
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in pandas >= 2.1.
    solar_imports_frames.append(pd.read_html(StringIO(page_html))[2])
    print(f'Year {2021 - i} completed.')

# Concatenate once at the end instead of re-copying the growing frame on every iteration.
solar_imports_all = pd.concat(solar_imports_frames)

In [None]:
# Display the stacked solar imports table.
solar_imports_all

In [None]:
# Solar Annual Totals (PV + thermal + imports) - 1 chart for 2021-1983, present on all pages,
# so it is read only once, from the first stored page (table index 3).

# Wrap the markup in StringIO: passing a literal HTML string to read_html
# is deprecated in pandas >= 2.1.
solar_annual_all = pd.read_html(StringIO(html_list[0]))[3]

In [None]:
# Display the annual totals table.
solar_annual_all

In [None]:
# Solar by County (PV + Thermal)
# Take the fifth table (index 4) from every stored page, tag each with its year,
# and stack them into one frame. html_list[i] is treated as year 2021 - i.
solar_county_frames = []
for i, page_html in enumerate(html_list):
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in pandas >= 2.1.
    county_frame = pd.read_html(StringIO(page_html))[4]
    county_frame['year'] = 2021 - i
    solar_county_frames.append(county_frame)
    print(f'Year {2021 - i} completed.')

# Concatenate once at the end instead of re-copying the growing frame on every iteration.
solar_county_all = pd.concat(solar_county_frames)


In [None]:
# Display the stacked per-county table (includes the added 'year' column).
solar_county_all

In [None]:
# Scraping is finished: end the WebDriver session. quit() closes every window and
# also stops the background driver process, whereas close() only closes the
# current window.
driver.quit()

### Initial Cleaning
 - Combine the PV and thermal production DataFrames and drop the unnamed columns

In [None]:
# Preview the first rows of the PV table before cleaning.
solar_pv_all.head()

In [None]:
# Flatten multi-level column labels to their second level.
# The MultiIndex guard makes this cell safe to re-run: once the labels are plain
# strings, col[1] would pick the second *character* of each name and silently
# corrupt the header.
if isinstance(solar_pv_all.columns, pd.MultiIndex):
    solar_pv_all.columns = [col[1] for col in solar_pv_all.columns]

In [None]:
# Remove the placeholder 'Unnamed: 7_level_1' column from the PV table.
solar_pv_all = solar_pv_all.drop(columns=['Unnamed: 7_level_1'])

In [None]:
# Label every PV row so it stays identifiable after being combined with other tables.
solar_pv_all = solar_pv_all.assign(type='pv')

In [None]:
# Preview the first rows of the PV table after cleaning.
solar_pv_all.head()

In [None]:
# Preview the first rows of the thermal table before cleaning.
solar_therm_all.head()

In [None]:
# Flatten multi-level column labels to their second level, then drop the
# placeholder 'Unnamed: 7_level_1' column.
# The MultiIndex guard protects the header on a re-run: once the labels are plain
# strings, col[1] would pick the second *character* of each name and corrupt the
# header in place before the drop below fails.
if isinstance(solar_therm_all.columns, pd.MultiIndex):
    solar_therm_all.columns = [col[1] for col in solar_therm_all.columns]
solar_therm_all = solar_therm_all.drop('Unnamed: 7_level_1', axis = 1)

In [None]:
# Label every thermal row so it stays identifiable after being combined with other tables.
solar_therm_all = solar_therm_all.assign(type='therm')

In [None]:
# Preview the first rows of the thermal table after cleaning.
solar_therm_all.head()

In [None]:
# Stack the PV rows on top of the thermal rows; index labels are carried over from both inputs.
solar_pvtherm_all = pd.concat([solar_pv_all, solar_therm_all])

In [None]:
# Check the (rows, columns) size of the combined frame.
solar_pvtherm_all.shape

In [None]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old labels
# instead of keeping them as a column.
solar_pvtherm_all = solar_pvtherm_all.reset_index(drop=True)

In [None]:
# Remove the placeholder 'Unnamed: 7' column from the imports table.
solar_imports_all = solar_imports_all.drop(columns=['Unnamed: 7'])

In [None]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old labels
# instead of keeping them as a column.
solar_imports_all = solar_imports_all.reset_index(drop=True)

In [None]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old labels
# instead of keeping them as a column.
solar_county_all = solar_county_all.reset_index(drop=True)

### Write to .csv

In [None]:
# Write the combined PV + thermal frame to CSV. With to_csv defaults the index is
# written as the first column. The target directory is not created here.
solar_pvtherm_all.to_csv('../data/CA_Solar_Production_Data/solar_pv_thermal_prod_ca.csv')

In [None]:
# Write the imports frame to CSV. With to_csv defaults the index is written as the
# first column. The target directory is not created here.
solar_imports_all.to_csv('../data/CA_Solar_Production_Data/solar_pv_thermal_prod_imported.csv')

In [None]:
# Write the annual totals frame to CSV. With to_csv defaults the index is written
# as the first column. The target directory is not created here.
solar_annual_all.to_csv('../data/CA_Solar_Production_Data/solar_pv_thermal_prod_annual.csv')

In [None]:
# Write the per-county frame to CSV. With to_csv defaults the index is written as
# the first column. The target directory is not created here.
solar_county_all.to_csv('../data/CA_Solar_Production_Data/solar_pv_thermal_prod_county.csv')