In [51]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time

driver = webdriver.Firefox()
# URL of the Hydro-Québec electricity contracts page
url = "https://www.hydroquebec.com/electricity-purchases-quebec/electricity-contracts.html"

# Open the webpage
driver.get(url)

time.sleep(5)

# Scroll to the bottom to ensure all data is loaded
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)

# Extract wind farm data
wind_farms = []
coordinates = []

for i in range(1, len(driver.find_elements(By.XPATH,  "/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div/div[4]/ul/li"))+1):
    name = driver.find_element(By.XPATH, f"/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div/div[4]/ul/li[{i}]/div/h2/span").get_attribute('textContent')
    farm_properties = driver.find_elements(By.XPATH, f"/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div/div[4]/ul/li[{i}]/div/ul/li")
    for pr in farm_properties:
        pr_text = pr.get_attribute("textContent")
        if 'Capacity' in pr_text:
            capacity = pr_text.strip('Capacity: ')
        elif 'Status' in pr_text:
            status = pr_text.strip('Status: ')
        elif 'commissioning' in pr_text:
            commissioning_date = pr_text.strip('Date of commissioning: ')
        elif 'Region' in pr_text:
            region = pr_text.strip('Region: ')
        elif 'Type' in pr_text:
            project_type = pr_text.strip('Type of project: ')
    wind_farms.append([name, project_type, capacity, region, status, commissioning_date])

for el in driver.find_elements(By.XPATH, "/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div/div[3]/div[1]/ng-map/marker"):
    coordinates.append([el.get_attribute('title'), el.get_attribute('position')])

driver.quit()

df_properties = pd.DataFrame(wind_farms, columns=['name', 'project_type', 'capacity_MW', 'region', 'status', 'commissioning_date'])
# Convert capacity to numbers
df_properties['capacity_MW'] = df_properties['capacity_MW'].str.strip('MW')
df_properties['capacity_MW'] = df_properties['capacity_MW'].str.replace('\xa0', '')
df_properties['capacity_MW'] = pd.to_numeric(df_properties['capacity_MW'])

# Convert commissioning date to datetime
### leaving it out for now, a few instances have Phase 1, Phase 2, Phase 3
#df_properties['commissioning_date'] = pd.to_datetime(df_properties['commissioning_date'])

df_coordinates = pd.DataFrame(coordinates, columns=['name', 'position'])
# Separate position into numeric longitude and latitude
df_coordinates[['latitude', 'longitude']] = df_coordinates['position'].str.strip('[]').str.split(',', expand=True)
df_coordinates['latitude'] = pd.to_numeric(df_coordinates['latitude'])
df_coordinates['longitude'] = pd.to_numeric(df_coordinates['longitude'])
df_coordinates = df_coordinates.drop('position', axis=1)


df = pd.merge(df_properties, df_coordinates, on='name')
# Display the DataFrame
print(df.head())

# Save only the wind farms data to a CSV file
df.query('project_type=="Wind farm"').to_csv('hydroquebec_wind_farms.csv', index=False)


                        name project_type  capacity_MW  \
0  Baie-des-Sables wind farm    Wind farm       109.50   
1         Carleton wind farm    Wind farm       109.50   
2     Mont-Rothery wind farm    Wind farm        74.00   
3      De L'Érable wind farm    Wind farm       100.00   
4      Des Moulins wind farm    Wind farm       156.85   

                      region      status commissioning_date   latitude  \
0          Bas-Saint-Laurent  In service         2006-11-22  48.702210   
1  Gaspésie–Îles-de-la-Madel  In service         2008-11-22  48.202737   
2  Gaspésie–Îles-de-la-Madel  In service         2015-12-01  48.978875   
3           Centre-du-Québec  In service         2013-11-16  46.096983   
4       Chaudière-Appalaches  In service         2013-12-07  46.173945   

   longitude  
0 -67.872489  
1 -66.128295  
2 -65.373544  
3 -71.647639  
4 -71.351327  
