# Future weather forecasting project

##### This project forecasts future weather. Although it may look like an easy task, it is not: weather is hard to predict, and even getting an approximate value counts as an achievement!

The data will be collected by scraping the website [Wunderground](https://www.wunderground.com/). I will use the Selenium framework with a Chrome browser to scrape the data from the website.

New York City was chosen as the source of the data, using the LaGuardia Airport station (KLGA), because the United States is one of the leading countries in meteorology.

# Data collection

## Preparation

In [12]:
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import pandas as pd
import statistics

In [13]:
# Initialize Selenium WebDriver
# The objects created here (browser, wait) are module-level and are used by
# the scraping functions defined in the later cells.

chrome_options = Options()
# Enable headless mode
chrome_options.add_argument("--headless")
# Start Chrome with the options configured above.
browser = webdriver.Chrome(options=chrome_options)
# Shared explicit-wait helper bound to this browser; 10 is the timeout
# passed to WebDriverWait (seconds, per the Selenium API).
wait = WebDriverWait(browser, 10)

#### Create a DataFrame to store the scraped data

In [14]:
# Empty table that the scraping functions fill in, one row per scraped date.
# Column names are used verbatim as keys by the scraping functions.
df = pd.DataFrame(
    columns=[
        'Date',
        'Humidity_Avg',
        'Wind_Type',
        'Pressure_Avg',
        'Condition',
        'Temperature_Avg',
        'Temperature_Historic',
        'Precipitatio_Actual',
        'Precipitation_Historic',
        'Dew_Point',
        'Max_Wind_Speed',
        'Sea_Level_Pressure',
        'Day_Length',
    ]
)

In [15]:
def get_web_element_data(column_name , xpath, df_index):
    """
    Scrape the text of one page element and store it in the global DataFrame.

    Uses the module-level ``wait`` to wait until an element matching
    ``xpath`` is present, then writes its text to ``df``. This is a
    best-effort lookup: if the element cannot be read, ``None`` is stored
    instead so the rest of the row can still be scraped.

    Args:
        column_name (str): The name of the column in the DataFrame where the scraped data will be saved.
        xpath (str): The XPath of the element from which data will be scraped on the website.
        df_index (int): The row label in the DataFrame that the value is written to.

    Returns:
        None
    """
    try:
        element = wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
        value = element.text
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long scraping run.
        value = None
    df.loc[df_index, column_name] = value

In [16]:
def _mode_or_none(values):
    """Return the most common item of ``values``, or None when it is empty."""
    if not values:
        return None
    return statistics.mode(values)


def get_data_from_weather_table(weather_table,df_index):
    """
    Store the daily mode of the observations in a scraped weather table.

    The table text is split into lines, the first 10 lines are skipped
    (presumably the header cells -- confirm against the page layout), and each
    remaining line is split on whitespace. Fixed token positions are then
    collected per row and the most frequent value of each is written to the
    global DataFrame columns 'Humidity_Avg', 'Wind_Type', 'Pressure_Avg' and
    'Condition'.

    When the table is missing, has no data rows, or a row is too short to
    contain the expected tokens, the affected values are skipped; a column
    with no collected values is stored as None instead of raising.

    Args:
        weather_table (webElement): A web element representing a table containing important data,
            or None if the table was not found.
        df_index (int): The row label in the DataFrame that the values are written to.

    Returns:
        None
    """
    humidity_list = []
    wind_type_list = []
    pressure_list = []
    condition_list = []

    # If the weather_table does exist on the site
    if weather_table:
        weather_rows = weather_table.text.strip().split('\n')[10:]
        for row in weather_rows:
            values = row.split()
            # Index 17 is the highest position that is always read; shorter
            # rows cannot be parsed and are skipped.
            if len(values) < 18:
                continue
            humidity_list.append(values[6] + ' ' + values[7])
            wind_type_list.append(values[8])
            pressure_list.append(values[13] + ' ' + values[14])
            # The condition may be one or two words long.
            if len(values) > 18:
                condition_list.append(values[17] + ' ' + values[18])
            else:
                condition_list.append(values[17])

    # The site displays several observations per day.
    # The mode of these values is used to represent the entire day.
    df.loc[df_index, 'Humidity_Avg'] = _mode_or_none(humidity_list)
    df.loc[df_index, 'Wind_Type'] = _mode_or_none(wind_type_list)
    df.loc[df_index, 'Pressure_Avg'] = _mode_or_none(pressure_list)
    df.loc[df_index, 'Condition'] = _mode_or_none(condition_list)

In [17]:
def scrape_data_for_date(browser, wait, current_date,df_index):
    """
    Scrape the data from the site for a specific date into the global DataFrame.

    Loads the daily history page for ``current_date``, stores the date, reads
    the summary-table cells through ``get_web_element_data`` and finally hands
    the observations table (or None if it could not be located) to
    ``get_data_from_weather_table``. Any exception is printed rather than
    raised, so one failing date does not stop a longer run.

    Args:
        browser (webdriver): A webdriver object used to load the page.
        wait (WebDriverWait object): Wait helper used here to locate the observations table.
            The summary cells are read by ``get_web_element_data``, which uses the
            module-level ``wait``.
        current_date (datetime): The date to scrape; it is interpolated into the URL as-is.
        df_index (int): The row label in the DataFrame that the values are written to.

    Returns:
        None
    """
    # Common XPath prefix of the summary table; each cell path is appended to it.
    summary_table_xpath = (
        '//*[@id="inner-content"]/div[2]/div[1]/div[3]/div[1]/div'
        '/lib-city-history-summary/div/div[2]/table'
    )
    # (DataFrame column, cell path relative to the summary table).
    # For precipitation, td[1] is stored as the actual value and td[2] as the
    # historic one, mirroring the temperature row.
    summary_cells = (
        ('Temperature_Avg', '/tbody[1]/tr[3]/td[1]'),
        ('Temperature_Historic', '/tbody[1]/tr[3]/td[2]'),
        ('Precipitatio_Actual', '/tbody[2]/tr/td[1]'),
        ('Precipitation_Historic', '/tbody[2]/tr/td[2]'),
        ('Dew_Point', '/tbody[3]/tr[1]/td[1]'),
        ('Max_Wind_Speed', '/tbody[4]/tr[1]/td[1]'),
        ('Sea_Level_Pressure', '/tbody[5]/tr/td[1]'),
        ('Day_Length', '/tbody[6]/tr[1]/td[1]'),
    )

    # Print the current_date just to see the progress of the scraping
    print(f"Scraping data for {current_date}")
    try:
        browser.get(f'https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/{current_date}')
        df.loc[df_index, 'Date'] = current_date
        for column_name, cell_path in summary_cells:
            get_web_element_data(column_name, summary_table_xpath + cell_path, df_index)

        try:
            weather_table = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'mat-table')))
        except Exception as table_error:
            # The table could not be located: report it and continue without it.
            print(f"An error occurred: {str(table_error)}")
            weather_table = None
        get_data_from_weather_table(weather_table,df_index)

    except Exception as e:
        print(f"An error occurred: {str(e)}")

In [18]:
def scrape_data(start_date, end_date):
    """
    Scrape the site once per day over an inclusive range of dates.

    Each day is passed to ``scrape_data_for_date`` together with the
    module-level ``browser`` and ``wait`` objects.

    Args:
        start_date (datetime): First day to scrape.
        end_date (datetime): Last day to scrape (included).

    Returns:
        None
    """
    one_day = timedelta(days=1)
    day = start_date
    while day <= end_date:
        # The current number of rows is the label of the next row to fill.
        scrape_data_for_date(browser, wait, day, len(df.index))
        day = day + one_day

## Start scraping

In [None]:
# Date range to scrape: 1 January 2009 through 30 December 2023, both included.
start_date = datetime(year=2009, month=1, day=1)
end_date = datetime(year=2023, month=12, day=30)

# Run the scraper over the whole range.
scrape_data(start_date, end_date)

## Save the data to CSV file

In [21]:
# Write the scraped DataFrame to weather.csv, leaving out the index column.
df.to_csv('weather.csv',index=False)

## Close the browser driver

In [22]:
# Scraping is finished, so shut down the browser driver created earlier.
browser.quit()