**This weather scraping script was borrow from someone else and modified.**  
The original author is [Matthew Lee](https://github.com/silvernine209)

In [12]:
# import web driver
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from seleniumrequests import Chrome

from bs4 import BeautifulSoup
import requests
import time
import random
import re
import getpass
import os
import pandas as pd
import numpy as np
import datetime as dt
import pickle

%matplotlib inline 

# Things to do
* 1.) Specify year & month combo you want from "year_month_list" 
* 2.) Specify the name of the city by assigning it to "city_name"
* 3.) Run "Initiate Scraping" to start scraping
    * Make sure to maximize selenium chrome webpage's screen.
    * Code will save each month individually & if code breaks. You can simply rerun. Code will look at what's already scraped and continue where it left off.
* 4.) Once done with scraping, run "Compile Individual Files & Pickle It" to compile the data

In [13]:
#pull monthly data for 2014-2016
year_month_list =[   '2014-01',
                     '2014-02',
                     '2014-03',
                     '2014-04',
                     '2014-05',
                     '2014-06',
                     '2014-07',
                     '2014-08',
                     '2014-09',
                     '2014-10',
                     '2014-11',
                     '2014-12',
                     '2015-01',
                     '2015-02',
                     '2015-03',
                     '2015-04',
                     '2015-05',
                     '2015-06',
                     '2015-07',
                     '2015-08',
                     '2015-09',
                     '2015-10',
                     '2015-11',
                     '2015-12',
                     '2016-01',
                     '2016-02',
                     '2016-03',
                     '2016-04',
                     '2016-05',
                     '2016-06',
                     '2016-07',
                     '2016-08',
                     '2016-09',
                     '2016-10',
                     '2016-11',
                     '2016-12']

In [14]:
#This section has to be populated with the proper link and county name for each county in NYS
city_name = "@5128316" #link stump for timeanddate.com
county_dan = "NASSAU" # acually county


'\nCHANGE BOTH + URL\n'

# Scraping Function.

In [15]:
def scrape_weather_url(url):
    # weather data holder to be inserted to pandas dataframe
    high_low, weather_desc, humidity_barometer, wind, date_time = [], [], [], [], []
    
    # open url
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    days_chain = [x.find_all('a') for x in soup.find_all(class_='weatherLinks')]
    time.sleep(5)
    
    # Load Entire Page by Scrolling to charts
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/3.7);") # Scroll down
    
    # First load of each month takes extra long time. Therefore 'counter' variable is used to run else block first
    counter = 0
    for ix,link in enumerate(days_chain[0]):
        
        '''
        Bottom section tries to solve loading issue by implementing wait feature
        Refer : https://selenium-python.readthedocs.io/waits.html
        '''
        wait = WebDriverWait(driver, 10)
        if counter!=0:
            delay = 3 # seconds
            try:
                myElem = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'weatherLinks')))
            except TimeoutException:
                print("Loading took too much time!" ) 
            day_link = driver.find_element_by_xpath("//div[@class='weatherLinks']/a[{}]".format(ix+1))
            wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='weatherLinks']/a[{}]".format(ix+1))))
            day_link.click()
        else:
            delay = 5 # seconds
            try:
                myElem = WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'weatherLinks')))
            except TimeoutException:
                print("Loading took too much time!" ) 
            day_link = driver.find_element_by_xpath("//div[@class='weatherLinks']/a[{}]".format(ix+1))
            wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='weatherLinks']/a[{}]".format(ix+1))))
            time.sleep(4)
            day_link.click()
            time.sleep(3)
            counter+=1
        
        # Wait a bit for the Javascript to fully load data to be scraped
        time.sleep(2.5)
            
        # Scrape weather data
        high_low.insert(0,driver.find_elements_by_xpath("//div[@class='temp']")[-1].text) #notice elements, s at the end. This returns a list, and I can index it.
        weather_desc.insert(0,driver.find_element_by_xpath("//div[@class='wdesc']").text)
        humidity_barometer.insert(0,driver.find_element_by_xpath("//div[@class='mid__block']").text)
        wind.insert(0,driver.find_element_by_xpath("//div[@class='right__block']").text)
        date_time.insert(0,driver.find_elements_by_xpath("//div[@class='date']")[-1].text)
    return high_low, weather_desc, humidity_barometer, wind, date_time 

# Initiate Scraping

In [16]:
# Initiate Selenium Chrome driver for Mac
git_folder_location = os.path.abspath(os.path.dirname('metis_proj_2_luther'))
full_path_to_chromedriver = os.path.join(git_folder_location, "chromedriver")
driver = Chrome(executable_path = full_path_to_chromedriver)

# Create "weather_data" folder if it's not there
git_folder_location = os.path.abspath(os.path.dirname('metis_proj_2_luther'))
if 'weather_data' not in os.listdir(git_folder_location):
    !mkdir 'weather_data'

# already scraped (list from what's already saved in the folder)
done_list = os.listdir(git_folder_location+'/weather_data/')
if '.DS_Store' in done_list:
    done_list.remove('.DS_Store')
    
#done_list.remove('.DS_Store')
done_list_processed = [x[11:][:-4].split('_') for x in done_list]
for date in done_list_processed:
    if len(date[0])<2:
        date[0]='0'+date[0]
done_list_processed = [x[1]+'-'+x[0] for x in done_list_processed]

# remove done items from list of month & year combo that need to be scraped
year_month_list_done_removed = year_month_list.copy()
for done_item in done_list_processed:
    year_month_list_done_removed.remove(done_item)

# iterate through each year & month combo to scrape
for date in year_month_list_done_removed:
    # define initial empty dataframe
    df_weather = pd.DataFrame({'DATE_TIME':[], 'HIGH_LOW':[], 'WEATHER_DESC':[],'HUMIDITY_BAROMETER':[],'WIND':[]})
    month = int(date[5:])
    year = date[:4]
    #url = 'https://www.timeanddate.com/weather/usa/{}/historic?month={}&year={}'.format(city_name,month,year)
    url = 'https://www.timeanddate.com/weather/{}/historic?month={}&year={}'.format(city_name,month,year)
    high_low, weather_desc, humidity_barometer, wind, date_time = scrape_weather_url(url)
    df_weather_holder = pd.DataFrame({'DATE_TIME':date_time, 'HIGH_LOW':high_low, 'WEATHER_DESC':weather_desc,'HUMIDITY_BAROMETER':humidity_barometer,'WIND':wind})
    df_weather = df_weather.append(df_weather_holder)
    df_weather.to_csv('weather_data/df_weather_{}_{}.csv'.format(month,year),index=False)
    

# Compile Individual Files & Pickle It

In [17]:
# Obtain scraped data files per month
done_list = os.listdir(git_folder_location+'/weather_data/')
if '.DS_Store' in done_list:
    done_list.remove('.DS_Store')

# Initiate empty dataframe for weather data to compile individual files
df_weather = pd.DataFrame({'DATE_TIME':[], 'HIGH_LOW':[], 'WEATHER_DESC':[],'HUMIDITY_BAROMETER':[],'WIND':[]})

# Concat all individual files
for file_name in done_list:
    file = pd.read_csv('weather_data/'+file_name)
    df_weather = pd.concat([df_weather,file],ignore_index=True,axis=0)

# Remove duplicate rows. (Mistake created by earlier code imperfection that was corrected later)
df_weather.drop_duplicates(inplace=True)

# Process and create new columns with individual features for further feature engineering
df_weather['DATE_TIME_PROCESSED'] = df_weather['DATE_TIME'].apply(lambda x : x.split(',')[1]+x.split(',')[2])
df_weather['DATE_TIME_PROCESSED'] = pd.to_datetime(df_weather['DATE_TIME_PROCESSED'], format = ' %B %d %Y')
df_weather['T_HIGH_F'] = df_weather['HIGH_LOW'].apply(lambda x : x.split('/')[0].strip())
df_weather['T_LOW_F'] = df_weather['HIGH_LOW'].apply(lambda x : x.split('/')[1].strip().split()[0])
df_weather['HUMIDITY_%'] = df_weather['HUMIDITY_BAROMETER'].apply(lambda x : x.split()[1][:-1])
df_weather['BAROMETER_HG'] = df_weather['HUMIDITY_BAROMETER'].apply(lambda x : x.split()[3])
df_weather['WIND_DIRECTION'] = df_weather['WIND'].apply(lambda x : x.split()[0])
df_weather['WIND_MPH'] = df_weather['WIND'].apply(lambda x : x.split()[2])
df_weather['County_join'] = county_dan

# Drop raw columns that contained multiple featuers as texts
df_weather_processed = df_weather.drop(['DATE_TIME', 'HIGH_LOW','HUMIDITY_BAROMETER', 'WIND'], axis=1)

# Remove rows for which weather data was not available
df_weather_processed = df_weather_processed[df_weather_processed['T_HIGH_F']!='N']

# Turn object dtypes to floats for temperatures, humidity, barometer reading, and wind intensity. 
convert_to_int_list = ['T_LOW_F','T_HIGH_F','HUMIDITY_%','BAROMETER_HG','WIND_MPH']
for feature in convert_to_int_list:
        df_weather_processed[feature] = df_weather_processed[feature].astype(float)

# Save clean data
with open('df_weather_clean.pkl', 'wb') as picklefile:
    pickle.dump(df_weather_processed, picklefile)

# Load Pickled File

In [None]:
# Read the saved file.
with open('df_weather_clean.pkl', 'rb') as picklefile: 
    df_weather_processed = pickle.load(picklefile)

In [18]:
df_weather_processed

Unnamed: 0,WEATHER_DESC,DATE_TIME_PROCESSED,T_HIGH_F,T_LOW_F,HUMIDITY_%,BAROMETER_HG,WIND_DIRECTION,WIND_MPH,County_join
0,Clear.,2016-05-26,75.0,70.0,41.0,30.07,W,3.107,NASSAU
6,Clear.,2016-05-25,64.0,61.0,64.0,30.01,WNW,2.486,NASSAU
7,Light rain. Fog.,2016-05-24,63.0,57.0,79.0,29.88,W,2.486,NASSAU
8,Clear.,2016-05-23,57.0,55.0,80.0,29.89,N,0.000,NASSAU
9,Overcast.,2016-05-22,54.0,52.0,82.0,29.82,N,4.971,NASSAU
10,Clear.,2016-05-21,63.0,57.0,47.0,30.17,WSW,1.243,NASSAU
11,Clear.,2016-05-20,59.0,54.0,49.0,30.17,WNW,4.350,NASSAU
12,Overcast.,2016-05-19,55.0,55.0,58.0,30.12,N,0.000,NASSAU
13,Cool.,2016-05-18,54.0,52.0,76.0,0.00,N,0.000,NASSAU
14,Refreshingly cool.,2016-05-17,57.0,55.0,44.0,10.06,N,1.243,NASSAU


In [9]:
print(df_weather_processed.shape)

(915, 9)
