In [60]:
import pandas as pd
import datetime
from bs4 import BeautifulSoup
import requests
from unicodedata import normalize
import re

In [6]:
# Load the cab trip records from CSV into a DataFrame
cab_data = pd.read_csv("cab_data_p2.csv")

In [91]:
# Get weather data for 2016
start_date = datetime.datetime(2016, 1, 1)
end_date = datetime.datetime(2016, 12, 31)
# Maps each successfully scraped day (datetime at midnight) to its
# {feature name: numeric value} dictionary
date_to_features = dict()
# weather related features to extract
feature_names = ["Mean Temperature",
                "Max Temperature",
                "Min Temperature",
                "Dew Point",
                "Average Humidity",
                "Precipitation",
                "Snow",
                "Wind Speed",
                "Visibility"]
# station: KNYC (Central Park, New York)
url_format_string = "http://www.wunderground.com/history/airport/KNYC/{year}/{month}/{day}/DailyHistory.html"
# Give up on a request after this long instead of hanging the whole run
REQUEST_TIMEOUT_SECONDS = 30
# First (optionally negative) decimal number in a table cell
number_pattern = re.compile(r'-?(?:\d+(?:\.\d*)?|\.\d+)')


def parse_weather_value(cell_text):
    """Return the first number found in a table cell as a float.

    The sign is preserved so that sub-zero readings are not stored as
    positive values, and only the first number is used so that cells holding
    several numbers do not have their digits concatenated.

    Cells without any number (e.g. "T" for traces of precipitation/snow)
    yield 0.0.
    """
    # Treat the Unicode minus sign like an ASCII hyphen so float() accepts it
    match = number_pattern.search(cell_text.strip().replace('\u2212', '-'))
    if match is None:
        return 0.0
    return float(match.group())


def fetch_daily_features(day_date):
    """Download and parse the weather features for one day.

    Parameters:
        day_date: datetime whose year/month/day select the page to fetch.

    Returns:
        A dict with one float per entry of ``feature_names`` (0.0 for any
        feature missing from the page), or None when the page could not be
        downloaded or does not contain the history table. Failures are
        reported with print() so the run continues with the next day.
    """
    (year, month, day) = (day_date.year, day_date.month, day_date.day)
    error_message = "An error occurred while getting NYC weather data for {day}-{month}-{year}".format(
        year=year, month=month, day=day)

    # Get the HTML of the weather webpage
    url = url_format_string.format(year=year, month=month, day=day)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        # Network failures are handled like a bad status code: report and skip
        print(error_message, "({})".format(error))
        return None
    if response.status_code != 200:
        print(error_message)
        return None

    soup = BeautifulSoup(response.content, "lxml")
    table = soup.find(attrs={'id': 'historyTable'})
    if table is None:
        print(error_message, "(no 'historyTable' element in the page)")
        return None

    # If a feature is not present in the table, it keeps the default value: 0
    features = dict.fromkeys(feature_names, 0.0)
    # Loop through the entries of the table to find weather features
    for tr in table.find_all('tr'):
        # normalize the text to account for string encoding
        row = [normalize('NFKD', cell.text) for cell in tr.find_all('td')]

        # There are multiple rows named 'snow' and 'precipitation'
        # Use len(row) > 2 to get the "right" rows
        if len(row) > 2 and row[0] in feature_names:
            features[row[0]] = parse_weather_value(row[1])
    return features


date = start_date
while (date <= end_date):
    print(date.year, date.month, date.day)
    features = fetch_daily_features(date)
    # Days that failed to download are simply absent from date_to_features
    if features is not None:
        date_to_features[date] = features
    date = date + datetime.timedelta(1)

2016 1 1
2016 1 2
2016 1 3
2016 1 4
2016 1 5
2016 1 6
2016 1 7
2016 1 8
2016 1 9
2016 1 10
2016 1 11
2016 1 12
2016 1 13
2016 1 14
2016 1 15
2016 1 16
2016 1 17
2016 1 18
2016 1 19
2016 1 20
2016 1 21
2016 1 22
2016 1 23
2016 1 24
2016 1 25
2016 1 26
2016 1 27
2016 1 28
2016 1 29
2016 1 30
2016 1 31
2016 2 1
2016 2 2
2016 2 3
2016 2 4
2016 2 5
2016 2 6
2016 2 7
2016 2 8
2016 2 9
2016 2 10
2016 2 11
2016 2 12
2016 2 13
2016 2 14
2016 2 15
2016 2 16
2016 2 17
2016 2 18
2016 2 19
2016 2 20
2016 2 21
2016 2 22
2016 2 23
2016 2 24
2016 2 25
2016 2 26
2016 2 27
2016 2 28
2016 2 29
2016 3 1
2016 3 2
2016 3 3
2016 3 4
2016 3 5
2016 3 6
2016 3 7
2016 3 8
2016 3 9
2016 3 10
2016 3 11
2016 3 12
2016 3 13
2016 3 14
2016 3 15
2016 3 16
2016 3 17
2016 3 18
2016 3 19
2016 3 20
2016 3 21
2016 3 22
2016 3 23
2016 3 24
2016 3 25
2016 3 26
2016 3 27
2016 3 28
2016 3 29
2016 3 30
2016 3 31
2016 4 1
2016 4 2
2016 4 3
2016 4 4
2016 4 5
2016 4 6
2016 4 7
2016 4 8
2016 4 9
2016 4 10
2016 4 11
2016 4 12
2016 4

In [89]:
# insert method for dictionary (list of values for each key)
def dict_insert(d, key, val):
    """Append ``val`` to the list stored under ``key`` in ``d``.

    A new single-element list is created the first time ``key`` is seen.
    The dictionary is modified in place and nothing is returned.
    """
    d.setdefault(key, []).append(val)

# put the weather-related features into the dataframe
def create_weather_features(df, features_by_date=None):
    """Add one column per weather feature to ``df``, keyed on the dropoff day.

    Parameters:
        df: DataFrame with a "tpep_dropoff_datetime" column holding timestamps
            formatted like '2016-04-12T10:28:02.000'.
        features_by_date: optional mapping from a datetime at midnight to a
            {feature name: value} dict. Defaults to the notebook-level
            ``date_to_features`` mapping.

    Returns:
        The same ``df`` object, which is modified in place: each feature name
        becomes a column whose value on every row is the value recorded for
        that row's dropoff day.

    Raises:
        KeyError: if a dropoff day has no entry in the mapping.
        ValueError: if a timestamp does not match the expected format, or if
            the per-day dictionaries do not all have the same feature names.
    """
    if features_by_date is None:
        # Fall back to the mapping built by the scraping cell
        features_by_date = date_to_features
    dt_format = '%Y-%m-%dT%H:%M:%S.000'
    dropoff_times = df["tpep_dropoff_datetime"]
    if len(dropoff_times) == 0:
        # Nothing to look up, so no columns are added
        return df

    # Parse the whole column at once, then truncate every timestamp to its day
    parsed = pd.to_datetime(dropoff_times, format=dt_format)
    days = pd.DatetimeIndex(parsed).normalize().to_pydatetime()
    # Distinct days in order of first appearance
    unique_days = list(dict.fromkeys(days))

    missing = [day for day in unique_days if day not in features_by_date]
    if missing:
        raise KeyError("No weather data for dropoff date(s): {}".format(
            ", ".join(str(day) for day in missing[:5])))

    # Validate before touching df so a failure cannot leave it half-updated
    column_names = list(features_by_date[unique_days[0]])
    for day in unique_days:
        if set(features_by_date[day]) != set(column_names):
            raise ValueError(
                "Weather features for {} do not match those of {}".format(
                    day, unique_days[0]))

    for feature_name in column_names:
        value_for_day = {day: features_by_date[day][feature_name]
                         for day in unique_days}
        # Assign a plain list so the values are placed positionally
        df[feature_name] = [value_for_day[day] for day in days]
    return df

# Attach the per-day weather columns to every trip; the function adds the
# columns to the DataFrame it is given and returns that same object
cab_data = create_weather_features(cab_data)

Mean Temperature
492194
Max Temperature
492194
Min Temperature
492194
Dew Point
492194
Average Humidity
492194
Precipitation
492194
Snow
492194
Wind Speed
492194
Visibility
492194


In [90]:
# Inspect our newly created weather features by printing the first row,
# which lists every column of that trip together with its value
print(cab_data.iloc[0])

Unnamed: 0                                500324
dropoff_latitude                          40.774
dropoff_longitude                       -73.8709
extra                                          0
fare_amount                                   31
improvement_surcharge                        0.3
mta_tax                                      0.5
passenger_count                                2
payment_type                                   1
pickup_latitude                          40.7569
pickup_longitude                        -73.9731
ratecodeid                                     1
store_and_fwd_flag                             N
tip_amount                                     5
tolls_amount                                5.54
total_amount                               42.34
tpep_dropoff_datetime    2016-04-12T10:28:02.000
tpep_pickup_datetime     2016-04-12T10:05:28.000
trip_distance                               10.7
vendorid                                       1
Mean Temperature    