In [56]:
import sys

print(sys.version)
print("")
print(f"Pandas: version {pd.__version__}")

3.8.3 (default, Jul  2 2020, 16:21:59) 
[GCC 7.3.0]

Pandas: version 1.1.1


In [2]:
import pandas as pd

new_york = pd.read_csv("../data/weather/new_york_ny.csv")

print(new_york.shape)
new_york.head()

(17056, 25)


Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,moon_illumination,moonrise,moonset,sunrise,...,WindGustKmph,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph,location
0,2009-01-01 00:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,38,22,62,0.0,1017,-6,10,316,27,10007
1,2009-01-01 06:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,33,7,63,0.0,1023,-8,10,315,24,10007
2,2009-01-01 12:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,26,4,42,0.0,1025,-3,10,304,23,10007
3,2009-01-01 18:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,21,16,49,0.0,1025,-5,10,294,13,10007
4,2009-01-02 00:00:00,0,0,0.1,7.0,2,38,11:28 AM,11:54 PM,08:20 AM,...,14,46,49,0.0,1023,-6,10,260,8,10007


In [3]:
new_york.columns

Index(['date_time', 'maxtempC', 'mintempC', 'totalSnow_cm', 'sunHour',
       'uvIndex', 'moon_illumination', 'moonrise', 'moonset', 'sunrise',
       'sunset', 'DewPointC', 'FeelsLikeC', 'HeatIndexC', 'WindChillC',
       'WindGustKmph', 'cloudcover', 'humidity', 'precipMM', 'pressure',
       'tempC', 'visibility', 'winddirDegree', 'windspeedKmph', 'location'],
      dtype='object')

### Planning

#### Pre-modeling Steps

**Accounting for Seasonal Variations**


- Create dynamic datetime splits separating the four seasons and taking the year as an input,
- Use these datetime splits to subset New York City's data by year and season into smaller DataFrames,
- Identify max, min, avg, and med of the following seasonal weather metrics (both metric and imperial because **AMERICA**):
    - tempC
    - FeelsLikeC, see above
    - totalSnow_cm  **<--**  if None, final app will return a verbal message saying it does not Snow there
    - humidity
    
- Store the above data in a single DataFrame and create visualizations for the annual progression of each


**NOTE:** Data will begin on the first day of Spring 2009 and be cut off on the last day of summer 2020 so as to account for potentally eroneous data arising from partial seasons. Go back and make this adjustment to the *weather.py* file.


**Applying the above process to all of the data**


- Encapsulate the above process in a function or functions,
- Devise means of applying the above function(s) to all csv files in the root data/weather directory,
- Organize and store yearly averages in a by_year DataFrame,
- Create visualizations showing how the average seasonal weather of each compares with the average of all,
- Encapsulate the above visualization process into a function for easy use.


**Testing that this process will function in Fast API in notebook**


- Replicate Ryan Herr's example notebook material for testing Fast API locally to this notebook.
- Test that the above visualization-making functions work locally.

In [26]:
new_york.dtypes

date_time             object
maxtempC               int64
mintempC               int64
totalSnow_cm         float64
sunHour              float64
uvIndex                int64
moon_illumination      int64
moonrise              object
moonset               object
sunrise               object
sunset                object
DewPointC              int64
FeelsLikeC             int64
HeatIndexC             int64
WindChillC             int64
WindGustKmph           int64
cloudcover             int64
humidity               int64
precipMM             float64
pressure               int64
tempC                  int64
visibility             int64
winddirDegree          int64
windspeedKmph          int64
location               int64
dtype: object

In [27]:
new_york.date_time

0        2009-01-01 00:00:00
1        2009-01-01 06:00:00
2        2009-01-01 12:00:00
3        2009-01-01 18:00:00
4        2009-01-02 00:00:00
                ...         
17051    2020-09-02 18:00:00
17052    2020-09-03 00:00:00
17053    2020-09-03 06:00:00
17054    2020-09-03 12:00:00
17055    2020-09-03 18:00:00
Name: date_time, Length: 17056, dtype: object

In [49]:
# Checking date_time format

date = new_york.date_time[0]
print(type(date), date)

<class 'str'> 2009-01-01 00:00:00


In [50]:
# converting datetime from string to datetime object

from datetime import datetime

date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')

print(type(date), date)

<class 'datetime.datetime'> 2009-01-01 00:00:00


In [52]:
# Applying change to all dates

new_york.date_time = new_york.date_time.apply(lambda d: datetime.strptime(d, '%Y-%m-%d %H:%M:%S'))
new_york.date_time.head()

0   2009-01-01 00:00:00
1   2009-01-01 06:00:00
2   2009-01-01 12:00:00
3   2009-01-01 18:00:00
4   2009-01-02 00:00:00
Name: date_time, dtype: datetime64[ns]

### Meterological Seasons

**Spring:** March 1 - May 31

**Summer:** June 1 - August 31

**Fall:** September 1 - November 30

**Winter:** December 1 - April 30

In [63]:
splits = []

for i in range(2009, 2021):
    splits.append(datetime(i, 3, 1))
    splits.append(datetime(i, 6, 1))
    splits.append(datetime(i, 9, 1))
    splits.append(datetime(i, 12, 1))
    
print(splits)

[datetime.datetime(2009, 3, 1, 0, 0), datetime.datetime(2009, 6, 1, 0, 0), datetime.datetime(2009, 9, 1, 0, 0), datetime.datetime(2009, 12, 1, 0, 0), datetime.datetime(2010, 3, 1, 0, 0), datetime.datetime(2010, 6, 1, 0, 0), datetime.datetime(2010, 9, 1, 0, 0), datetime.datetime(2010, 12, 1, 0, 0), datetime.datetime(2011, 3, 1, 0, 0), datetime.datetime(2011, 6, 1, 0, 0), datetime.datetime(2011, 9, 1, 0, 0), datetime.datetime(2011, 12, 1, 0, 0), datetime.datetime(2012, 3, 1, 0, 0), datetime.datetime(2012, 6, 1, 0, 0), datetime.datetime(2012, 9, 1, 0, 0), datetime.datetime(2012, 12, 1, 0, 0), datetime.datetime(2013, 3, 1, 0, 0), datetime.datetime(2013, 6, 1, 0, 0), datetime.datetime(2013, 9, 1, 0, 0), datetime.datetime(2013, 12, 1, 0, 0), datetime.datetime(2014, 3, 1, 0, 0), datetime.datetime(2014, 6, 1, 0, 0), datetime.datetime(2014, 9, 1, 0, 0), datetime.datetime(2014, 12, 1, 0, 0), datetime.datetime(2015, 3, 1, 0, 0), datetime.datetime(2015, 6, 1, 0, 0), datetime.datetime(2015, 9, 1, 0

In [64]:
splits = splits[1:]
len(splits)

47

In [65]:
# Testing split:)

spring_2009 = new_york[new_york.date_time < splits[0]]

print(spring_2009.shape)
spring_2009.head()

(604, 25)


Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,moon_illumination,moonrise,moonset,sunrise,...,WindGustKmph,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph,location
0,2009-01-01 00:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,38,22,62,0.0,1017,-6,10,316,27,10007
1,2009-01-01 06:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,33,7,63,0.0,1023,-8,10,315,24,10007
2,2009-01-01 12:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,26,4,42,0.0,1025,-3,10,304,23,10007
3,2009-01-01 18:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,21,16,49,0.0,1025,-5,10,294,13,10007
4,2009-01-02 00:00:00,0,0,0.1,7.0,2,38,11:28 AM,11:54 PM,08:20 AM,...,14,46,49,0.0,1023,-6,10,260,8,10007


In [100]:
desc = spring_2009.tempC.describe()[1:]
desc

mean     5.743377
std      7.970013
min    -14.000000
25%      0.000000
50%      6.000000
75%     11.000000
max     27.000000
Name: tempC, dtype: float64

In [89]:
# to_fahr function

def to_fahr(temp: float, system="celsius") -> float:
    """
    Converts temperature in celsius or kelvin to fahrenheit.
    """
    if type(system) != str:
        raise Exception(TypeError (f'Invalid system type {type(system)}, expected {str}'))
    elif system == "celsius":
        return ((temp * 9) / 5) + 32
    elif system == "kelvin":
        return ((temp * 9) / 5) - 459.67
    else:
        raise Exception(ValueError (f'Invalid system parameter "{system}"'))

In [101]:
# Applying to_fahr function to spring_2009 temp. stats
# in order to insure that the function works properly:)

fahr = desc.apply(lambda temp: to_fahr(temp))
fahr

mean    42.338079
std     46.346023
min      6.800000
25%     32.000000
50%     42.800000
75%     51.800000
max     80.600000
Name: tempC, dtype: float64

In [102]:
# Applying to_fahr function to the complete dataset

new_york["maxtempF"] = new_york["maxtempC"].apply(lambda temp: to_fahr(temp))
new_york["mintempF"] = new_york["mintempC"].apply(lambda temp: to_fahr(temp))
new_york["DewPointF"] = new_york["DewPointC"].apply(lambda temp: to_fahr(temp))
new_york["FeelsLikeF"] = new_york["FeelsLikeC"].apply(lambda temp: to_fahr(temp))
new_york["HeatIndexF"] = new_york["HeatIndexC"].apply(lambda temp: to_fahr(temp))
new_york["WindChillF"] = new_york["WindChillC"].apply(lambda temp: to_fahr(temp))
new_york["tempF"] = new_york["tempC"].apply(lambda temp: to_fahr(temp))

print(new_york.shape)
new_york.head()

(17056, 32)


Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,moon_illumination,moonrise,moonset,sunrise,...,winddirDegree,windspeedKmph,location,maxtempF,mintempF,DewPointF,FeelsLikeF,HeatIndexF,WindChillF,tempF
0,2009-01-01 00:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,316,27,10007,32.0,32.0,10.4,6.8,21.2,6.8,21.2
1,2009-01-01 06:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,315,24,10007,32.0,32.0,6.8,3.2,17.6,3.2,17.6
2,2009-01-01 12:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,304,23,10007,32.0,32.0,6.8,15.8,26.6,15.8,26.6
3,2009-01-01 18:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,294,13,10007,32.0,32.0,8.6,14.0,23.0,14.0,23.0
4,2009-01-02 00:00:00,0,0,0.1,7.0,2,38,11:28 AM,11:54 PM,08:20 AM,...,260,8,10007,32.0,32.0,6.8,17.6,23.0,17.6,21.2


In [103]:
new_york.columns

Index(['date_time', 'maxtempC', 'mintempC', 'totalSnow_cm', 'sunHour',
       'uvIndex', 'moon_illumination', 'moonrise', 'moonset', 'sunrise',
       'sunset', 'DewPointC', 'FeelsLikeC', 'HeatIndexC', 'WindChillC',
       'WindGustKmph', 'cloudcover', 'humidity', 'precipMM', 'pressure',
       'tempC', 'visibility', 'winddirDegree', 'windspeedKmph', 'location',
       'maxtempF', 'mintempF', 'DewPointF', 'FeelsLikeF', 'HeatIndexF',
       'WindChillF', 'tempF'],
      dtype='object')

In [104]:
# Re-subsetting Spring 2009 Data

spring_2009 = new_york[new_york.date_time < splits[0]]

print(spring_2009.shape)
spring_2009.head()

(604, 32)


Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,moon_illumination,moonrise,moonset,sunrise,...,winddirDegree,windspeedKmph,location,maxtempF,mintempF,DewPointF,FeelsLikeF,HeatIndexF,WindChillF,tempF
0,2009-01-01 00:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,316,27,10007,32.0,32.0,10.4,6.8,21.2,6.8,21.2
1,2009-01-01 06:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,315,24,10007,32.0,32.0,6.8,3.2,17.6,3.2,17.6
2,2009-01-01 12:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,304,23,10007,32.0,32.0,6.8,15.8,26.6,15.8,26.6
3,2009-01-01 18:00:00,0,0,0.0,8.7,2,31,11:07 AM,10:50 PM,08:20 AM,...,294,13,10007,32.0,32.0,8.6,14.0,23.0,14.0,23.0
4,2009-01-02 00:00:00,0,0,0.1,7.0,2,38,11:28 AM,11:54 PM,08:20 AM,...,260,8,10007,32.0,32.0,6.8,17.6,23.0,17.6,21.2


In [106]:
# Subsetting remaining data

# seasons array = ["Spring", "Summer", "Fall", "Winter"]

# i = 1

# current_season = seasons_array[i]
# Loop through splits array
# for j in range(1, len(splits array) - 1):
    # subset = new_york[new_york.date_time > splits[j]]
    # subset = new_york[new_york.date_time < splits[j + 1]]
    # t = (current_season, subset.date_time[0].year, subset)
    # if i is less than 3:
        # i += 1
    # elif i == 3:
        # i = 0
    # current_season = season_array[i]

(1, 2, 3)
