## EDA - Task 3.2 (second try)

In [1]:
import pandas as pd
import numpy as np

In [2]:
import datetime as dt

In [3]:
import requests
import time

In [4]:
flights = pd.read_csv('data/flights_10000.csv', index_col=0)

## **Task 3**: Does the weather affect the delay? 
Use the API to pull the weather information for flights. There is no need to get weather for ALL flights. We can choose the right representative sample. Let's focus on four weather types:
- sunny
- cloudy
- rainy
- snow.
Test the hypothesis that these 4 delays are from the same distribution. If they are not, which ones are significantly different?

In [5]:
#select the relevant cols 
weather_df = flights[['fl_date', 'origin_city_name','dest_city_name']]
weather_df.head()

Unnamed: 0,fl_date,origin_city_name,dest_city_name
0,2018-07-10,"San Antonio, TX","Nashville, TN"
1,2019-10-10,"Chicago, IL","Orlando, FL"
2,2019-07-18,"Lawton/Fort Sill, OK","Dallas/Fort Worth, TX"
3,2018-11-25,"Savannah, GA","Chicago, IL"
4,2018-10-24,"Detroit, MI","Santa Ana, CA"


### Create date/city unique combos table

In [6]:
#let's create a list of all city/date combinations to see if we can be more efficient with our api pulls

city_date = flights[['fl_date', 'origin_city_name']].rename({'origin_city_name': 'city'}, axis = 1)    #start with origin cities
temp = flights[['fl_date', 'dest_city_name']].rename({'dest_city_name': 'city'}, axis = 1)             #dest cities

In [7]:
#append dest cities to origin cities
city_date = pd.concat([city_date, temp], ignore_index=True)

In [8]:
city_date.shape        #confirm the df doubles in size

(20000, 2)

In [9]:
#find unique city_date combos
city_date_unique = city_date.groupby(['fl_date','city']).size().reset_index().rename(columns={0:'count'}).sort_values(by = 'count', ascending = False)

In [43]:
#add empty weather col
city_date_unique['weather_categ'] = 'empty'

In [31]:
city_date_unique.reset_index(drop = True, inplace= True)

In [44]:
city_date_unique

Unnamed: 0,fl_date,city,count,weather_categ
0,2018-10-25,"New York, NY",7,empty
1,2019-07-10,"Chicago, IL",7,empty
2,2019-10-02,"Chicago, IL",7,empty
3,2019-07-01,"Chicago, IL",7,empty
4,2019-11-23,"Denver, CO",7,empty
...,...,...,...,...
15625,2018-10-05,"Baltimore, MD",1,empty
15626,2018-10-05,"Boston, MA",1,empty
15627,2018-10-05,"Charlotte, NC",1,empty
15628,2018-10-05,"Fort Lauderdale, FL",1,empty


In [12]:
city_date_unique.shape

(15630, 4)

### Import lat long table
table was cleaned in EDA_Task3 notebook

In [13]:
#api requires lat/long, found a csv of american cities and lat/longs
latlong_df = pd.read_csv('data/uscities_ll.csv')

In [14]:
latlong_df.shape

(30409, 6)

In [15]:
latlong_df

Unnamed: 0,city,state_id,lat,lng,timezone,city_state
0,New York,NY,40.6943,-73.9249,America/New_York,"New York, NY"
1,Los Angeles,CA,34.1141,-118.4068,America/Los_Angeles,"Los Angeles, CA"
2,Chicago,IL,41.8375,-87.6866,America/Chicago,"Chicago, IL"
3,Miami,FL,25.7840,-80.2101,America/New_York,"Miami, FL"
4,Dallas,TX,32.7935,-96.7667,America/Chicago,"Dallas, TX"
...,...,...,...,...,...,...
30404,Drummond,ID,43.9996,-111.3433,America/Boise,"Drummond, ID"
30405,Lost Springs,WY,42.7652,-104.9255,America/Denver,"Lost Springs, WY"
30406,Provo,SD,43.1937,-103.8329,America/Denver,"Provo, SD"
30407,Goldcreek,MT,46.5838,-112.9284,America/Denver,"Goldcreek, MT"


In [16]:
latlong_df['city_state'].nunique()

30351

In [17]:
latlong_df = latlong_df.drop_duplicates(subset='city_state', keep="first")

In [18]:
latlong_df

Unnamed: 0,city,state_id,lat,lng,timezone,city_state
0,New York,NY,40.6943,-73.9249,America/New_York,"New York, NY"
1,Los Angeles,CA,34.1141,-118.4068,America/Los_Angeles,"Los Angeles, CA"
2,Chicago,IL,41.8375,-87.6866,America/Chicago,"Chicago, IL"
3,Miami,FL,25.7840,-80.2101,America/New_York,"Miami, FL"
4,Dallas,TX,32.7935,-96.7667,America/Chicago,"Dallas, TX"
...,...,...,...,...,...,...
30404,Drummond,ID,43.9996,-111.3433,America/Boise,"Drummond, ID"
30405,Lost Springs,WY,42.7652,-104.9255,America/Denver,"Lost Springs, WY"
30406,Provo,SD,43.1937,-103.8329,America/Denver,"Provo, SD"
30407,Goldcreek,MT,46.5838,-112.9284,America/Denver,"Goldcreek, MT"


### API functions

In [19]:
#open weather api function
def open_weather_api(lat, long, date, timezone = "America%2FChicago"):
    """
    function returns individual api calls by location and date. 
    """
    #timezone = America%2FChicago #timezone format, opportunity to fine tune this - doesn't work for some reason, I think formatting
    
    url = f"https://archive-api.open-meteo.com/v1/era5?latitude={lat}&longitude={long}&start_date={date}&end_date={date}&hourly=precipitation,rain,snowfall,cloudcover&daily=precipitation_sum,rain_sum,snowfall_sum,precipitation_hours&timezone={timezone}"
    response = requests.get(url)
    return response.json()

In [20]:
test_api = open_weather_api("41.75","-87.75","2018-10-25", "America%2FNew_York")
test_api

{'latitude': 41.75,
 'longitude': -87.75,
 'generationtime_ms': 0.32901763916015625,
 'utc_offset_seconds': -14400,
 'timezone': 'America/New_York',
 'timezone_abbreviation': 'EDT',
 'elevation': 192.0,
 'hourly_units': {'time': 'iso8601',
  'precipitation': 'mm',
  'rain': 'mm',
  'snowfall': 'cm',
  'cloudcover': '%'},
 'hourly': {'time': ['2018-10-25T00:00',
   '2018-10-25T01:00',
   '2018-10-25T02:00',
   '2018-10-25T03:00',
   '2018-10-25T04:00',
   '2018-10-25T05:00',
   '2018-10-25T06:00',
   '2018-10-25T07:00',
   '2018-10-25T08:00',
   '2018-10-25T09:00',
   '2018-10-25T10:00',
   '2018-10-25T11:00',
   '2018-10-25T12:00',
   '2018-10-25T13:00',
   '2018-10-25T14:00',
   '2018-10-25T15:00',
   '2018-10-25T16:00',
   '2018-10-25T17:00',
   '2018-10-25T18:00',
   '2018-10-25T19:00',
   '2018-10-25T20:00',
   '2018-10-25T21:00',
   '2018-10-25T22:00',
   '2018-10-25T23:00'],
  'precipitation': [0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
 

In [22]:
def city_scrubber(city_str):
    """
    city name will be scrubbed to remove multiple names
    Dallas/Fort Something, TX -----> Dallas, TX
    """
    try:
        state = city_str.split(',')       #split city(s) and state
        cities = state[0].split('/')      #split the list of cities by /
        return cities[0] + ',' + state[1]
    except: 
        return "no state in your city string"

In [23]:
#test
city_scrubber("Dallas/Fort Something, TX")

'Dallas, TX'

In [24]:
city_scrubber("Austin, TX")

'Austin, TX'

In [25]:
city_scrubber("Austin")

'no state in your city string'

In [39]:
#function that returns the weather condition for a date/city
def weather_condition(city, date, lat_long_df):
    """
    returns the weather condition for a single date/city. 
    city format "city, ST"
    date format "2020-01-01"
    """
    #scrub city
    #print(city)
    city_scrub = city_scrubber(city)
    if city_scrub == 'no state in your city string':
        return city_scrub, city
    
    #pull the lat and long for city
    lat = None
    long = None
    try:
        lat = float(lat_long_df['lat'][lat_long_df['city_state'] == city_scrub])
        long = float(lat_long_df['lng'][lat_long_df['city_state'] == city_scrub])
    except:
        print(lat, long, city)
        pass
    
    #call the weather api
    if lat and long:         #none is False
        json_result = open_weather_api(lat, long, date)
        #print(json_result)
        try:
            total_rain = int(json_result['daily']['rain_sum'][0])                  # mm
        except: 
            total_rain = None
        try:
            total_snow = int(json_result['daily']['snowfall_sum'][0])              # cm        
        except:
            total_snow = None
        try:
            cloudcover_mean = round(sum(json_result['hourly']['cloudcover'])/len(json_result['hourly']['cloudcover']))     # %
        except:
            cloudcover_mean = None

        if total_rain is None and total_snow is None and cloudcover_mean is None:
            result = "no weather data"    
        elif total_rain == 0 and total_snow == 0 and cloudcover_mean < 40:
            result = "sunny"
        elif total_rain == 0 and total_snow == 0 and cloudcover_mean >= 40:
            result = "cloudy"
        elif total_rain > 0 and total_snow == 0:
            result = "rain"
        elif total_rain == 0 and total_snow > 0:
            result = "snow"
        elif total_rain > 0 and total_snow > 0:
            result = "snow & rain"
        else:
            result = f"Error, total rain: {total_rain}, total snow: {total_snow}, mean cloud: {cloudcover_mean}"

        return result  

In [27]:
latlong_df['lat'][latlong_df['city_state'] == "Denver, CO"]

18    39.762
Name: lat, dtype: float64

In [28]:
#test
weather_condition("Denver, CO", "2019-11-23", latlong_df)

'sunny'

In [45]:
for row in range(3):
    city_date_unique['weather_categ'][row] = weather_condition(city_date_unique['city'][row], city_date_unique['fl_date'][row], latlong_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  city_date_unique['weather_categ'][row] = weather_condition(city_date_unique['city'][row], city_date_unique['fl_date'][row], latlong_df)


In [46]:
city_date_unique['weather_categ']

0        sunny
1         rain
2         rain
3        empty
4        empty
         ...  
15625    empty
15626    empty
15627    empty
15628    empty
15629    empty
Name: weather_categ, Length: 15630, dtype: object

In [88]:
city_date_unique['weather_categ'] = city_date_unique.apply(lambda x: weather_condition(x['city'], x['fl_date'], latlong_df), axis = 1)

None None Kona, HI
None None Christiansted, VI
None None Kona, HI
None None Lanai, HI
None None Christiansted, VI
None None Lanai, HI
None None Hilton Head, SC
None None Kona, HI
None None Deadhorse, AK
None None Newburgh/Poughkeepsie, NY
None None Charlotte Amalie, VI
None None Kona, HI
None None Kona, HI
None None Charlotte Amalie, VI
None None Islip, NY
None None Kona, HI
None None Islip, NY
None None Kona, HI
None None Kona, HI
None None Kona, HI
None None Guam, TT
None None Saipan, TT
None None Kona, HI
None None Newburgh/Poughkeepsie, NY
None None Lanai, HI
None None Kona, HI
None None Islip, NY
None None Kona, HI
None None Nantucket, MA
None None Kona, HI
None None Islip, NY
None None Guam, TT
None None Saipan, TT
None None Charlotte Amalie, VI
None None Kona, HI
None None Nantucket, MA
None None Martha's Vineyard, MA
None None Charlotte Amalie, VI
None None Martha's Vineyard, MA
None None Barrow, AK
None None Deadhorse, AK
None None Barrow, AK
None None Kona, HI
None None Kona,

ConnectionError: HTTPSConnectionPool(host='archive-api.open-meteo.com', port=443): Max retries exceeded with url: /v1/era5?latitude=34.0378&longitude=-80.9036&start_date=2018-04-13&end_date=2018-04-13&hourly=precipitation,rain,snowfall,cloudcover&daily=precipitation_sum,rain_sum,snowfall_sum,precipitation_hours&timezone=America%2FChicago (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001FBA93FC760>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))

In [55]:
import time

In [57]:
start = time.time()
weather_list = [weather_condition(city_date_unique['city'][x], city_date_unique['fl_date'][x], latlong_df) for x in range(100)]
end = time.time()
print(end - start)

33.94999289512634


In [58]:
weather_list

['sunny',
 'rain',
 'rain',
 'cloudy',
 'sunny',
 'rain',
 'rain',
 'sunny',
 'rain',
 'cloudy',
 'snow & rain',
 'cloudy',
 'sunny',
 'rain',
 'sunny',
 'cloudy',
 'cloudy',
 'sunny',
 'rain',
 'cloudy',
 'snow',
 'sunny',
 'sunny',
 'rain',
 'cloudy',
 'sunny',
 'rain',
 'rain',
 'sunny',
 'rain',
 'rain',
 'rain',
 'cloudy',
 'sunny',
 'cloudy',
 'cloudy',
 'rain',
 'cloudy',
 'cloudy',
 'rain',
 'sunny',
 'cloudy',
 'rain',
 'sunny',
 'cloudy',
 'cloudy',
 'rain',
 'cloudy',
 'cloudy',
 'sunny',
 'sunny',
 'rain',
 'rain',
 'cloudy',
 'cloudy',
 'rain',
 'sunny',
 'cloudy',
 'sunny',
 'sunny',
 'sunny',
 'cloudy',
 'sunny',
 'sunny',
 'cloudy',
 'rain',
 'sunny',
 'rain',
 'cloudy',
 'sunny',
 'rain',
 'sunny',
 'cloudy',
 'cloudy',
 'sunny',
 'sunny',
 'cloudy',
 'sunny',
 'sunny',
 'rain',
 'sunny',
 'cloudy',
 'sunny',
 'sunny',
 'sunny',
 'sunny',
 'cloudy',
 'sunny',
 'rain',
 'rain',
 'cloudy',
 'cloudy',
 'cloudy',
 'rain',
 'sunny',
 'sunny',
 'cloudy',
 'cloudy',
 'snow',


In [34]:
city_date_unique.shape

(15630, 4)

In [None]:
#do not use this:

In [29]:
city_date_unique['weather_categ'][0] = city_date_unique.apply(lambda x: weather_condition(x['city'], x['fl_date'], latlong_df), axis = 1)

KeyboardInterrupt: 

### now that we have the weather for each date/city combo, match that weather for each flight

In [None]:
#merge for origin city
weather_master = pd.merge(weather_df, city_date_unique, how = 'left', left_on = ['fl_date', 'origin_city_name'], right_on = ['date', 'city_state'])

In [None]:
#rename col weather_origin

In [None]:
weather_master = pd.merge(weather_master, city_date_unique, how = 'left', left_on = ['fl_date', 'dest_city_name'], right_on = ['date', 'city_state'])

In [None]:
#remane that col weather_dest

In [69]:
#load flights data
flights = pd.read_csv('data/flights_10000.csv', index_col=0)

In [70]:
flights.head(2)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-07-10,WN,WN,WN,2212,WN,N958WN,2212,14683,SAT,...,822.0,,,,,,,,,
1,2019-10-10,UA,UA,UA,206,UA,N68811,206,13930,ORD,...,1005.0,,,,,,,,,


In [64]:
#load unique table with the weather data
city_unique = pd.read_csv('data/final_city_unique.csv', index_col=0)

In [65]:
city_unique.head(2)

Unnamed: 0,fl_date,city,count,weather_categ
0,2018-10-25,"New York, NY",7,sunny
1,2019-07-10,"Chicago, IL",7,rain


In [71]:
#we don't want to add the count, only the weather
city_unique.drop('count', axis = 1, inplace = True)

In [72]:
#merge for origin city
flights = pd.merge(flights, city_unique, how = 'left', left_on = ['fl_date', 'origin_city_name'], right_on = ['fl_date', 'city'])

In [74]:
flights.head(2)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,city,weather_categ
0,2018-07-10,WN,WN,WN,2212,WN,N958WN,2212,14683,SAT,...,,,,,,,,,"San Antonio, TX",cloudy
1,2019-10-10,UA,UA,UA,206,UA,N68811,206,13930,ORD,...,,,,,,,,,"Chicago, IL",rain


In [75]:
#rename col weather_origin
flights.rename({'weather_categ': 'orig_weather_categ'}, axis = 1, inplace= True)
#drop the city col which also merged
flights.drop('city', axis = 1, inplace = True)

In [76]:
flights.head(2)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,orig_weather_categ
0,2018-07-10,WN,WN,WN,2212,WN,N958WN,2212,14683,SAT,...,,,,,,,,,,cloudy
1,2019-10-10,UA,UA,UA,206,UA,N68811,206,13930,ORD,...,,,,,,,,,,rain


In [77]:
#add the dest weather
flights = pd.merge(flights, city_unique, how = 'left', left_on = ['fl_date', 'dest_city_name'], right_on = ['fl_date', 'city'])

In [78]:
flights.head(2)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,orig_weather_categ,city,weather_categ
0,2018-07-10,WN,WN,WN,2212,WN,N958WN,2212,14683,SAT,...,,,,,,,,cloudy,"Nashville, TN",cloudy
1,2019-10-10,UA,UA,UA,206,UA,N68811,206,13930,ORD,...,,,,,,,,rain,"Orlando, FL",rain


In [79]:
#rename col weather dest
flights.rename({'weather_categ': 'dest_weather_categ'}, axis = 1, inplace= True)
#drop the city col which also merged
flights.drop('city', axis = 1, inplace = True)

In [80]:
flights.head(2)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,orig_weather_categ,dest_weather_categ
0,2018-07-10,WN,WN,WN,2212,WN,N958WN,2212,14683,SAT,...,,,,,,,,,cloudy,cloudy
1,2019-10-10,UA,UA,UA,206,UA,N68811,206,13930,ORD,...,,,,,,,,,rain,rain


In [88]:
#save over the flights_10000 csv
flights.to_csv('data/flights_10000.csv', header=True, index=False)

## Now let's compare the weather to the delay

In [83]:
weather_delay_df = flights[['origin_city_name','dest_city_name', 'orig_weather_categ','dest_weather_categ', 'arr_delay']]

In [84]:
weather_delay_df.head()

Unnamed: 0,origin_city_name,dest_city_name,orig_weather_categ,dest_weather_categ,arr_delay
0,"San Antonio, TX","Nashville, TN",cloudy,cloudy,-2.0
1,"Chicago, IL","Orlando, FL",rain,rain,-9.0
2,"Lawton/Fort Sill, OK","Dallas/Fort Worth, TX",sunny,sunny,8.0
3,"Savannah, GA","Chicago, IL",cloudy,snow & rain,
4,"Detroit, MI","Santa Ana, CA",sunny,sunny,28.0


In [85]:
#fill nans
weather_delay_df['arr_delay'] = weather_delay_df['arr_delay'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_delay_df['arr_delay'] = weather_delay_df['arr_delay'].fillna(0)


In [86]:
weather_delay_df.head()

Unnamed: 0,origin_city_name,dest_city_name,orig_weather_categ,dest_weather_categ,arr_delay
0,"San Antonio, TX","Nashville, TN",cloudy,cloudy,-2.0
1,"Chicago, IL","Orlando, FL",rain,rain,-9.0
2,"Lawton/Fort Sill, OK","Dallas/Fort Worth, TX",sunny,sunny,8.0
3,"Savannah, GA","Chicago, IL",cloudy,snow & rain,0.0
4,"Detroit, MI","Santa Ana, CA",sunny,sunny,28.0


In [87]:
import seaborn as sns
import matplotlib.pyplot as plt