## EDA - Task 3

In [112]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os

In [113]:
import datetime as dt

In [114]:
flights = pd.read_csv('data/flights_10000.csv', index_col=0)
flights.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-07-10,WN,WN,WN,2212,WN,N958WN,2212,14683,SAT,...,822.0,,,,,,,,,
1,2019-10-10,UA,UA,UA,206,UA,N68811,206,13930,ORD,...,1005.0,,,,,,,,,
2,2019-07-18,AA,AA_CODESHARE,AA,4221,MQ,N694AE,4221,12891,LAW,...,140.0,,,,,,,,,
3,2018-11-25,UA,UA_CODESHARE,UA,4822,ZW,,4822,14685,SAV,...,773.0,,,,,,,,,
4,2018-10-24,DL,DL,DL,957,DL,N693DL,957,11433,DTW,...,1960.0,0.0,0.0,28.0,0.0,0.0,,,,


In [115]:
flights.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime', 'no_name'],
      dtype='object')

## **Task 3**: Does the weather affect the delay? 
Use the API to pull the weather information for flights. There is no need to get weather for ALL flights. We can choose the right representative sample. Let's focus on four weather types:
- sunny
- cloudy
- rainy
- snow.
Test the hypothesis that these 4 delays are from the same distribution. If they are not, which ones are significantly different?

In [116]:
#select the relevant cols 
weather_df = flights[['fl_date', 'origin_city_name','crs_dep_time', 'dest_city_name', 'crs_arr_time']].head(10)
weather_df

Unnamed: 0,fl_date,origin_city_name,crs_dep_time,dest_city_name,crs_arr_time
0,2018-07-10,"San Antonio, TX",705,"Nashville, TN",915
1,2019-10-10,"Chicago, IL",800,"Orlando, FL",1148
2,2019-07-18,"Lawton/Fort Sill, OK",1034,"Dallas/Fort Worth, TX",1130
3,2018-11-25,"Savannah, GA",2200,"Chicago, IL",2327
4,2018-10-24,"Detroit, MI",841,"Santa Ana, CA",1039
5,2019-06-08,"Richmond, VA",1149,"Dallas/Fort Worth, TX",1400
6,2018-01-29,"Charlotte, NC",1436,"St. Louis, MO",1534
7,2019-08-14,"Allentown/Bethlehem/Easton, PA",1844,"Sanford, FL",2105
8,2019-01-28,"Tucson, AZ",735,"Dallas/Fort Worth, TX",1050
9,2018-03-05,"Charlotte, NC",1759,"Raleigh/Durham, NC",1854


In [117]:
import requests
import time

In [118]:
#let's create a list of all city/date combinations to see if we can be more efficient with our api pulls
city_date = flights[['fl_date', 'origin_city_name']].rename({'origin_city_name': 'city'}, axis = 1)    #start with origin cities
temp = flights[['fl_date', 'dest_city_name']].rename({'dest_city_name': 'city'}, axis = 1)             #dest cities

In [119]:
#append dest cities to origin cities
city_date = pd.concat([city_date, temp], ignore_index=True)

In [120]:
city_date.shape        #confirm the df doubles in size

(20000, 2)

In [231]:
#find unique city_date combos
city_date.groupby(['fl_date','city']).size().reset_index().rename(columns={0:'count'}).sort_values(by = 'count', ascending = False)

Unnamed: 0,fl_date,city,count
6240,2018-10-25,"New York, NY",7
11766,2019-07-10,"Chicago, IL",7
13671,2019-10-02,"Chicago, IL",7
11589,2019-07-01,"Chicago, IL",7
14817,2019-11-23,"Denver, CO",7
...,...,...,...
5819,2018-10-05,"Baltimore, MD",1
5820,2018-10-05,"Boston, MA",1
5821,2018-10-05,"Charlotte, NC",1
5826,2018-10-05,"Fort Lauderdale, FL",1


unique combos only reduce the total pulls by 4370. In the grand scheme of things this is not enough of a reducion to go this route.

In [3]:
#open weather api function
def open_weather_api(lat, long, date, timezone = "America%2FChicago"):
    """
    function returns individual api calls by location and date. 
    """
    #timezone = America%2FChicago #timezone format, opportunity to fine tune this - doesn't work for some reason, I think formatting
    
    url = f"https://archive-api.open-meteo.com/v1/era5?latitude={lat}&longitude={long}&start_date={date}&end_date={date}&hourly=precipitation,rain,snowfall,cloudcover&daily=precipitation_sum,rain_sum,snowfall_sum,precipitation_hours&timezone={timezone}"
    response = requests.get(url)
    return response.json()

In [321]:
test_api = open_weather_api("41.75","-87.75","2018-10-25", "America%2FNew_York")

In [322]:
test_api

{'latitude': 41.75,
 'longitude': -87.75,
 'generationtime_ms': 0.3420114517211914,
 'utc_offset_seconds': -14400,
 'timezone': 'America/New_York',
 'timezone_abbreviation': 'EDT',
 'elevation': 192.0,
 'hourly_units': {'time': 'iso8601',
  'precipitation': 'mm',
  'rain': 'mm',
  'snowfall': 'cm',
  'cloudcover': '%'},
 'hourly': {'time': ['2018-10-25T00:00',
   '2018-10-25T01:00',
   '2018-10-25T02:00',
   '2018-10-25T03:00',
   '2018-10-25T04:00',
   '2018-10-25T05:00',
   '2018-10-25T06:00',
   '2018-10-25T07:00',
   '2018-10-25T08:00',
   '2018-10-25T09:00',
   '2018-10-25T10:00',
   '2018-10-25T11:00',
   '2018-10-25T12:00',
   '2018-10-25T13:00',
   '2018-10-25T14:00',
   '2018-10-25T15:00',
   '2018-10-25T16:00',
   '2018-10-25T17:00',
   '2018-10-25T18:00',
   '2018-10-25T19:00',
   '2018-10-25T20:00',
   '2018-10-25T21:00',
   '2018-10-25T22:00',
   '2018-10-25T23:00'],
  'precipitation': [0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
  

In [157]:
#api requires lat/long, found a csv of american cities and lat/longs
lat_long_df = pd.read_csv('data/uscities.csv')
lat_long_df = lat_long_df[['city', 'state_id', 'lat', 'lng', 'timezone']]

  lat_long_df = pd.read_csv('data/uscities.csv')


In [158]:
lat_long_df.shape

(60818, 5)

In [159]:
lat_long_df

Unnamed: 0,city,state_id,lat,lng,timezone
0,New York,NY,40.6943,-73.9249,America/New_York
1,Los Angeles,CA,34.1141,-118.4068,America/Los_Angeles
2,Chicago,IL,41.8375,-87.6866,America/Chicago
3,Miami,FL,25.7840,-80.2101,America/New_York
4,Dallas,TX,32.7935,-96.7667,America/Chicago
...,...,...,...,...,...
60813,30404,ID,"Drummond, ID",,
60814,30405,WY,"Lost Springs, WY",,
60815,30406,SD,"Provo, SD",,
60816,30407,MT,"Goldcreek, MT",,


In [160]:
# create a column, city_state, to match our main city column

KeyError: 'city_state'

In [175]:
def city_state_creator(row):
    return str(row['city']) + ", " + row['state_id']

In [177]:
#adding city state to the whole data frame
lat_long_df['city_state'] = lat_long_df.apply(city_state_creator, axis = 1)           #pass in func object, not calling the func

In [178]:
lat_long_df

Unnamed: 0,city,state_id,lat,lng,timezone,city_state
0,New York,NY,40.6943,-73.9249,America/New_York,"New York, NY"
1,Los Angeles,CA,34.1141,-118.4068,America/Los_Angeles,"Los Angeles, CA"
2,Chicago,IL,41.8375,-87.6866,America/Chicago,"Chicago, IL"
3,Miami,FL,25.7840,-80.2101,America/New_York,"Miami, FL"
4,Dallas,TX,32.7935,-96.7667,America/Chicago,"Dallas, TX"
...,...,...,...,...,...,...
60813,30404,ID,"Drummond, ID",,,"30404, ID"
60814,30405,WY,"Lost Springs, WY",,,"30405, WY"
60815,30406,SD,"Provo, SD",,,"30406, SD"
60816,30407,MT,"Goldcreek, MT",,,"30407, MT"


dump the last rows of the df that do not match the format:

In [179]:
lat_long_df.drop(lat_long_df.tail(30409).index,inplace=True)

In [180]:
lat_long_df

Unnamed: 0,city,state_id,lat,lng,timezone,city_state
0,New York,NY,40.6943,-73.9249,America/New_York,"New York, NY"
1,Los Angeles,CA,34.1141,-118.4068,America/Los_Angeles,"Los Angeles, CA"
2,Chicago,IL,41.8375,-87.6866,America/Chicago,"Chicago, IL"
3,Miami,FL,25.7840,-80.2101,America/New_York,"Miami, FL"
4,Dallas,TX,32.7935,-96.7667,America/Chicago,"Dallas, TX"
...,...,...,...,...,...,...
30404,Drummond,ID,43.9996,-111.3433,America/Boise,"Drummond, ID"
30405,Lost Springs,WY,42.7652,-104.9255,America/Denver,"Lost Springs, WY"
30406,Provo,SD,43.1937,-103.8329,America/Denver,"Provo, SD"
30407,Goldcreek,MT,46.5838,-112.9284,America/Denver,"Goldcreek, MT"


In [None]:
#make the city_state the index

In [198]:
lat_long_df.set_index('city_state', inplace = True)

In [199]:
lat_long_df

Unnamed: 0_level_0,city,state_id,lat,lng,timezone
city_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"New York, NY",New York,NY,40.6943,-73.9249,America/New_York
"Los Angeles, CA",Los Angeles,CA,34.1141,-118.4068,America/Los_Angeles
"Chicago, IL",Chicago,IL,41.8375,-87.6866,America/Chicago
"Miami, FL",Miami,FL,25.7840,-80.2101,America/New_York
"Dallas, TX",Dallas,TX,32.7935,-96.7667,America/Chicago
...,...,...,...,...,...
"Drummond, ID",Drummond,ID,43.9996,-111.3433,America/Boise
"Lost Springs, WY",Lost Springs,WY,42.7652,-104.9255,America/Denver
"Provo, SD",Provo,SD,43.1937,-103.8329,America/Denver
"Goldcreek, MT",Goldcreek,MT,46.5838,-112.9284,America/Denver


In [195]:
#undo that
lat_long_df.reset_index(drop = True)

Unnamed: 0,city,state_id,lat,lng,timezone,city_state
0,New York,NY,40.6943,-73.9249,America/New_York,"New York, NY"
1,Los Angeles,CA,34.1141,-118.4068,America/Los_Angeles,"Los Angeles, CA"
2,Chicago,IL,41.8375,-87.6866,America/Chicago,"Chicago, IL"
3,Miami,FL,25.7840,-80.2101,America/New_York,"Miami, FL"
4,Dallas,TX,32.7935,-96.7667,America/Chicago,"Dallas, TX"
...,...,...,...,...,...,...
30404,Drummond,ID,43.9996,-111.3433,America/Boise,"Drummond, ID"
30405,Lost Springs,WY,42.7652,-104.9255,America/Denver,"Lost Springs, WY"
30406,Provo,SD,43.1937,-103.8329,America/Denver,"Provo, SD"
30407,Goldcreek,MT,46.5838,-112.9284,America/Denver,"Goldcreek, MT"


In [103]:
#save this modification to the csv
lat_long_df.to_csv('data/uscities_ll.csv', header=True, index=False)

In [318]:
#attempt to fix the timezone formatting
lat_long_df['timezone'] = lat_long_df['timezone'].str.replace("/", "%2F")

In [319]:
lat_long_df

Unnamed: 0_level_0,city,state_id,lat,lng,timezone
city_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"New York, NY",New York,NY,40.6943,-73.9249,America%2FNew_York
"Los Angeles, CA",Los Angeles,CA,34.1141,-118.4068,America%2FLos_Angeles
"Chicago, IL",Chicago,IL,41.8375,-87.6866,America%2FChicago
"Miami, FL",Miami,FL,25.7840,-80.2101,America%2FNew_York
"Dallas, TX",Dallas,TX,32.7935,-96.7667,America%2FChicago
...,...,...,...,...,...
"Drummond, ID",Drummond,ID,43.9996,-111.3433,America%2FBoise
"Lost Springs, WY",Lost Springs,WY,42.7652,-104.9255,America%2FDenver
"Provo, SD",Provo,SD,43.1937,-103.8329,America%2FDenver
"Goldcreek, MT",Goldcreek,MT,46.5838,-112.9284,America%2FDenver


In [126]:
#create empty cols in weather df to be filled with the column filler function
weather_df['weather_origin'] = 0
weather_df['weather_dest'] = 0
weather_df.head()

Unnamed: 0,fl_date,origin_city_name,crs_dep_time,dest_city_name,crs_arr_time,weather_origin,weather_dest
0,2018-07-10,"San Antonio, TX",705,"Nashville, TN",915,0,0
1,2019-10-10,"Chicago, IL",800,"Orlando, FL",1148,0,0
2,2019-07-18,"Lawton/Fort Sill, OK",1034,"Dallas/Fort Worth, TX",1130,0,0
3,2018-11-25,"Savannah, GA",2200,"Chicago, IL",2327,0,0
4,2018-10-24,"Detroit, MI",841,"Santa Ana, CA",1039,0,0


In [None]:
#get rid of the second city name i.e. Lawton/Fort Sill, OK -------> Lawton, OK

In [378]:
#cleaning the city names
state = weather_df['origin_city_name'].str.split(',').apply(lambda x: x[1])       #split city(s) and state
temp = weather_df['origin_city_name'].str.split(',').apply(lambda x: x[0])        #split the multiple states into a list
city_task3 = temp.str.split('/').apply(lambda x: x[0])                            #[0] will be the first city in your city list (temp)

result3 = city_task3 + ',' + state

In [379]:
result3

0    San Antonio, TX
1        Chicago, IL
2         Lawton, OK
3       Savannah, GA
4        Detroit, MI
5       Richmond, VA
6      Charlotte, NC
7      Allentown, PA
8         Tucson, AZ
9      Charlotte, NC
Name: origin_city_name, dtype: object

In [110]:
weather_df['origin_city_name'] = result3

NameError: name 'result3' is not defined

In [None]:
#replace the above with a func

In [127]:
def city_scrubber(df, target_column):
    """
    update target_column in dataframe df to remove multiple city names with alternate names within string.
    doesn't return anything, but your df will be scrubbed
    """
    state = df[target_column].str.split(',').apply(lambda x: x[1])       #split city(s) and state
    temp = df[target_column].str.split(',').apply(lambda x: x[0])        #split the multiple states into a list
    city = temp.str.split('/').apply(lambda x: x[0])                            #[0] will be the first city in your city list (temp)
    
    result = city + ',' + state
    df[target_column] = result

In [138]:
city_scrubber(weather_df, 'dest_city_name')
city_scrubber(weather_df, 'origin_city_name')

In [139]:
weather_df

Unnamed: 0,fl_date,origin_city_name,crs_dep_time,dest_city_name,crs_arr_time,weather_origin,weather_dest
0,2018-07-10,"San Antonio, TX",705,"Nashville, TN",915,0,0
1,2019-10-10,"Chicago, IL",800,"Orlando, FL",1148,0,0
2,2019-07-18,"Lawton, OK",1034,"Dallas, TX",1130,0,0
3,2018-11-25,"Savannah, GA",2200,"Chicago, IL",2327,0,0
4,2018-10-24,"Detroit, MI",841,"Santa Ana, CA",1039,0,0
5,2019-06-08,"Richmond, VA",1149,"Dallas, TX",1400,0,0
6,2018-01-29,"Charlotte, NC",1436,"St. Louis, MO",1534,0,0
7,2019-08-14,"Allentown, PA",1844,"Sanford, FL",2105,0,0
8,2019-01-28,"Tucson, AZ",735,"Dallas, TX",1050,0,0
9,2018-03-05,"Charlotte, NC",1759,"Raleigh, NC",1854,0,0


In [218]:
def weather_cols_openweather(df, lat_long_df = lat_long_df):
    """
    function to add the weather conditions for both origin and destination locations on a particular date to the input df
    Params: 
    df = weather_df
    lat_long_df
    """
    
    locations = ["origin", "dest"]
    
    #try df.apply(), func that takes in orig_city and outputs the weather data for that city only
    #df['weather'] = df[origin].apply(function)
    
    for index in df.index:
        
        for loc in locations:
            city = df[f'{loc}_city_name'][index]
            date = df['fl_date'][index]
            
            #print(lat_long_df['lat'][lat_long_df.index.str.match(city)])
                                     
            lat = float(lat_long_df['lat'][lat_long_df.index.str.match(city)])
            long = float(lat_long_df['lng'][lat_long_df.index.str.match(city)])
            #timezone = lat_long_df['timezone'][lat_long_df.index.str.match(city)]

            json_result = open_weather_api(lat, long, date)      #remove timezone, not working 
            #print(json_result)
            
            try:
                total_rain = int(json_result['daily']['rain_sum'][0])                  # mm
                total_snow = int(json_result['daily']['snowfall_sum'][0])              # cm              
                cloudcover_mean = round(sum(json_result['hourly']['cloudcover'])/len(json_result['hourly']['cloudcover']))     # %

                if total_rain == 0 & total_snow == 0 & cloudcover_mean < 40:
                    df[f'weather_{loc}'][index] = "sunny"
                elif total_rain == 0 & total_snow == 0 & cloudcover_mean >= 40:
                    df[f'weather_{loc}'][index] = "cloudy"
                elif total_rain > 0 & total_snow == 0:
                    df[f'weather_{loc}'][index] = "rain"
                elif total_rain == 0 & total_snow > 0:
                    df[f'weather_{loc}'][index] = "snow"
                elif total_rain > 0 & total_snow > 0:
                    df[f'weather_{loc}'][index] = "snow & rain"
                else:
                    df[f'weather_{loc}'][index] = "error"
            except:
                df[f'weather_{loc}'][index] = "error"

In [None]:
lat_long_df['lat'][lat_long_df['city'] == city]

In [200]:
weather_cols_openweather(weather_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'weather_{loc}'][index] = "sunny"


In [201]:
weather_df

Unnamed: 0,fl_date,origin_city_name,crs_dep_time,dest_city_name,crs_arr_time,weather_origin,weather_dest
0,2018-07-10,"San Antonio, TX",705,"Nashville, TN",915,sunny,sunny
1,2019-10-10,"Chicago, IL",800,"Orlando, FL",1148,rain,rain
2,2019-07-18,"Lawton, OK",1034,"Dallas, TX",1130,sunny,sunny
3,2018-11-25,"Savannah, GA",2200,"Chicago, IL",2327,sunny,rain
4,2018-10-24,"Detroit, MI",841,"Santa Ana, CA",1039,sunny,sunny
5,2019-06-08,"Richmond, VA",1149,"Dallas, TX",1400,rain,sunny
6,2018-01-29,"Charlotte, NC",1436,"St. Louis, MO",1534,rain,sunny
7,2019-08-14,"Allentown, PA",1844,"Sanford, FL",2105,rain,rain
8,2019-01-28,"Tucson, AZ",735,"Dallas, TX",1050,sunny,sunny
9,2018-03-05,"Charlotte, NC",1759,"Raleigh, NC",1854,sunny,sunny


### Let's try it out on the flights_test

In [207]:
#load test data (these are the cols we will have at the test phase)
fl_test = pd.read_csv('data/flights_test_10.csv')
fl_test.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,1577836800000,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363
1,1577836800000,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363
2,1577836800000,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",2020,2130,N,70,1,333
3,1577836800000,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",1340,1455,N,75,1,333
4,1577836800000,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",915,1035,N,80,1,333


In [208]:
pd.Timestamp.fromtimestamp(1577836800000/1000)

Timestamp('2020-01-01 00:00:00')

In [209]:
fl_test['fl_date'] = fl_test['fl_date']/1000
fl_test['fl_date'] = fl_test['fl_date'].apply(pd.Timestamp.fromtimestamp)

In [210]:
fl_test

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363
1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363
2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",2020,2130,N,70,1,333
3,2020-01-01,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",1340,1455,N,75,1,333
4,2020-01-01,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",915,1035,N,80,1,333
5,2020-01-01,WN,WN,WN,5684,WN,N7856A,5684,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",600,715,N,75,1,333
6,2020-01-01,WN,WN,WN,6152,WN,N7735A,6152,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",1620,1740,N,80,1,333
7,2020-01-01,WN,WN,WN,1679,WN,N405WN,1679,13891,ONT,"Ontario, CA",14893,SMF,"Sacramento, CA",1505,1630,N,85,1,390
8,2020-01-01,WN,WN,WN,3479,WN,N489WN,3479,13891,ONT,"Ontario, CA",14893,SMF,"Sacramento, CA",1230,1355,N,85,1,390
9,2020-01-01,WN,WN,WN,4069,WN,N7708E,4069,13891,ONT,"Ontario, CA",14893,SMF,"Sacramento, CA",740,900,N,80,1,390


In [211]:
fl_test.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time',
       'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance'],
      dtype='object')

Let's create a function that adds the weather info to any input df

In [202]:
def add_weather_condition(input_df):
    """
    function that cleans your input df, matches the cities to the lat long, calls the weather api, and appends weather conditions to your input df
    Params:
    input_df
    """
    #check that input df has the cols we need
    cols = set(['fl_date', 'origin_city_name','dest_city_name'])
    if not cols.issubset(set(input_df.columns)):
        return f"your input df must include the following columns: {cols}"
    
    #need to load lat_long for the api call
    latlong = pd.read_csv('data/uscities_ll.csv')
    latlong.set_index('city_state', inplace = True)
    
    #create empty cols in input df. Will be filled with the column filler function
    input_df['weather_origin'] = 0
    input_df['weather_dest'] = 0
    
    #clean the city names
    city_scrubber(input_df, 'origin_city_name')
    city_scrubber(input_df, 'dest_city_name')
    
    #call the api and fill the weather cols
    weather_cols_openweather(input_df, lat_long_df = latlong)
        #open_weather_api(lat, long, date, timezone = "America%2FChicago") is called within this func
        
    return input_df

In [133]:
input_df_test = flights[['fl_date', 'origin_city_name','dest_city_name']][10:20]
input_df_test

Unnamed: 0,fl_date,origin_city_name,dest_city_name
10,2019-03-19,"New Orleans, LA","Houston, TX"
11,2019-02-24,"Jackson/Vicksburg, MS","Atlanta, GA"
12,2018-02-20,"Boston, MA","New York, NY"
13,2019-03-13,"Dayton, OH","Philadelphia, PA"
14,2019-03-23,"New Orleans, LA","Newark, NJ"
15,2018-12-07,"Philadelphia, PA","Indianapolis, IN"
16,2018-07-09,"Tampa, FL","Atlanta, GA"
17,2018-01-06,"Springfield, MO","Chicago, IL"
18,2018-02-16,"Raleigh/Durham, NC","Newark, NJ"
19,2019-05-09,"Lansing, MI","Minneapolis, MN"


In [134]:
latlong = pd.read_csv('data/uscities_ll.csv')

In [135]:
latlong.dtypes

city           object
state_id       object
lat           float64
lng           float64
timezone       object
city_state     object
dtype: object

In [136]:
latlong

Unnamed: 0,city,state_id,lat,lng,timezone,city_state
0,New York,NY,40.6943,-73.9249,America/New_York,"New York, NY"
1,Los Angeles,CA,34.1141,-118.4068,America/Los_Angeles,"Los Angeles, CA"
2,Chicago,IL,41.8375,-87.6866,America/Chicago,"Chicago, IL"
3,Miami,FL,25.7840,-80.2101,America/New_York,"Miami, FL"
4,Dallas,TX,32.7935,-96.7667,America/Chicago,"Dallas, TX"
...,...,...,...,...,...,...
30404,Drummond,ID,43.9996,-111.3433,America/Boise,"Drummond, ID"
30405,Lost Springs,WY,42.7652,-104.9255,America/Denver,"Lost Springs, WY"
30406,Provo,SD,43.1937,-103.8329,America/Denver,"Provo, SD"
30407,Goldcreek,MT,46.5838,-112.9284,America/Denver,"Goldcreek, MT"


In [None]:
add_weather_condition(input_df_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'weather_{loc}'][index] = "sunny"


Unnamed: 0,fl_date,origin_city_name,dest_city_name,weather_origin,weather_dest
10,2019-03-19,"New Orleans, LA","Houston, TX",sunny,sunny
11,2019-02-24,"Jackson, MS","Atlanta, GA",sunny,rain
12,2018-02-20,"Boston, MA","New York, NY",sunny,sunny
13,2019-03-13,"Dayton, OH","Philadelphia, PA",sunny,sunny
14,2019-03-23,"New Orleans, LA","Newark, NJ",sunny,sunny
15,2018-12-07,"Philadelphia, PA","Indianapolis, IN",sunny,sunny
16,2018-07-09,"Tampa, FL","Atlanta, GA",rain,sunny
17,2018-01-06,"Springfield, MO","Chicago, IL",sunny,sunny
18,2018-02-16,"Raleigh, NC","Newark, NJ",rain,rain
19,2019-05-09,"Lansing, MI","Minneapolis, MN",rain,rain


In [214]:
fl_test = fl_test[['fl_date', 'origin_city_name', 'dest_city_name', ]]

In [219]:
fl_test_weather = add_weather_condition(fl_test)
fl_test_weather

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df['weather_origin'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df['weather_dest'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_column] = result
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://panda

Unnamed: 0,fl_date,origin_city_name,dest_city_name,weather_origin,weather_dest
0,2020-01-01,"Ontario, CA","San Francisco, CA",error,error
1,2020-01-01,"Ontario, CA","San Francisco, CA",error,error
2,2020-01-01,"Ontario, CA","San Jose, CA",error,error
3,2020-01-01,"Ontario, CA","San Jose, CA",error,error
4,2020-01-01,"Ontario, CA","San Jose, CA",error,error
5,2020-01-01,"Ontario, CA","San Jose, CA",error,error
6,2020-01-01,"Ontario, CA","San Jose, CA",error,error
7,2020-01-01,"Ontario, CA","Sacramento, CA",error,error
8,2020-01-01,"Ontario, CA","Sacramento, CA",error,error
9,2020-01-01,"Ontario, CA","Sacramento, CA",error,error


In [220]:
open_weather_api(34.0393, -117.6064, "2020-01-01")

{'latitude': 33.75,
 'longitude': -117.75,
 'generationtime_ms': 68.42410564422607,
 'utc_offset_seconds': -18000,
 'timezone': 'America/Chicago',
 'timezone_abbreviation': 'CDT',
 'elevation': 224.0,
 'hourly_units': {'time': 'iso8601',
  'precipitation': 'mm',
  'rain': 'mm',
  'snowfall': 'cm',
  'cloudcover': '%'},
 'hourly': {'time': ['2020-01-01T00:00',
   '2020-01-01T01:00',
   '2020-01-01T02:00',
   '2020-01-01T03:00',
   '2020-01-01T04:00',
   '2020-01-01T05:00',
   '2020-01-01T06:00',
   '2020-01-01T07:00',
   '2020-01-01T08:00',
   '2020-01-01T09:00',
   '2020-01-01T10:00',
   '2020-01-01T11:00',
   '2020-01-01T12:00',
   '2020-01-01T13:00',
   '2020-01-01T14:00',
   '2020-01-01T15:00',
   '2020-01-01T16:00',
   '2020-01-01T17:00',
   '2020-01-01T18:00',
   '2020-01-01T19:00',
   '2020-01-01T20:00',
   '2020-01-01T21:00',
   '2020-01-01T22:00',
   '2020-01-01T23:00'],
  'precipitation': [0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   

In [221]:
flights_500 = flights[['fl_date', 'origin_city_name','dest_city_name']].head(500)

In [222]:
flights_500

Unnamed: 0,fl_date,origin_city_name,dest_city_name
0,2018-07-10,"San Antonio, TX","Nashville, TN"
1,2019-10-10,"Chicago, IL","Orlando, FL"
2,2019-07-18,"Lawton/Fort Sill, OK","Dallas/Fort Worth, TX"
3,2018-11-25,"Savannah, GA","Chicago, IL"
4,2018-10-24,"Detroit, MI","Santa Ana, CA"
...,...,...,...
495,2018-12-12,"Honolulu, HI","Hilo, HI"
496,2018-02-13,"San Antonio, TX","New Orleans, LA"
497,2018-01-03,"Houston, TX","Corpus Christi, TX"
498,2018-10-25,"Charlotte, NC","Salt Lake City, UT"


In [223]:
add_weather_condition(flights_500)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'weather_{loc}'][index] = "sunny"


TypeError: cannot convert the series to <class 'float'>