In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go

In [2]:
# Read the flights CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/flights_train.csv')

In [19]:
# Preview the first rows of the loaded frame to check the columns.
data.head()

Unnamed: 0,flight_date,from,to,avg_weeks,target,std_weeks
0,2012-06-19,ORD,DFW,12.875,12.331296,9.812647
1,2012-09-10,LAS,DEN,14.285714,10.775182,9.466734
2,2012-10-05,DEN,LAX,10.863636,11.083177,9.035883
3,2011-10-09,ATL,ORD,11.48,11.169268,7.990202
4,2012-02-21,DEN,SFO,11.45,11.269364,9.517159


In [18]:
# Parse the flight_date column with pd.to_datetime and store the result back
# in the same column.
data['flight_date'] = pd.to_datetime(data['flight_date'])

In [93]:
# NOTE(review): the saved source of this cell was truncated to `data.dr`, which
# raises AttributeError on a fresh run. The saved output is an Index of airport
# codes, so this presumably listed airports by how often they occur -- confirm
# that `from` is the intended column.
data['from'].value_counts().index

Index(['ORD', 'ATL', 'LAX', 'SFO', 'DFW', 'DEN', 'BOS', 'LGA', 'LAS', 'EWR',
       'JFK', 'PHL', 'SEA', 'MSP', 'MIA', 'DTW', 'MCO', 'CLT', 'IAH', 'PHX'],
      dtype='object')

In [41]:




def plot_time_serie(dest_from, dest_to):
    """Plot `target` against `flight_date` for routes leaving one airport.

    One "markers+lines" trace is added per destination, all on a single
    plotly figure that is titled with `dest_from` and then shown.

    Parameters
    ----------
    dest_from : str
        Value matched against the `from` column of the notebook-level `data`.
    dest_to : str or iterable of str
        Value(s) matched against the `to` column. A single string is treated
        as one destination instead of being iterated character by character.
    """
    if isinstance(dest_to, str):
        dest_to = [dest_to]

    # Filter by origin once; every trace only differs in its destination.
    from_origin = data[data['from'] == dest_from]

    fig = go.Figure()
    for dest in dest_to:
        # sort_values returns a new frame; sorting the filtered slice in place
        # is what triggers pandas' SettingWithCopyWarning.
        route = from_origin[from_origin['to'] == dest].sort_values(by='flight_date')
        fig.add_trace(go.Scatter(
            name=dest,
            mode="markers+lines",
            x=route["flight_date"],
            y=route["target"],
        ))
    fig.update_xaxes(showgrid=True, ticklabelmode="period")
    fig.update_layout(title_text=dest_from)
    fig.show()

    


# Smoke test: draw the time series for two origins and a few destinations each.
plot_time_serie(dest_from='ORD', dest_to=['LAX', 'DFW', 'LGA', 'BOS'])
plot_time_serie(dest_from='ATL', dest_to=['DTW', 'DFW', 'LGA', 'BOS'])

In [51]:
# Per (from, to) pair, count the non-null entries of every remaining column.
grouped_data = data.groupby(['from', 'to']).count()


In [59]:
# some data exploration
# link graph: one node per airport, one edge per (from, to) pair present in the data

from pyvis.network import Network
import pandas as pd

got_net = Network(height='750px', width='100%', bgcolor='#222222', font_color='white')

# grouped_data is indexed by (from, to), so its index alone lists every route;
# the per-route counts were never used for the graph.
for src, dst in grouped_data.index:
    got_net.add_node(src, src, title=src)
    got_net.add_node(dst, dst, title=dst)
    got_net.add_edge(src, dst)

neighbor_map = got_net.get_adj_list()

# add neighbor data to node hover data
for node in got_net.nodes:
    # Sort the neighbours so the hover text is identical on every run instead
    # of following the arbitrary iteration order of the adjacency collection.
    neighbors = sorted(neighbor_map[node['id']])
    node['title'] += ' Neighbors:<br>' + '<br>'.join(neighbors)
    node['value'] = len(neighbors)
got_net.show_buttons(filter_=['physics','edges'])
got_net.show('../renders/airports.html')


In [None]:
# plot distribution of flights
def plot_dist(dest_from, dest_to):
    """Draw one density curve of `target` per destination served from `dest_from`.

    Parameters
    ----------
    dest_from : str
        Value matched against the `from` column of the notebook-level `data`.
    dest_to : iterable of str
        Candidate values for the `to` column. A destination is skipped when
        its rows hold fewer than two distinct `target` values, because the
        density estimate behind the curve cannot be fitted to such a sample.

    The figure is titled with `dest_from` and shown; nothing is returned. When
    no destination qualifies a message is printed instead of a figure.
    """
    # Filter by origin once; each destination then only needs a cheap sub-filter.
    from_origin = data[data['from'] == dest_from]

    dist_list = []
    to_dest = []
    for dest in dest_to:
        target_values = from_origin.loc[from_origin['to'] == dest, 'target'].values
        if len(np.unique(target_values)) > 1:
            dist_list.append(target_values)
            to_dest.append(dest)

    if not dist_list:
        # create_distplot cannot build a figure from an empty list of groups.
        print(f'No destination with enough data to plot from {dest_from}')
        return

    # Curves only (no histogram bars), one per remaining destination.
    fig = ff.create_distplot(dist_list, to_dest, show_hist=False)
    fig.update_layout(title_text=dest_from)
    fig.show()


# One figure per origin airport, each compared against every destination code.
for origin in data['from'].unique():
    plot_dist(origin, data['to'].unique())


In [107]:
import mpu
airport_spatial = pd.read_csv('../data/airports_spatial.csv')


def _airport_coords(iata_code):
    """Return (latitude, longitude) of the first `airport_spatial` row for `iata_code`."""
    match = airport_spatial.loc[airport_spatial['iata_code'] == iata_code,
                                ['latitude', 'longitude']]
    if match.empty:
        # Same exception type as a bare `.values[0]` lookup on an empty
        # selection, but the message names the offending code.
        raise IndexError(f'IATA code {iata_code!r} not found in airport_spatial')
    return match['latitude'].values[0], match['longitude'].values[0]


def calculate_dst(airp_a, airp_b):
    """Return the haversine distance between two airports given by IATA code.

    Parameters
    ----------
    airp_a, airp_b : str
        Codes looked up in the `iata_code` column of `airport_spatial`.

    Returns
    -------
    float
        The value of `mpu.haversine_distance` for the two coordinate pairs
        (kilometres according to mpu's documentation).

    Raises
    ------
    IndexError
        If either code has no row in `airport_spatial`.
    """
    return mpu.haversine_distance(_airport_coords(airp_a), _airport_coords(airp_b))



17.19850284408619

In [None]:
# plot distribution of flights
def plot_dist(dest_from, dest_to, order_dist=None):
    """Show one box plot of `target` per destination served from `dest_from`.

    Destinations without any matching row in `data` are left out. With a
    truthy `order_dist` the boxes are ordered by
    `calculate_dst(dest_from, dest)`; otherwise they are ordered by the mean
    of their `target` values. The figure is titled with `dest_from`.
    """
    # Collect (values, destination) pairs for every route that has data.
    samples = []
    for dest in dest_to:
        on_route = (data['from'] == dest_from) & (data['to'] == dest)
        values = data[on_route]['target'].values
        if len(values) > 0:
            samples.append((values, dest))

    # Pick the ordering criterion once, then sort (stable, like sorted()).
    if order_dist:
        def sort_key(pair):
            return calculate_dst(dest_from, pair[1])
    else:
        def sort_key(pair):
            return np.mean(pair[0])
    samples.sort(key=sort_key)

    fig = go.Figure()
    for values, dest in samples:
        fig.add_trace(go.Box(x=values, name=dest))

    fig.update_layout(title_text=dest_from)
    fig.show()
    



# One box-plot figure per origin airport, destinations ordered by distance.
for origin in data['from'].unique():
    plot_dist(origin, data['to'].unique(), order_dist=True)



In [120]:
# plot distribution of flights
def plot_dist(dest_from, dest_to, order_dist=None):
    """Scatter the mean `target` of each route from `dest_from` against its distance.

    Parameters
    ----------
    dest_from : str
        Value matched against the `from` column of the notebook-level `data`.
    dest_to : iterable of str
        Candidate values for the `to` column; destinations without any
        matching row are skipped.
    order_dist : optional
        Accepted so existing calls keep working; a marker-only scatter has no
        ordering, so the value is not used.

    Each marker has x = `calculate_dst(dest_from, dest)` and y = the mean of
    that route's `target` values. The figure is titled with `dest_from`.
    """
    # Filter by origin once; each destination then only needs a cheap sub-filter.
    from_origin = data[data['from'] == dest_from]

    dest_dist = []
    mean_target = []
    labels = []
    for dest in dest_to:
        target_values = from_origin.loc[from_origin['to'] == dest, 'target'].values
        if len(target_values) > 0:
            dest_dist.append(calculate_dst(dest_from, dest))
            # One number per route: the trace plots the mean, not the raw arrays.
            mean_target.append(np.mean(target_values))
            labels.append(dest)

    # plot dist vs mean
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        name='mean',
        x=dest_dist,
        y=mean_target,
        text=labels,  # destination code shown on hover
        mode='markers',
    ))

    # add title
    fig.update_layout(
        title_text=dest_from,
        xaxis_title='distance from origin',
        yaxis_title='mean target',
    )
    fig.show()
    



# One distance-vs-target figure per origin airport, over every destination code.
for origin in data['from'].unique():
    plot_dist(origin, data['to'].unique(), order_dist=True)
