# 1. Imports

`imports`, you've seen this before!

In [22]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from datetime import datetime
import geopy
from geopy.geocoders import Nominatim
import seaborn as sns
import geopandas as gpd
from shapely.geometry import Point, Polygon

A magic command that tells Jupyter to display graphs inline in the notebook, directly below the cell that produces them.

In [23]:
%matplotlib inline

# 2. Read in the csv file

In [24]:
# Start from the local copy of the CSV; if it is not on disk, switch to the
# published data.gov.au download so this cell can still load the data.
filepath = "international_air_traffic_data_set.csv"
if os.path.isfile(filepath):
    print("loading from file")
else:
    filepath = "https://data.gov.au/data/dataset/d9fbffaa-836f-4f52-80e8-324249ff269f/resource/ebcafd83-9514-4f72-a995-fe7ee90cb9da/download/city_pairs.csv"
    print("loading from the internet")

# read_csv is given whichever location was chosen above.
international_air_traffic_data_set = pd.read_csv(filepath)
print("done")

loading from file
done


Read the data frame as df

In [25]:
# Work on a copy of the frame the previous cell already loaded. Re-reading the
# hard-coded local file name ignored the URL fallback and raised
# FileNotFoundError whenever the CSV was not on disk. Copying also keeps the
# originally loaded frame untouched when columns are added to df later on.
df = international_air_traffic_data_set.copy()
df.head()

Unnamed: 0,Month,AustralianPort,ForeignPort,Country,Passengers_In,Freight_In_(tonnes),Mail_In_(tonnes),Passengers_Out,Freight_Out_(tonnes),Mail_Out_(tonnes),Passengers_Total,Freight_Total_(tonnes),Mail_Total_(tonnes),Year,Month_num
0,31048,Adelaide,Auckland,New Zealand,1513.0,42.167,0.311,985.0,18.704,0.924,2498.0,60.871,1.235,1985,1
1,31048,Adelaide,Bahrain,Bahrain,12.0,0.0,0.0,5.0,0.033,0.0,17.0,0.033,0.0,1985,1
2,31048,Adelaide,Bombay,India,7.0,0.0,0.0,5.0,0.0,0.0,12.0,0.0,0.0,1985,1
3,31048,Adelaide,Frankfurt,Germany,115.0,0.009,0.0,171.0,0.0,0.248,286.0,0.009,0.248,1985,1
4,31048,Adelaide,London,UK,1567.0,2.8,0.0,1472.0,10.618,2.487,3039.0,13.418,2.487,1985,1


# Info about the data frame

Looking at the info for the data frame - are there any null values? What are the column types?

In [26]:
# Summarise every column: its non-null count and its dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82499 entries, 0 to 82498
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Month                   82499 non-null  int64  
 1   AustralianPort          82499 non-null  object 
 2   ForeignPort             82499 non-null  object 
 3   Country                 82499 non-null  object 
 4   Passengers_In           82487 non-null  float64
 5   Freight_In_(tonnes)     82495 non-null  float64
 6   Mail_In_(tonnes)        82495 non-null  float64
 7   Passengers_Out          82489 non-null  float64
 8   Freight_Out_(tonnes)    82495 non-null  float64
 9   Mail_Out_(tonnes)       82495 non-null  float64
 10  Passengers_Total        82489 non-null  float64
 11  Freight_Total_(tonnes)  82499 non-null  float64
 12  Mail_Total_(tonnes)     82499 non-null  float64
 13  Year                    82499 non-null  int64  
 14  Month_num               82499 non-null

Tells you how many items are in each YEAR

In [27]:
# Number of rows recorded for each year.
df.groupby("Year").size()

Year
1985    1944
1986    2224
1987    2284
1988    2361
1989    2369
1990    2471
1991    2281
1992    2469
1993    2656
1994    2600
1995    2807
1996    3035
1997    3157
1998    3162
1999    3407
2000    3502
2001    2970
2002    2769
2003    1820
2004    1877
2005    1898
2006    1830
2007    1824
2008    1792
2009    1738
2010    1791
2011    1875
2012    1808
2013    1848
2014    1786
2015    1828
2016    1949
2017    2106
2018    2240
2019    2219
2020    1373
2021     429
dtype: int64

Tells you how many items are in each DOMESTIC PORT

Highest to lowest

In [28]:
# Row count per Australian port, most frequent first; dropna=False means a
# missing port name would be counted as its own entry instead of being ignored.
# NOTE(review): the counts include spelling variants of the same port (for
# example "Gold Coast" and "GoldCoast") - consider normalising the names first.
df.AustralianPort.value_counts(dropna=False)

Sydney                    22911
Melbourne                 16471
Brisbane                  13628
Perth                      8890
Cairns                     6752
Adelaide                   5967
Darwin                     4383
Gold Coast                  970
Townsville                  660
Gold Coast/Coolangatta      503
Norfolk Island              418
Hobart                      319
Port Hedland                286
Canberra                     94
Christmas Island             80
Toowoomba Wellcamp           64
Broome                       37
Sunshine Coast               37
Newcastle                    18
GoldCoast                     8
NorfolkIsland                 1
PortHedland                   1
ToowoombaWellcamp             1
Name: AustralianPort, dtype: int64

Tells you how many items are in each FOREIGN PORT

Highest to lowest

In [29]:
# Row count per foreign port, most frequent first (missing names are counted too).
df.ForeignPort.value_counts(dropna=False)

Auckland          3460
Singapore         3253
Denpasar          2992
Hong Kong         2793
Kuala Lumpur      2505
                  ... 
Mombasa              1
San Diego            1
Phoenix              1
Al Ain               1
General Santos       1
Name: ForeignPort, Length: 203, dtype: int64

Tells you how many items are in each FOREIGN PORT

Alphabetical

In [30]:
# Row count per foreign port; the result is ordered by port name rather than by count.
df.groupby("ForeignPort").size()

ForeignPort
Abu Dhabi    808
AbuDhabi       3
Abuja          3
Al Ain         1
Almaty        17
            ... 
Xi'an         81
Xiamen       111
Zagreb        22
Zhengzhou     46
Zurich       128
Length: 203, dtype: int64

This cell looks up a country in the data frame

In [31]:
# Keep only the rows whose Country column equals "Japan".
Look_up_a_Country = df.loc[df["Country"] == "Japan"]
Look_up_a_Country

Unnamed: 0,Month,AustralianPort,ForeignPort,Country,Passengers_In,Freight_In_(tonnes),Mail_In_(tonnes),Passengers_Out,Freight_Out_(tonnes),Mail_Out_(tonnes),Passengers_Total,Freight_Total_(tonnes),Mail_Total_(tonnes),Year,Month_num
31,31048,Brisbane,Tokyo,Japan,536.0,4.085,0.000,384.0,4.713,0.212,920.0,8.798,0.212,1985,1
86,31048,Melbourne,Tokyo,Japan,228.0,42.059,1.404,333.0,15.395,0.892,561.0,57.454,2.296,1985,1
108,31048,Perth,Tokyo,Japan,55.0,3.004,0.000,35.0,13.661,0.262,90.0,16.665,0.262,1985,1
152,31048,Sydney,Tokyo,Japan,5949.0,313.350,15.190,5993.0,185.748,5.289,11942.0,499.098,20.479,1985,1
192,31079,Brisbane,Tokyo,Japan,455.0,7.364,0.000,206.0,5.626,0.312,661.0,12.990,0.312,1985,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82409,44287,Brisbane,Tokyo,Japan,0.0,39.567,41.454,0.0,139.605,17.499,0.0,179.172,58.953,2021,4
82414,44287,Cairns,Tokyo,Japan,0.0,0.000,0.000,0.0,0.006,0.000,0.0,0.006,0.000,2021,4
82443,44287,Melbourne,Tokyo,Japan,0.0,89.343,10.888,0.0,207.554,3.199,0.0,296.897,14.087,2021,4
82479,44287,Sydney,Nagoya,Japan,0.0,0.000,0.000,0.0,0.000,0.000,0.0,0.000,0.000,2021,4


# Useful Groupings

Groups the data by DOMESTIC Airport

In [None]:
# Print each Australian port's name followed by the rows that belong to it.
for group_key, group_value in df.groupby(by="AustralianPort"):
    print(group_key, group_value, sep="\n")

Groups the data frame by year

In [None]:
# Print each year followed by the rows recorded for that year.
for group_key, group_value in df.groupby(by="Year"):
    print(group_key, group_value, sep="\n")

Creates a list of the FOREIGN PORTS

In [None]:
# List each distinct foreign port name once, in alphabetical order.
# Calling list() on the groupby object produced (name, DataFrame) pairs for
# every group - a dump of the whole data set - rather than a list of ports.
sorted(df["ForeignPort"].unique())

Shows the first and last year that an airport had been used

In [None]:
# For each Australian port: number of rows, plus the earliest and latest year it appears in.
df.groupby("AustralianPort").agg({"Year" : ["count", "min", "max"]})

Shows the first and last year that a foreign port had been travelled to

In [None]:
# For each foreign port: number of rows, plus the earliest and latest year it appears in.
df.groupby("ForeignPort").agg({"Year" : ["count", "min", "max"]})

SPECIFIC AIRPORT, MONTH AND YEAR

Shows all the travel in January 2016 from Sydney

In [None]:
# Rows for Sydney in month 1 of 2016: all three conditions must hold.
a = df.loc[
    (df["Year"] == 2016)
    & (df["AustralianPort"] == "Sydney")
    & (df["Month_num"] == 1)
]
a

Groups the data frame by year and australian airport

then it tells you how many times the foreign airport appears in that year

In [None]:
# Number of rows for every (year, Australian port, foreign port) combination.
df.groupby(["Year", "AustralianPort", "ForeignPort"]).size()

Just shows a DATAFRAME with relevant information on PASSENGERS

In [None]:
# Narrow the frame to the route identifiers and the three passenger counts.
df.loc[:, [
    "Year",
    "AustralianPort",
    "ForeignPort",
    "Country",
    "Passengers_In",
    "Passengers_Out",
    "Passengers_Total",
]]

SHOWS cells with BOTH "USA" and "SYDNEY" in it

boolean indexing 

Just shows a DATAFRAME with relevant information on PASSENGERS

In [None]:
# Passenger-only view of the data.
dfa = df[["Year", "AustralianPort", "ForeignPort", "Country", "Passengers_In", "Passengers_Out", "Passengers_Total"]]
# Boolean indexing: keep the rows where both conditions hold (Sydney routes to/from the USA).
Sydney = dfa[(dfa.AustralianPort == "Sydney") & (dfa.Country == "USA")]
# sort_values returns a new frame instead of sorting in place. The sorted
# result was previously thrown away, so tail() displayed the unsorted rows.
# Chaining shows the five largest passenger totals; na_position="first" keeps
# rows with a missing total from being sorted to the end and shown instead.
Sydney.sort_values(by=["Passengers_Total", "AustralianPort"], na_position="first").tail()

# 3. Get long and lat

In [None]:
# Imports used by this cell (the first three are already loaded at the top of
# the notebook; they are kept so the cell still lists everything it needs).
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Nominatim geocoding client; user_agent identifies this application to the
# service and timeout is the number of seconds to wait for each reply.
geolocator = Nominatim(timeout=10, user_agent = "myGeolocator")
# Throttled wrapper around geolocator.geocode: waits at least one second
# between requests. With its default settings it retries on geocoder service
# errors and returns None if they persist, so a failure shows up below as
# "not found" instead of being hidden by a bare except.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Read in the data frame.
# NOTE(review): this re-reads a hard-coded local file name; confirm the CSV is
# on disk before running this cell.
df = pd.read_csv("international_air_traffic_data_set.csv")

# Concatenate the city name and the country, e.g. "Auckland, New Zealand".
df["full_address"] = df.ForeignPort + ", " + df.Country

# The frame has over 82000 rows, so only the distinct addresses are geocoded -
# far fewer requests and far less waiting.
unique_ports = df.full_address.unique()

# Parallel lists filled with the data that comes back from the API; an entry
# is added to all four only when a port was found, so they stay the same length.
lat = []
lon = []
ports = []
locations = []

# Look up every distinct address.
for port in unique_ports:
    location = geocode(port)
    if location is None:
        # The service did not recognise this name (or kept failing): print it
        # so the missing ports are visible. Some names are stored without
        # spaces in the data itself (both "Abu Dhabi" and "AbuDhabi" occur in
        # ForeignPort), which is a likely cause of misses such as "NewYork".
        print(port)
        continue
    lat.append(location.latitude)
    lon.append(location.longitude)
    locations.append(location)
    ports.append(port)
    print(location.longitude, location.latitude)

# Collect the lists in a dictionary ...
di = {"port":ports, "locations":locations, "latitude":lat, "longitude":lon}
# ... and convert the dictionary into a data frame with one row per found port.
port_df = pd.DataFrame(di)

# 4. Graphs

Shows the amount of passengers from each country

In [None]:
# Total passengers (Passengers_Total, i.e. in + out) for each country, largest
# first. The previous version counted rows instead of passengers and discarded
# the result of sort_values, which does not sort in place.
a = df.groupby("Country")["Passengers_Total"].sum().sort_values(ascending=False)

# One bar per country on an explicit Axes. The line that used to be drawn over
# the bars repeated the same values and has been dropped.
fig, ax = plt.subplots()
a.plot(kind="bar", ax=ax)
ax.set_title("Air Travel Passengers by Country", fontsize=18)
# The x axis holds countries, not years.
ax.set_xlabel('Country', fontsize=26)
ax.set_ylabel('Passengers', fontsize=26)
ax.grid(False)
plt.show()

Plots the Passengers

Shows the increase in passengers and then decrease when COVID hit

In [None]:
# Outbound passengers summed per year.
income = df.groupby("Year")[["Passengers_Out"]].sum()

# Reset matplotlib's settings to their defaults before drawing.
plt.rcdefaults()
fig, ax = plt.subplots()
# Line with "x" markers: one point per year.
ax.plot(income, "x-")
ax.set_title("Airtravel Pasengers", fontsize=18)
ax.set_xlabel('Year', fontsize=26)
ax.set_ylabel('Passengers', fontsize=26)
ax.grid(True)
plt.show()

# 5. Maps

Loads the world map and cities datasets

In [10]:
 # Read the low-resolution country outlines and the city points from the
 # example datasets that geopandas exposes through gpd.datasets.
 # NOTE(review): gpd.datasets is deprecated in newer geopandas releases -
 # confirm the installed version still provides it.
 world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
 cities = gpd.read_file(gpd.datasets.get_path('naturalearth_cities'))

Shows the world map
no data

In [None]:
# Draw the world geometries on their own; nothing else is overlaid yet.
world.plot()

This code maps the long and lat data onto a world map

In [None]:
# The data frame with the longitude and latitude coordinates. `long_lat` was
# never defined in this notebook; the geocoding cell stores its result in
# `port_df`, which has the "longitude" and "latitude" columns used below.
data = port_df
# Tells geopandas the coordinate system: EPSG:4326 (longitude/latitude in
# degrees). The plain authority string replaces the deprecated
# {'init': 'epsg:4326'} dictionary form.
crs = "EPSG:4326"
# Defines the geometry: one point per row, x = longitude, y = latitude.
geometry = [Point(xy) for xy in zip(data["longitude"], data["latitude"])]
# Combines the table and the points into a GeoDataFrame.
geodata = gpd.GeoDataFrame(data, crs=crs, geometry=geometry)

# Loads the shape file of the world from geopandas. The module is imported as
# `gpd`; the name `geopandas` is not bound and raised a NameError here.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Formatting (colour, text, sizes) is below.
fig, ax = plt.subplots(figsize=(15,7))

# cmap is the colour map - see matplotlib for all the available colour maps.
world.plot(ax=ax, facecolor='Grey', edgecolor='k',alpha=1,linewidth=0.1,cmap="Blues")

# Ports drawn as small red markers on top of the world map.
geodata.plot(ax=ax, color='red', markersize=5)
fig.suptitle('Airports', fontsize=12)
ax.set_xlabel('Longitude', fontsize=10)
# The trailing semicolon stops the notebook from echoing the Text object.
ax.set_ylabel('Latitude', fontsize='medium');