In [None]:
import pandas as pd

In [None]:
# clone the training repository; the sample CSV read below ('geo_training/data-sample.csv') lives inside it
# (re-running this cell once the folder exists just prints a git error and changes nothing)
!git clone https://github.com/Charlesfox1/geo_training

In [None]:
# pandas ships readers for .csv, .xlsx, databases and many other formats;
# the sample data here is a CSV inside the cloned repository
csv_path = 'geo_training/data-sample.csv'
df = pd.read_csv(csv_path)

In [None]:
# inspect your dataframe by just calling its variable name in the notebook
# (the notebook renders the last expression of a cell)
df

In [None]:
# .shape is a (rows, columns) tuple
# a dataset of this size would already be pushing it in Excel!
df.shape

In [None]:
# a DataFrame is composed of columns and rows, and a special 3rd thing - an index

In [None]:
# the column labels
df.columns

In [None]:
# the row labels (the index)
df.index

In [None]:
# we select a column by using its name in square brackets:
df['ship_size']

In [None]:
# we can select rows by their index position or the 
# value of the index - these aren't the same thing!
# .iloc selects by integer position: this is the first row
df.iloc[0]

In [None]:
# .loc selects by index label: this is the row whose index value is 0.
# With the default numeric index from read_csv that matches .iloc[0] -
# but position and label aren't always the same thing!
df.loc[0]

In [None]:
# you can choose what data item(s) act as the index:
# (set_index moves 'ship_size' out of the columns, so re-running this cell on the
# already re-indexed frame raises a KeyError)
df = df.set_index('ship_size')
df

In [None]:
# let's try .loc again - it now matches on the 'ship_size' labels:
df.loc['SMALL']

In [None]:
# first argument of loc is the index, second can be the columns:
df.loc['SMALL','ship_type']

In [None]:
# you can select multiple columns at once:
df.loc[:,['frequency','period_ending','ship_type']]

In [None]:
# ...which is the same as (the : at the front can be ignored - means 'all rows')
df[['frequency','period_ending','ship_type']]

In [None]:
# ...also handy if you want to filter the index
df.loc['SMALL', ['frequency','period_ending','ship_type']]

In [None]:
# rows can be sliced by position just like a Python list:
# here, the rows at positions 4 to 9 of three chosen columns
wanted_columns = ['frequency','period_ending','ship_type']
df[wanted_columns].iloc[4:10]

In [None]:
# assigning to a column name that doesn't exist yet creates that column;
# here: 'dwt' divided by 'ship_count', row by row
df['average_dwt'] = df['dwt'].div(df['ship_count'])

# the new column shows up as the last column:
df

In [None]:
# easiest to do operations in the column direction - but nothing stops you from transposing the table:
df.T

In [None]:
# if you want to return the index to a standard numerical one, it's:
# (the old index, 'ship_size', becomes an ordinary column again)
df = df.reset_index()

In [None]:
# text columns can be added as you'd expect
# NOTE(review): assumes 'draft' holds text - a numeric column could not be joined with '+' like this; confirm
df['description'] = df['ship_size']+' '+df['ship_type']+' - '+df['draft']

In [None]:
# check the combined text column
df['description']

In [None]:
# fairly sophisticated methods are also available for mapping and filling values:
# .map looks each value up in the dict; sizes that are not keys of the dict become NaN
size_mapper = {'SMALL':1, 
               'MEDIUM':2}
df['num_size'] = df['ship_size'].map(size_mapper)

In [None]:
# how many rows received each mapped value (value_counts leaves NaN out by default)
df['num_size'].value_counts()

In [None]:
# True wherever the mapping produced no value
df['num_size'].isnull()

In [None]:
# count the missing vs. non-missing rows
df['num_size'].isnull().value_counts()

In [None]:
# give every unmapped size the value 3
df['num_size'] = df['num_size'].fillna(3)

In [None]:
# after the fill, no missing values should remain
df['num_size'].isnull().value_counts()

In [None]:
# the filled rows are now counted under 3
df['num_size'].value_counts()

In [None]:
# we can also fill forward with ffill() and fill back with bfill() if we need to

In [None]:
# final function of interest is joining to create a new DataFrame:
# a = every column except the last two; b = the 'description' and 'num_size' columns
# (these are the last two, being the most recently added)
a = df[df.columns[:-2]]
b = df[['description','num_size']]

In [None]:
a

In [None]:
b

In [None]:
# implicit left join on index, but that can be changed with various options - 
# see https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.join.html
c = a.join(b)

In [None]:
# a and b share the same index, so the join puts the columns back side by side
c

### Let's Explore our dataset

In [None]:
# looks like we have two different types of count - ships come in, ships go out
# (value_counts shows how many rows carry each 'metric' value)
df['metric'].value_counts()

In [None]:
# let's focus on entries into port
df = df[df.metric == 'Entries']

In [None]:
df = df[df.metric == 'Entries'].sort_values(by = 'period_ending', ascending = True)

In [None]:
# Often we want to see the 'wood' (patterns) for the 'trees' (individual bits of data)
# Data Scientists achieve that through summarisation or visualisation. 
# 'Groupby' can be used to group data that share a common field value.
# An operation can then be applied (e.g. summing, averaging) to immediately get useful answers
# On its own, groupby is more powerful than anything in Excel!

In [None]:
# what are the top 10 busiest ports by ship count over the year?
# group by port, total the ship counts, then keep the 10 largest totals
(
    df.groupby(['port_country', 'port_name'])['ship_count']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Does the answer change if we look at deadweight tonnage?
(
    df.groupby(['port_country', 'port_name'])['dwt']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# you can pre-filter as well - e.g. what are the 10 busiest
# ports by deadweight tonnage for tanker-type ships?
(
    df[df['ship_type'] == 'TANKER']
    .groupby(['port_country', 'port_name'])['dwt']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# you will notice that these frames look different. that's because when you call 
# an aggregation on only one column, pandas gives you a pd.Series and not a pd.DataFrame:

In [None]:
# same aggregation as before, wrapped in type() to show what kind of object comes back
type(
    df[df['ship_type'] == 'TANKER']
    .groupby(['port_country', 'port_name'])['dwt']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# you can force it to return a DataFrame by asking for the column with double brackets
# (sorting a DataFrame needs to be told which column to sort by):
(
    df[df['ship_type'] == 'TANKER']
    .groupby(['port_country', 'port_name'])[['dwt']]
    .sum()
    .sort_values(by='dwt', ascending=False)
    .head(10)
)

In [None]:
# the returned object can be assigned to a variable, e.g. sdf:
sdf = (
    df[df['ship_type'] == 'TANKER']
    .groupby(['port_country', 'port_name'])[['dwt']]
    .sum()
    .sort_values(by='dwt', ascending=False)
    .head(10)
)

In [None]:
# display the 10-row summary
sdf

In [None]:
# a DataFrame this time, because 'dwt' was requested with double brackets
type(sdf)

In [None]:
# here, because we are grouping by more than one column, we get a multi-index return
sdf.index

In [None]:
# we can return this to a standard df with reset_index()
# ('port_country' and 'port_name' become ordinary columns again)
sdf = sdf.reset_index()

In [None]:
sdf

In [None]:
# very basic visualisation is built in to pandas:
# (the index supplies the bar labels, so 'port_name' is made the index first)
sdf.set_index('port_name')['dwt'].plot.bar()

In [None]:
# summary statistics for 'average_dwt' with extra upper percentiles, rounded to whole numbers
upper_percentiles = [0.9, 0.95, 0.975, 0.99]
df['average_dwt'].describe(percentiles=upper_percentiles).round(0)

In [None]:
# histogram of 'average_dwt'; `by` and `bins` are left at their defaults (None and 10)
df['average_dwt'].plot.hist(title='Average DWT')

In [None]:
# parse the period-end dates, then store the calendar month number as its own column
period_end_dates = pd.to_datetime(df['period_ending'])
df['month'] = period_end_dates.dt.month

In [None]:
# how many rows fall in each region corridor
df['region_corridor'].value_counts()

### Plotly Express
- Make better-looking plots, with more flexibility, using Plotly

In [None]:
pip install plotly

In [None]:
import plotly.express as px

d = df.copy()
d = d[['region_corridor','month','dwt']] # pick out 3 columns
# sum dwt of ship entries by month by corridor
# ('dwt' is the only value column kept above, and it is what the chart plots on y)
d = d.groupby(['region_corridor', 'month'])['dwt'].sum().reset_index()

fig = px.line(d, # pass the dataframe to plotly express
              x="month", # define x 
              y="dwt", # define y
              color="region_corridor", # define variable to color lines by 
              line_shape="spline", # fit a spline between points
              title = 'Total Dead Weight Tonnage by Shipping Region Corridor',
              labels = {'month':'Month',
                        'dwt':'Dead Weight Tonnage (dwt)',
                        'region_corridor':'Region Corridor'},
              height = 750
             )


fig.show()

In [None]:
# look only at ships entering Hong Kong port; copying AFTER the filter gives an
# independent frame, so adding a column below does not raise SettingWithCopyWarning
d = df[df.port_name == 'Hong Kong'].copy()
d['Ship Description'] = d['ship_type']+' - '+d['ship_size'] # make a new composite column 
# NOTE(review): assumes the data has a 'modified_dwt' column - it is not created in the cells shown here; confirm
d = d.groupby(['month','Ship Description'])['modified_dwt'].sum().reset_index() # groupby new column

# sort so the ship descriptions appear in alphabetical order
d = d.sort_values(by = 'Ship Description')

fig = px.bar(d, # pass the dataframe to plotly express
              x="month", # define x 
              y="modified_dwt", # define y
              color="Ship Description", # define variable to color bars by 
              title = 'Modified Dead Weight Tonnage by Ship Type Arriving to Hong Kong',
              labels = {'month':'Month',
                        'modified_dwt':'Dead Weight Tonnage (dwt)'},
              color_discrete_map = { # fixed colour per ship type / size combination
                 'BULKER - LARGE':'#0A2F51',
                 'BULKER - MEDIUM':'#137177',
                 'BULKER - SMALL':'#1D9A6C',
                 'CONTAINER - LARGE':'#98397E',
                 'CONTAINER - MEDIUM':'#C54F65',
                 'CONTAINER - SMALL':'#D67693',
                 'TANKER - LARGE':'#486E86',
                 'TANKER - MEDIUM':'#6E9ABC',
                 'TANKER - SMALL':'#97C4EF'
                 },
              height = 750
             )
fig.show()