In [1]:
import os
import pyspark

# Start (or reuse) the Spark session.
# Local master with a fixed application name.
conf = pyspark.SparkConf().setMaster("local").setAppName("My app")

# Go through the builder instead of constructing SparkContext/SparkSession
# directly: getOrCreate() returns the already-running session when this cell
# is executed again, whereas a second SparkContext(...) call in the same
# kernel raises an error.
spark = pyspark.sql.SparkSession.builder.config(conf=conf).getOrCreate()

# Keep `sc` available for any code that uses the underlying context.
sc = spark.sparkContext
spark

In [2]:
from pyspark.sql.functions import col, sum, round, when, struct, collect_list, first

In [3]:
# Read the fishing-fleet CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
fleets = spark.read.options(
    inferSchema="true",
    header="true",
).csv("data/fishing_fleet.csv")

In [4]:
# Peek at the first five rows of the raw dataset to see which columns carry
# codes and which carry the matching human-readable labels.
fleets.head(5)

[Row(COUNTRY0='AUS', Country1='Australia', FLEET2='TOT_VESSEL', Fleet3='Total Vessels', MEASURE4='NUM', Measure5='Number of vessels', YEAR6=2010, Year7=2010, Unit Code='NBR', Unit='Number', PowerCode Code=0, PowerCode='Units', Reference Period Code=None, Reference Period=None, Value=318.0, Flag Codes=None, Flags=None),
 Row(COUNTRY0='AUS', Country1='Australia', FLEET2='TOT_VESSEL', Fleet3='Total Vessels', MEASURE4='NUM', Measure5='Number of vessels', YEAR6=2011, Year7=2011, Unit Code='NBR', Unit='Number', PowerCode Code=0, PowerCode='Units', Reference Period Code=None, Reference Period=None, Value=325.0, Flag Codes=None, Flags=None),
 Row(COUNTRY0='AUS', Country1='Australia', FLEET2='TOT_VESSEL', Fleet3='Total Vessels', MEASURE4='NUM', Measure5='Number of vessels', YEAR6=2012, Year7=2012, Unit Code='NBR', Unit='Number', PowerCode Code=0, PowerCode='Units', Reference Period Code=None, Reference Period=None, Value=318.0, Flag Codes=None, Flags=None),
 Row(COUNTRY0='AUS', Country1='Austra

In [5]:
# Inspect the column names and inferred types of the raw dataset.
fleets.printSchema()

root
 |-- COUNTRY0: string (nullable = true)
 |-- Country1: string (nullable = true)
 |-- FLEET2: string (nullable = true)
 |-- Fleet3: string (nullable = true)
 |-- MEASURE4: string (nullable = true)
 |-- Measure5: string (nullable = true)
 |-- YEAR6: integer (nullable = true)
 |-- Year7: integer (nullable = true)
 |-- Unit Code: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- PowerCode Code: integer (nullable = true)
 |-- PowerCode: string (nullable = true)
 |-- Reference Period Code: string (nullable = true)
 |-- Reference Period: string (nullable = true)
 |-- Value: double (nullable = true)
 |-- Flag Codes: string (nullable = true)
 |-- Flags: string (nullable = true)



In [6]:
# Keep only the columns used further down (country, fleet label, year, unit
# label and value); everything listed here is discarded.
fleets = fleets.drop(*[
    # Reference-period and flag metadata
    "Reference Period Code",
    "Reference Period",
    "Flag Codes",
    "Flags",
    # Power-code columns
    "PowerCode Code",
    "PowerCode",
    # Second year column, measure columns, and the fleet / unit code columns
    "YEAR6",
    "MEASURE4",
    "Measure5",
    "FLEET2",
    "Unit Code",
])

In [7]:
# Give the remaining source columns readable names.
# Each pair is (name in the CSV, name used from here on).
for old_name, new_name in (
    ("COUNTRY0", "Country Code"),
    ("Country1", "Country"),
    ("Fleet3", "Fleet Class"),
    ("Year7", "Year"),
):
    fleets = fleets.withColumnRenamed(old_name, new_name)

In [8]:
# Confirm the column set that remains after dropping and renaming.
fleets.printSchema()

root
 |-- Country Code: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Fleet Class: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Unit: string (nullable = true)
 |-- Value: double (nullable = true)



In [9]:
# List the distinct fleet classes in sorted order. Only the labels are
# available here: the fleet code column was dropped earlier.
fleet_types = (
    fleets
    .select("Fleet Class")
    .distinct()
    .orderBy("Fleet Class")
)
fleet_types.show()

+-------------+
|  Fleet Class|
+-------------+
|      0-5.9 m|
|    12-17.9 m|
|    18-23.9 m|
|    24-29.9 m|
|    30-35.9 m|
|    36-44.9 m|
|    45-59.9 m|
|     6-11.9 m|
|    60-74.9 m|
|75 m and over|
|  LOA unknown|
|Total Vessels|
+-------------+



In [10]:
# Show the distinct units of measurement. Only the unit labels are listed:
# the "Unit Code" column was dropped earlier.
fleets.select("Unit").distinct().show()

+------+
|  Unit|
+------+
|Number|
|Tonnes|
+------+



In [11]:
# Store the measurements as integers instead of doubles.
# NOTE(review): any fractional part is presumably discarded by this cast;
# fine for vessel counts, confirm it is acceptable for the tonnage values.
fleets = fleets.withColumn("Value", fleets["Value"].cast("int"))

In [12]:
# Turn each unit of measurement into its own column, so that there is one row
# per (country, year, fleet class) holding the value recorded for every unit.
fleets = (
    fleets
    .groupBy("Country Code", "Country", "Year", "Fleet Class")
    .pivot("Unit")
    .agg(first("Value"))
    .orderBy("Country", "Year")
)

In [13]:
# Check the schema after pivoting on the unit of measurement.
fleets.printSchema()

root
 |-- Country Code: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Fleet Class: string (nullable = true)
 |-- Number: integer (nullable = true)
 |-- Tonnes: integer (nullable = true)



In [14]:
# Collapse to one row per (country, year): every fleet class becomes a column
# whose value is a struct bundling that class's "Number" and "Tonnes".
fleets = (
    fleets
    .groupBy("Country Code", "Country", "Year")
    .pivot("Fleet Class")
    .agg(
        first(struct("Number", "Tonnes")).alias("Values")
    )
)

In [15]:
# Check the nested schema after pivoting on the fleet class.
fleets.printSchema()

root
 |-- Country Code: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- 0-5.9 m: struct (nullable = true)
 |    |-- Number: integer (nullable = true)
 |    |-- Tonnes: integer (nullable = true)
 |-- 12-17.9 m: struct (nullable = true)
 |    |-- Number: integer (nullable = true)
 |    |-- Tonnes: integer (nullable = true)
 |-- 18-23.9 m: struct (nullable = true)
 |    |-- Number: integer (nullable = true)
 |    |-- Tonnes: integer (nullable = true)
 |-- 24-29.9 m: struct (nullable = true)
 |    |-- Number: integer (nullable = true)
 |    |-- Tonnes: integer (nullable = true)
 |-- 30-35.9 m: struct (nullable = true)
 |    |-- Number: integer (nullable = true)
 |    |-- Tonnes: integer (nullable = true)
 |-- 36-44.9 m: struct (nullable = true)
 |    |-- Number: integer (nullable = true)
 |    |-- Tonnes: integer (nullable = true)
 |-- 45-59.9 m: struct (nullable = true)
 |    |-- Number: integer (nullable = true)
 |    |-- Tonnes: 

In [16]:
# Load the transformed data into MongoDB
import json
import pymongo
from dotenv import dotenv_values

from pyspark.sql.functions import to_json, from_json
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType

In [17]:
# Retrieve MongoDB creds
from urllib.parse import quote_plus

config = dotenv_values("creds.env")

# Fail with an explicit message when an entry is absent or empty, instead of
# a bare KeyError or a malformed connection string further down.
missing = [key for key in ("user", "password") if not config.get(key)]
if missing:
    raise KeyError(
        "creds.env is missing required entries: " + ", ".join(missing)
    )

user = config['user']
password = config['password']

# Connect to MongoDB
# Credentials embedded in a MongoDB URI must be percent-escaped; otherwise a
# user name or password containing characters such as '@', ':', '/' or '%'
# yields an invalid or mis-parsed URI.
# NOTE(review): assumes creds.env stores the raw (not already escaped) values.
uri = (
    "mongodb+srv://"
    + quote_plus(user)
    + ":"
    + quote_plus(password)
    + "@cluster0.6jfc5iw.mongodb.net/"
)
client = pymongo.MongoClient(uri)
db = client["gfw"]
collection = db["vessels"]

In [18]:
# Serialise every row to a JSON string and gather the strings into a local
# list.
# NOTE(review): collect() brings the whole result into this process;
# presumably fine for a dataset of this size.
vessels = fleets.toJSON().collect()

In [19]:
# Decode each JSON string into a plain dict, ready to be inserted as a document.
vessels_dict = list(map(json.loads, vessels))

In [20]:
# Bulk-insert the decoded documents into the "vessels" collection; the
# returned result object is shown as the cell output.
# NOTE(review): running this cell again issues a second insert of the same
# list; verify how the collection should handle repeated loads.
collection.insert_many(vessels_dict)

<pymongo.results.InsertManyResult at 0x28380560c40>

In [21]:
# Release the MongoDB connection now that the load is finished.
client.close()