In [1]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.types import *

import initspark

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Glacier")
    .getOrCreate()
)

# Load the glacier CSV, treating the first line as column names.
# No schema is supplied, so every column comes back as a string
# (see the printSchema() output further down). The frame is cached
# because several later cells read from it.
csv = (
    spark.read
    .option("header", "true")
    .csv("./data/glacier/database.csv")
    .cache()
)

In [18]:
# Print the column names and types of the raw CSV load.
csv.printSchema()

root
 |-- Glacier ID: string (nullable = true)
 |-- Political Unit: string (nullable = true)
 |-- Continent: string (nullable = true)
 |-- Basin Code: string (nullable = true)
 |-- Location Code: string (nullable = true)
 |-- Glacier Code: string (nullable = true)
 |-- Glacier Name: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Primary Class: string (nullable = true)
 |-- Glacier Source: string (nullable = true)
 |-- Basin Count: string (nullable = true)
 |-- Glacier Form: string (nullable = true)
 |-- Glacier Activity: string (nullable = true)
 |-- Activity Start: string (nullable = true)
 |-- Activity End: string (nullable = true)
 |-- Minimum Elevation: string (nullable = true)
 |-- Minimum Elevation Exposed: string (nullable = true)
 |-- Mean Elevation: string (nullable = true)
 |-- Mean Elevation Accumulation: string (nullable = true)
 |-- Mean Elevation Ablation: string (nullable = true)
 |-- Maximum Elevation: stri

In [3]:
# Narrow the raw frame to the six columns used in the rest of the notebook:
# country, coordinates, glacier source and the two elevation measures.
csvData = csv.select(
    "Political Unit",
    "Latitude",
    "Longitude",
    "Glacier Source",
    "Minimum Elevation",
    "Mean Elevation",
)

In [4]:
# Confirm the projection kept exactly the selected columns (all still strings).
csvData.printSchema()

root
 |-- Political Unit: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Glacier Source: string (nullable = true)
 |-- Minimum Elevation: string (nullable = true)
 |-- Mean Elevation: string (nullable = true)



In [10]:
# Build the typed analysis frame in one projection:
#   * give every column a short, code-friendly name
#   * cast the coordinate and elevation columns from string to double
#     (the per-column counts from describe() suggest missing values end up
#     as null -- NOTE(review): confirm how unparseable values are handled)
#
# A single select with explicit aliases is used instead of a
# withColumn/withColumnRenamed/drop chain because:
#   * withColumn("latitude", ...) only *replaces* "Latitude" while Spark's
#     column resolution is case-insensitive (the default); with
#     spark.sql.caseSensitive=true it would add a second column and leave
#     the string original behind.
#   * every column reference now comes from csvData itself rather than a mix
#     of csvData and its parent frame `csv`.
# Output columns and their order are unchanged:
# country, latitude, longitude, source, minElevation, meanElevation.
data = csvData.select(
    csvData["Political Unit"].alias("country"),
    csvData["Latitude"].cast(DoubleType()).alias("latitude"),
    csvData["Longitude"].cast(DoubleType()).alias("longitude"),
    csvData["Glacier Source"].alias("source"),
    csvData["Minimum Elevation"].cast(DoubleType()).alias("minElevation"),
    csvData["Mean Elevation"].cast(DoubleType()).alias("meanElevation"),
)

In [19]:
# Preview the first rows of the renamed, typed frame.
data.show()

+-----------+--------+---------+------+------------+-------------+
|    country|latitude|longitude|source|minElevation|meanElevation|
+-----------+--------+---------+------+------------+-------------+
|AFGHANISTAN|  34.672|   68.874|     0|      3975.0|       4110.0|
|AFGHANISTAN|  34.676|   68.855|     0|      4250.0|       4350.0|
|AFGHANISTAN|  34.689|   68.854|     0|      4000.0|       4100.0|
|AFGHANISTAN|  34.707|   68.857|     0|      4000.0|       4175.0|
|AFGHANISTAN|  34.719|   68.852|     0|      3750.0|       4050.0|
|AFGHANISTAN|  34.806|   68.868|     0|      3550.0|       3900.0|
|AFGHANISTAN|  34.818|   68.872|     0|      3700.0|       3925.0|
|AFGHANISTAN|  34.802|   68.886|     0|      3850.0|       4225.0|
|AFGHANISTAN|  34.807|   68.891|     0|      3800.0|       3925.0|
|AFGHANISTAN|  35.175|   68.757|     0|      3825.0|       4040.0|
|AFGHANISTAN|  35.204|   68.783|     0|      4200.0|       4325.0|
|AFGHANISTAN|  35.218|   68.805|     0|      3900.0|       412

In [29]:
# Summary statistics (count, mean, stddev, min, max) for the numeric columns.
# The differing counts show how many rows have a non-null value per column.
data.describe(
    "latitude",
    "longitude",
    "minElevation",
    "meanElevation",
).show()

+-------+------------------+------------------+------------------+-----------------+
|summary|          latitude|         longitude|      minElevation|    meanElevation|
+-------+------------------+------------------+------------------+-----------------+
|  count|            132890|            132890|            117162|            81599|
|   mean| 36.32210127022388|31.902295625705534| 3502.362062784862|4490.197281829434|
| stddev|27.821062012108655| 83.86064331896505|1735.3178418669727|1347.229569562204|
|    min|          -71.6572|          -179.918|               0.0|              0.0|
|    max|              89.3|            179.68|            8047.0|           8340.0|
+-------+------------------+------------------+------------------+-----------------+



In [40]:
# Spread of minimum elevation per country.
# "stddev" is Spark's sample standard deviation; the NaN row in the output is
# presumably a country with a single non-null minElevation -- TODO confirm.
(
    data
    .groupBy("country")
    .agg({"minElevation": "stddev"})
    .show()
)

+--------------------+--------------------+
|             country|stddev(minElevation)|
+--------------------+--------------------+
|        SOUTH AFRICA|                 NaN|
|              UGANDA|   109.8211655638252|
|            TANZANIA|  311.22233316287037|
|FRENCH SOUTHERN T...|  217.49752453781315|
|              BHUTAN|   305.9927185333915|
|              CANADA|   543.5890055095941|
|            COLOMBIA|  153.90779727776342|
|              MEXICO|  130.26121521666886|
|            PAKISTAN|  239.42756344257916|
|              FRANCE|  261.17239583609324|
|       SOUTH GEORGIA|   86.91199331093948|
|               CHINA|   761.8493260800309|
|             AUSTRIA|  201.35649450652937|
|               NEPAL|    507.969148524235|
|         SWITZERLAND|   313.0840718225019|
|           ARGENTINA|   648.0816129005656|
|           GREENLAND|   338.2707623188701|
|              NORWAY|   563.6999600825588|
|             ECUADOR|   162.7865038252052|
|              SWEDEN|  146.9083