# More on Spark DataFrames

In [9]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml.stat as spark_stat
import pyspark.sql.functions as spark_f
import pyspark.sql.types as spark_types
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [10]:
spark = SparkSession.builder.appName('Spark Test App').getOrCreate() 
sc = spark.sparkContext

Definitionally, a **DataFrame** consists of a series of **records** (like rows in a table), that are of **type Row**, and a number of columns (like columns in a spreadsheet) that represent an computation expression that can performed on each individual record in the dataset. The **schema** defines the name as well as the type of data in each column. The partitioning of the DataFrame defines the layout of the DataFrame or Dataset’s physical distribution across the cluster. The partitioning scheme defines how that is broken up, this can be set to be based on values in a certain column or non-deterministically.

### Schemas

In [11]:
# From a csv file with inferred schema
df_auto = spark.read.csv('../Stat_Learning/data/Auto.csv', header=True, inferSchema=True)
df_auto.show(5)

+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|18.0|        8|       307.0|       130|  3504|        12.0|  70|     1|chevrolet chevell...|
|15.0|        8|       350.0|       165|  3693|        11.5|  70|     1|   buick skylark 320|
|18.0|        8|       318.0|       150|  3436|        11.0|  70|     1|  plymouth satellite|
|16.0|        8|       304.0|       150|  3433|        12.0|  70|     1|       amc rebel sst|
|17.0|        8|       302.0|       140|  3449|        10.5|  70|     1|         ford torino|
+----+---------+------------+----------+------+------------+----+------+--------------------+
only showing top 5 rows



In [12]:
df_auto.schema

StructType(List(StructField(mpg,DoubleType,true),StructField(cylinders,IntegerType,true),StructField(displacement,DoubleType,true),StructField(horsepower,StringType,true),StructField(weight,IntegerType,true),StructField(acceleration,DoubleType,true),StructField(year,IntegerType,true),StructField(origin,IntegerType,true),StructField(name,StringType,true)))

A schema is a **StructType** made up of a number of fields, **StructFields**, that have a name, type, and a boolean flag which specifies whether or not that column can contain missing or null values.

In [13]:
df_auto.printSchema()

root
 |-- mpg: double (nullable = true)
 |-- cylinders: integer (nullable = true)
 |-- displacement: double (nullable = true)
 |-- horsepower: string (nullable = true)
 |-- weight: integer (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- origin: integer (nullable = true)
 |-- name: string (nullable = true)



In [14]:
# Define a manual schema
my_schema = spark_types.StructType([
    spark_types.StructField('mpg', spark_types.FloatType(), True),
    spark_types.StructField('cylinders', spark_types.ByteType(), True),
    spark_types.StructField('displacement', spark_types.FloatType(), True),
    spark_types.StructField('horsepower', spark_types.FloatType(), True),
    spark_types.StructField('weight', spark_types.FloatType(), True),
    spark_types.StructField('acceleration', spark_types.FloatType(), True),
    spark_types.StructField('year', spark_types.ByteType(), True),
    spark_types.StructField('origin', spark_types.ByteType(), True),
    spark_types.StructField('name', spark_types.StringType(), True),
])

In [15]:
# Read data with manual schema
df_auto = spark.read.csv('../Stat_Learning/data/Auto.csv', header=True, schema=my_schema)
df_auto.show(5)

+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|18.0|        8|       307.0|     130.0|3504.0|        12.0|  70|     1|chevrolet chevell...|
|15.0|        8|       350.0|     165.0|3693.0|        11.5|  70|     1|   buick skylark 320|
|18.0|        8|       318.0|     150.0|3436.0|        11.0|  70|     1|  plymouth satellite|
|16.0|        8|       304.0|     150.0|3433.0|        12.0|  70|     1|       amc rebel sst|
|17.0|        8|       302.0|     140.0|3449.0|        10.5|  70|     1|         ford torino|
+----+---------+------------+----------+------+------------+----+------+--------------------+
only showing top 5 rows



In [16]:
df_auto.printSchema()

root
 |-- mpg: float (nullable = true)
 |-- cylinders: byte (nullable = true)
 |-- displacement: float (nullable = true)
 |-- horsepower: float (nullable = true)
 |-- weight: float (nullable = true)
 |-- acceleration: float (nullable = true)
 |-- year: byte (nullable = true)
 |-- origin: byte (nullable = true)
 |-- name: string (nullable = true)



### Aggregations

In [17]:
df_auto.createOrReplaceTempView('df_auto')

In [20]:
# Standard Aggregation
spark.sql('''
    select year,
        mean(mpg),
        mean(horsepower)
    from df_auto
    group by year
    order by year
''').show()

+----+------------------+------------------+
|year|          avg(mpg)|   avg(horsepower)|
+----+------------------+------------------+
|null|              null|              null|
|  70|17.689655172413794|147.82758620689654|
|  71| 21.11111111111111|107.03703703703704|
|  72|18.714285714285715|120.17857142857143|
|  73|              17.1|           130.475|
|  74| 22.76923076923077| 94.23076923076923|
|  75|20.266666666666666|101.06666666666666|
|  76|21.573529411764707|101.11764705882354|
|  77|            23.375|105.07142857142857|
|  78|24.061111132303875| 99.69444444444444|
|  79|25.093103343042834|101.20689655172414|
|  80|33.803703520033096| 77.48148148148148|
|  81| 30.18571444920131| 81.03571428571429|
|  82|              32.0| 81.46666666666667|
+----+------------------+------------------+



In [48]:
# Window Function: Deviation from year Average
spark.sql('''
    select year, 
        name,
        mpg/mean(mpg) over(partition by year order by year) as mpg_relative
    from df_auto
''').show()

+----+--------------------+------------------+
|year|                name|      mpg_relative|
+----+--------------------+------------------+
|  78|volkswagen rabbit...|1.7912721585104214|
|  78|         ford fiesta|1.5003462755988062|
|  78|    mazda glc deluxe|1.3631955339345925|
|  78|      datsun b210 gx|1.6374971758050443|
|  78|    honda civic cvcc|1.5003462755988062|
|  78|oldsmobile cutlas...|0.8270607084231039|
|  78|      dodge diplomat|0.8062802882151313|
|  78|mercury monarch ghia|0.8395290081104947|
|  78|  pontiac phoenix lj|0.7979681676945496|
|  78|    chevrolet malibu|0.8519972285268733|
|  78|ford fairmont (auto)|0.8395290081104947|
|  78| ford fairmont (man)| 1.043177110294423|
|  78|     plymouth volare|0.8519972285268733|
|  78|         amc concord|0.8062802882151313|
|  78|buick century spe...|0.8561533284226702|
|  78|      mercury zephyr| 0.864465448943252|
|  78|         dodge aspen|0.7730316475907801|
|  78|     amc concord d/l|0.7522512273828077|
|  78|chevrol

In [50]:
# Window Function: Ranking within years
spark.sql('''
    select year, 
        name,
        mpg,
        rank(mpg) over(partition by year 
                        order by mpg desc
                        rows between unbounded preceding and current row) as mpg_rank
    from df_auto
''').show()

+----+--------------------+----+--------+
|year|                name| mpg|mpg_rank|
+----+--------------------+----+--------+
|  78|volkswagen rabbit...|43.1|       1|
|  78|      datsun b210 gx|39.4|       2|
|  78|         ford fiesta|36.1|       3|
|  78|    honda civic cvcc|36.1|       3|
|  78|    mazda glc deluxe|32.8|       5|
|  78| volkswagen scirocco|31.5|       6|
|  78|          dodge omni|30.9|       7|
|  78|  chevrolet chevette|30.0|       8|
|  78|     honda accord lx|29.5|       9|
|  78|       toyota corona|27.5|      10|
|  78|          datsun 510|27.2|      11|
|  78| ford fairmont (man)|25.1|      12|
|  78|       datsun 200-sx|23.9|      13|
|  78|oldsmobile starfi...|23.8|      14|
|  78|    plymouth sapporo|23.2|      15|
|  78|          saab 99gle|21.6|      16|
|  78|toyota celica gt ...|21.1|      17|
|  78|      mercury zephyr|20.8|      18|
|  78|buick century spe...|20.6|      19|
|  78|    chevrolet malibu|20.5|      20|
+----+--------------------+----+--

### More on Spark SQL

In [54]:
# Show current tables
spark.sql('SHOW TABLES').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |  df_auto|       true|
+--------+---------+-----------+



In [55]:
# Show table metadata
spark.sql('DESCRIBE TABLE df_auto').show()

+------------+---------+-------+
|    col_name|data_type|comment|
+------------+---------+-------+
|         mpg|    float|   null|
|   cylinders|  tinyint|   null|
|displacement|    float|   null|
|  horsepower|    float|   null|
|      weight|    float|   null|
|acceleration|    float|   null|
|        year|  tinyint|   null|
|      origin|  tinyint|   null|
|        name|   string|   null|
+------------+---------+-------+



In [72]:
# Show available functions
spark.sql('SHOW FUNCTIONS LIKE "b*"').show(100)

+----------+
|  function|
+----------+
|    base64|
|    bigint|
|       bin|
|    binary|
|bit_length|
|   boolean|
|    bround|
+----------+

