# DataFrame Operations - Basics

In [1]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml.stat as ml_stat
import pyspark.sql.functions as func
import pyspark.sql.types as types
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
# Obtain the Spark session for this notebook (getOrCreate) and keep a handle on its SparkContext
spark = (
    SparkSession.builder
    .appName('Spark Test App')
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
# Load Auto.csv into a DataFrame: the first line is used as header and the column types are inferred
df_auto = spark.read.csv(
    'Auto.csv',
    header=True,
    inferSchema=True,
)
df_auto.show(5)

+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|18.0|        8|       307.0|       130|  3504|        12.0|  70|     1|chevrolet chevell...|
|15.0|        8|       350.0|       165|  3693|        11.5|  70|     1|   buick skylark 320|
|18.0|        8|       318.0|       150|  3436|        11.0|  70|     1|  plymouth satellite|
|16.0|        8|       304.0|       150|  3433|        12.0|  70|     1|       amc rebel sst|
|17.0|        8|       302.0|       140|  3449|        10.5|  70|     1|         ford torino|
+----+---------+------------+----------+------+------------+----+------+--------------------+
only showing top 5 rows



### Selecting

In [4]:
# Project the DataFrame onto a subset of its columns
(
    df_auto
    .select('mpg', 'name')
    .show(5)
)

+----+--------------------+
| mpg|                name|
+----+--------------------+
|18.0|chevrolet chevell...|
|15.0|   buick skylark 320|
|18.0|  plymouth satellite|
|16.0|       amc rebel sst|
|17.0|         ford torino|
+----+--------------------+
only showing top 5 rows



In [7]:
# A column can be designated as a Column object (attribute access) or by its name
(
    df_auto
    .select(df_auto.mpg, 'name')
    .show(5)
)

+----+--------------------+
| mpg|                name|
+----+--------------------+
|18.0|chevrolet chevell...|
|15.0|   buick skylark 320|
|18.0|  plymouth satellite|
|16.0|       amc rebel sst|
|17.0|         ford torino|
+----+--------------------+
only showing top 5 rows



In [21]:
# Select columns and rename one of them with a SQL-like expression
(
    df_auto
    .selectExpr('name AS carname', 'mpg')
    .show(5)
)

+--------------------+----+
|             carname| mpg|
+--------------------+----+
|chevrolet chevell...|18.0|
|   buick skylark 320|15.0|
|  plymouth satellite|18.0|
|       amc rebel sst|16.0|
|         ford torino|17.0|
+--------------------+----+
only showing top 5 rows



In [31]:
# Alternative for renaming: select first, then rename with withColumnRenamed
(
    df_auto
    .select('name', 'mpg')
    .withColumnRenamed('name', 'carname')
    .show(5)
)

+--------------------+----+
|             carname| mpg|
+--------------------+----+
|chevrolet chevell...|18.0|
|   buick skylark 320|15.0|
|  plymouth satellite|18.0|
|       amc rebel sst|16.0|
|         ford torino|17.0|
+--------------------+----+
only showing top 5 rows



In [28]:
# Computed column: keep every existing column ('*') and append a value derived from mpg
(
    df_auto
    .selectExpr('*', 'round(235/mpg,2) as litre_per_100km')
    .show(5)
)

+----+---------+------------+----------+------+------------+----+------+--------------------+---------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|litre_per_100km|
+----+---------+------------+----------+------+------------+----+------+--------------------+---------------+
|18.0|        8|       307.0|       130|  3504|        12.0|  70|     1|chevrolet chevell...|          13.06|
|15.0|        8|       350.0|       165|  3693|        11.5|  70|     1|   buick skylark 320|          15.67|
|18.0|        8|       318.0|       150|  3436|        11.0|  70|     1|  plymouth satellite|          13.06|
|16.0|        8|       304.0|       150|  3433|        12.0|  70|     1|       amc rebel sst|          14.69|
|17.0|        8|       302.0|       140|  3449|        10.5|  70|     1|         ford torino|          13.82|
+----+---------+------------+----------+------+------------+----+------+--------------------+---------------+
only showing top 5 rows

In [29]:
# Alternative: withColumn adds the computed column to the existing ones
(
    df_auto
    .withColumn('litre_per_100km', func.expr('round(235/mpg,2)'))
    .show(5)
)

+----+---------+------------+----------+------+------------+----+------+--------------------+---------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|litre_per_100km|
+----+---------+------------+----------+------+------------+----+------+--------------------+---------------+
|18.0|        8|       307.0|       130|  3504|        12.0|  70|     1|chevrolet chevell...|          13.06|
|15.0|        8|       350.0|       165|  3693|        11.5|  70|     1|   buick skylark 320|          15.67|
|18.0|        8|       318.0|       150|  3436|        11.0|  70|     1|  plymouth satellite|          13.06|
|16.0|        8|       304.0|       150|  3433|        12.0|  70|     1|       amc rebel sst|          14.69|
|17.0|        8|       302.0|       140|  3449|        10.5|  70|     1|         ford torino|          13.82|
+----+---------+------------+----------+------+------------+----+------+--------------------+---------------+
only showing top 5 rows

In [32]:
# Special column names in expressions (here: a name with spaces) -> backticks for escaping
(
    df_auto
    .selectExpr('name', 'round(235/mpg,2) as `Litres per 100km`')
    .show(5)
)

+--------------------+----------------+
|                name|Litres per 100km|
+--------------------+----------------+
|chevrolet chevell...|           13.06|
|   buick skylark 320|           15.67|
|  plymouth satellite|           13.06|
|       amc rebel sst|           14.69|
|         ford torino|           13.82|
+--------------------+----------------+
only showing top 5 rows



In [33]:
# Removing columns with drop; the remaining columns keep their order
(
    df_auto
    .drop('displacement', 'acceleration', 'origin')
    .show(5)
)

+----+---------+----------+------+----+--------------------+
| mpg|cylinders|horsepower|weight|year|                name|
+----+---------+----------+------+----+--------------------+
|18.0|        8|       130|  3504|  70|chevrolet chevell...|
|15.0|        8|       165|  3693|  70|   buick skylark 320|
|18.0|        8|       150|  3436|  70|  plymouth satellite|
|16.0|        8|       150|  3433|  70|       amc rebel sst|
|17.0|        8|       140|  3449|  70|         ford torino|
+----+---------+----------+------+----+--------------------+
only showing top 5 rows



### Type Casting

In [5]:
# Print the inferred schema (column names, types, nullability) of df_auto
df_auto.printSchema()

root
 |-- mpg: double (nullable = true)
 |-- cylinders: integer (nullable = true)
 |-- displacement: double (nullable = true)
 |-- horsepower: string (nullable = true)
 |-- weight: integer (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- origin: integer (nullable = true)
 |-- name: string (nullable = true)



In [6]:
# Replace the string-typed horsepower column with a float version, naming the target type as a string
(
    df_auto
    .withColumn('horsepower', df_auto.horsepower.cast('float'))
    .printSchema()
)

root
 |-- mpg: double (nullable = true)
 |-- cylinders: integer (nullable = true)
 |-- displacement: double (nullable = true)
 |-- horsepower: float (nullable = true)
 |-- weight: integer (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- origin: integer (nullable = true)
 |-- name: string (nullable = true)



In [16]:
# Cast with the target type given as a pyspark.sql.types object (here: DoubleType)
(
    df_auto
    .withColumn('horsepower', df_auto.horsepower.cast(types.DoubleType()))
    .printSchema()
)

root
 |-- mpg: double (nullable = true)
 |-- cylinders: integer (nullable = true)
 |-- displacement: double (nullable = true)
 |-- horsepower: double (nullable = true)
 |-- weight: integer (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- origin: integer (nullable = true)
 |-- name: string (nullable = true)



In [45]:
# Cast expressed as a SQL expression string
(
    df_auto
    .withColumn('horsepower', func.expr('cast(horsepower as float)'))
    .printSchema()
)

root
 |-- mpg: double (nullable = true)
 |-- cylinders: integer (nullable = true)
 |-- displacement: double (nullable = true)
 |-- horsepower: float (nullable = true)
 |-- weight: integer (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- origin: integer (nullable = true)
 |-- name: string (nullable = true)



### Filtering

In [52]:
# Simple filter: keep only rows matching a SQL-like condition string
(
    df_auto
    .where('mpg > 40')
    .show(5)
)

+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|43.1|        4|        90.0|        48|  1985|        21.5|  78|     2|volkswagen rabbit...|
|41.5|        4|        98.0|        76|  2144|        14.7|  80|     2|           vw rabbit|
|46.6|        4|        86.0|        65|  2110|        17.9|  80|     3|           mazda glc|
|40.8|        4|        85.0|        65|  2110|        19.2|  80|     3|          datsun 210|
|44.3|        4|        90.0|        48|  2085|        21.7|  80|     2|vw rabbit c (diesel)|
+----+---------+------------+----------+------+------------+----+------+--------------------+
only showing top 5 rows



In [75]:
# Same with "filter"
(
    df_auto
    .filter('mpg > 40')
    .show(5)
)

+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|43.1|        4|        90.0|        48|  1985|        21.5|  78|     2|volkswagen rabbit...|
|41.5|        4|        98.0|        76|  2144|        14.7|  80|     2|           vw rabbit|
|46.6|        4|        86.0|        65|  2110|        17.9|  80|     3|           mazda glc|
|40.8|        4|        85.0|        65|  2110|        19.2|  80|     3|          datsun 210|
|44.3|        4|        90.0|        48|  2085|        21.7|  80|     2|vw rabbit c (diesel)|
+----+---------+------------+----------+------+------------+----+------+--------------------+
only showing top 5 rows



In [57]:
# Multiple filters: chain several where() calls instead of joining conditions with "AND".
# horsepower was inferred as a string column (see printSchema), so it is cast explicitly
# before the numeric comparison instead of relying on implicit string-to-number coercion.
(
    df_auto
    .where('mpg > 30')
    .where('cast(horsepower as double) > 90')
    .show(5)
)

+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|37.0|        4|       119.0|        92|  2434|        15.0|  80|     3|datsun 510 hatchback|
|32.7|        6|       168.0|       132|  2910|        11.4|  80|     3|       datsun 280-zx|
|32.9|        4|       119.0|       100|  2615|        14.8|  81|     3|        datsun 200sx|
|32.0|        4|       144.0|        96|  2665|        13.9|  82|     3|    toyota celica gt|
+----+---------+------------+----------+------+------------+----+------+--------------------+



In [14]:
# Inside an expression string, equality is written SQL-style with a single =
(
    df_auto
    .where("cylinders=4")
    .show(5)
)

+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|24.0|        4|       113.0|        95|  2372|        15.0|  70|     3|toyota corona mar...|
|27.0|        4|        97.0|        88|  2130|        14.5|  70|     3|        datsun pl510|
|26.0|        4|        97.0|        46|  1835|        20.5|  70|     2|volkswagen 1131 d...|
|25.0|        4|       110.0|        87|  2672|        17.5|  70|     2|         peugeot 504|
|24.0|        4|       107.0|        90|  2430|        14.5|  70|     2|         audi 100 ls|
+----+---------+------------+----------+------+------------+----+------+--------------------+
only showing top 5 rows



In [15]:
# Outside an expression string (on a Column object), equality is written with Python's ==
(
    df_auto
    .where(df_auto.cylinders == 4)
    .show(5)
)

+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|24.0|        4|       113.0|        95|  2372|        15.0|  70|     3|toyota corona mar...|
|27.0|        4|        97.0|        88|  2130|        14.5|  70|     3|        datsun pl510|
|26.0|        4|        97.0|        46|  1835|        20.5|  70|     2|volkswagen 1131 d...|
|25.0|        4|       110.0|        87|  2672|        17.5|  70|     2|         peugeot 504|
|24.0|        4|       107.0|        90|  2430|        14.5|  70|     2|         audi 100 ls|
+----+---------+------------+----------+------+------------+----+------+--------------------+
only showing top 5 rows



### Unique Rows

In [60]:
# Distinct values of a single column
(
    df_auto
    .select('name')
    .distinct()
    .show(5)
)

+--------------------+
|                name|
+--------------------+
|         audi 100 ls|
|pontiac sunbird c...|
|       dodge rampage|
|    chevrolet malibu|
|chevrolet monte c...|
+--------------------+
only showing top 5 rows



In [61]:
# Distinct combinations of two columns
(
    df_auto
    .select('name', 'year')
    .distinct()
    .show(5)
)

+--------------------+----+
|                name|year|
+--------------------+----+
|plymouth satellit...|  71|
|       pontiac astro|  75|
|        ford granada|  77|
|    mercury capri v6|  73|
|          datsun 710|  75|
+--------------------+----+
only showing top 5 rows



In [65]:
# Count the distinct (name, year) combinations
(
    df_auto
    .select('name', 'year')
    .distinct()
    .count()
)

395

### Random Sampling and Splitting

In [68]:
# Random sampling from the DataFrame
seed = 10                # fixed seed so the draw is reproducible
withReplacement = False  # explicit boolean instead of None: sample without replacement
fraction = 0.1           # fraction of rows to sample
df_auto.sample(withReplacement=withReplacement, fraction=fraction, seed=seed).show(5)

+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|16.0|        8|       304.0|       150|  3433|        12.0|  70|     1|       amc rebel sst|
|21.0|        6|       200.0|        85|  2587|        16.0|  70|     1|       ford maverick|
|18.0|        6|       232.0|       100|  3288|        15.5|  71|     1|         amc matador|
|35.0|        4|        72.0|        69|  1613|        18.0|  71|     3|         datsun 1200|
|12.0|        8|       350.0|       160|  4456|        13.5|  72|     1|oldsmobile delta ...|
+----+---------+------------+----------+------+------------+----+------+--------------------+
only showing top 5 rows



In [70]:
# Random split into two DataFrames with weights 0.8 / 0.2, using the same seed as the sampling above
df_train, df_test = df_auto.randomSplit([0.8, 0.2], seed=seed)

In [71]:
# Number of rows that ended up in the training split
df_train.count()

316

In [72]:
# Number of rows that ended up in the test split
df_test.count()

81

### Union

In [73]:
# DataFrames are immutable, so rows cannot be appended in place;
# union returns a new DataFrame holding the rows of both inputs
df_new = df_train.union(df_test)

In [74]:
# Row count of the union of the training and test splits
df_new.count()

397

### Sorting

In [77]:
# Sorting with "sort" or "orderBy" (no difference); a plain column name sorts ascending
(
    df_auto
    .selectExpr('name as carname', 'round(235/mpg,2) as litre_per_100km')
    .sort('litre_per_100km')
    .show(5)
)

+--------------------+---------------+
|             carname|litre_per_100km|
+--------------------+---------------+
|           mazda glc|           5.04|
| honda civic 1500 gl|           5.27|
|vw rabbit c (diesel)|            5.3|
|           vw pickup|           5.34|
|  vw dasher (diesel)|           5.41|
+--------------------+---------------+
only showing top 5 rows



In [79]:
# Sort descending by wrapping the column name in func.desc
(
    df_auto
    .selectExpr('name as carname', 'round(235/mpg,2) as litre_per_100km')
    .sort(func.desc('litre_per_100km'))
    .show(5)
)

+---------------+---------------+
|        carname|litre_per_100km|
+---------------+---------------+
|       hi 1200d|          26.11|
|      ford f250|           23.5|
|      chevy c20|           23.5|
|     dodge d200|          21.36|
|mercury marquis|          21.36|
+---------------+---------------+
only showing top 5 rows



### Actions

In [86]:
# show prints the first n rows as a formatted text table (here n = 2)
df_auto.show(2)

+----+---------+------------+----------+------+------------+----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+----+---------+------------+----------+------+------------+----+------+--------------------+
|18.0|        8|       307.0|       130|  3504|        12.0|  70|     1|chevrolet chevell...|
|15.0|        8|       350.0|       165|  3693|        11.5|  70|     1|   buick skylark 320|
+----+---------+------------+----------+------+------------+----+------+--------------------+
only showing top 2 rows



In [87]:
# take returns the first n rows as a Python list of Spark Row objects (here n = 2)
df_auto.take(2)

[Row(mpg=18.0, cylinders=8, displacement=307.0, horsepower='130', weight=3504, acceleration=12.0, year=70, origin=1, name='chevrolet chevelle malibu'),
 Row(mpg=15.0, cylinders=8, displacement=350.0, horsepower='165', weight=3693, acceleration=11.5, year=70, origin=1, name='buick skylark 320')]

In [88]:
# Get mpg from the second row: index 1 of the list returned by take, then the Row's mpg field
df_auto.take(2)[1].mpg

15.0

In [90]:
# Get all data with collect
# df_auto.collect()