In [1]:
import pandas as pd
import numpy as np
import pydataset
import pyspark

# Reuse the active SparkSession if there is one, otherwise create it.
spark = pyspark.sql.SparkSession.builder.getOrCreate()
# Bare last expression: show the session object as the cell output.
spark

In [2]:
# Seed numpy's global RNG so the random group labels are reproducible.
np.random.seed(456)

# Twenty rows: a running integer `n` and a random label drawn from a/b/c.
row_numbers = np.arange(20)
group_labels = np.random.choice(["a", "b", "c"], 20)
pandas_dataframe = pd.DataFrame({"n": row_numbers, "group": group_labels})
pandas_dataframe

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [3]:
# Build a Spark DataFrame from the local pandas frame.
df = spark.createDataFrame(pandas_dataframe)

In [4]:
# Print the DataFrame's rows as a text table.
df.show()

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
|  5|    c|
|  6|    a|
|  7|    b|
|  8|    a|
|  9|    b|
| 10|    b|
| 11|    a|
| 12|    b|
| 13|    a|
| 14|    b|
| 15|    b|
| 16|    c|
| 17|    c|
| 18|    a|
| 19|    c|
+---+-----+



In [5]:
# Indexing with a list of column names gives back a DataFrame with those columns.
df[['group', 'n']]

DataFrame[group: string, n: bigint]

In [6]:
# Indexing with a single name gives back a Column object, not data.
df['group']

Column<b'group'>

In [7]:
# Select using a Column object (attribute access) and print the result.
df.select(df.group).show()

+-----+
|group|
+-----+
|    b|
|    b|
|    c|
|    a|
|    c|
|    c|
|    a|
|    b|
|    a|
|    b|
|    b|
|    a|
|    b|
|    a|
|    b|
|    b|
|    c|
|    c|
|    a|
|    c|
+-----+



In [8]:
# Arithmetic on a Column produces another Column expression.
n_incremented = df.n + 1

In [9]:
# Display the Column's repr; the expression is named after its formula.
n_incremented

Column<b'(n + 1)'>

In [10]:
# Selecting the Column expression is what actually produces values.
df.select(n_incremented).show()

+-------+
|(n + 1)|
+-------+
|      1|
|      2|
|      3|
|      4|
|      5|
|      6|
|      7|
|      8|
|      9|
|     10|
|     11|
|     12|
|     13|
|     14|
|     15|
|     16|
|     17|
|     18|
|     19|
|     20|
+-------+



In [11]:
# Summary statistics per column; the result is itself a DataFrame with a
# `summary` column naming each statistic.
df.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [12]:
# Reading a path that does not exist raises FileNotFoundError. Catch it and
# print the message so the failure is shown without stopping a Run All.
try:
    pd.read_csv('alksdhbfljashbdfjhasd')
except FileNotFoundError as missing_file_error:
    read_csv_error_message = f'{type(missing_file_error).__name__}: {missing_file_error}'
    print(read_csv_error_message)

FileNotFoundError: [Errno 2] File b'alksdhbfljashbdfjhasd' does not exist: b'alksdhbfljashbdfjhasd'

In [None]:
# Print only the first five rows.
df.show(5)

In [None]:
# describe() first, then select: `summary` exists because it is a column of
# the describe() result.
df.describe().select('n', 'summary').show()

In [None]:
# `summary` is a column of the describe() result, not of `df` itself, so
# selecting it before describe() cannot be resolved. Catch the error and print
# its message so the cell demonstrates the pitfall without aborting a full run.
from pyspark.sql.utils import AnalysisException

try:
    df.select('n', 'summary').describe().show()
except AnalysisException as unresolved_column_error:
    print(f'AnalysisException: {unresolved_column_error}')

In [None]:
# Load the `mpg` dataset through pydataset and convert it to a Spark DataFrame.
mpg = spark.createDataFrame(pydataset.data('mpg'))
mpg.show()

In [None]:
# Attribute access on the DataFrame, displayed as the cell output.
mpg.hwy

In [None]:
# select() accepts a mix of Column objects and plain column-name strings;
# alias() renames hwy in the output.
mpg.select(mpg.model, 'manufacturer', mpg.hwy.alias('highway_mileage')).show()

In [None]:
# Preview the first five rows of mpg.
mpg.show(5)

In [None]:
# Column expression: the average of city and highway mileage, named avg_mileage.
avg_mileage_column = ((mpg.cty + mpg.hwy) / 2).alias('avg_mileage')
# Display the expression object itself.
avg_mileage_column

In [None]:
# '*' keeps every existing column; the new expression is appended after them.
mpg.select('*', avg_mileage_column).show()

In [None]:
# Print the column names and types of mpg.
mpg.printSchema()

In [None]:
from pyspark.sql.functions import col

# Display the imported `col` object itself.
col

In [None]:
# col() is called with just a name here; no DataFrame is involved.
col('abc')

In [None]:
# Arithmetic on a col() reference, displayed as the cell output.
col('hwy') * 2

In [None]:
# show() only prints and returns None, so keep the DataFrame in the variable
# and call show() on it separately; otherwise the later select() fails on None.
just_hwy_and_cty = mpg.select('hwy', 'cty')
just_hwy_and_cty.show(5)

just_hwy_and_cty.select('hwy').show()

In [None]:
# Rename a column in the output using col(...).alias(...).
mpg.select(col('hwy').alias('highway_mileage')).show(5)

In [None]:
from pyspark.sql.functions import expr

In [None]:
# Rename hwy in the output with a SQL snippet passed to expr().
mpg.select(expr('hwy AS highway_mileage')).show(5)

In [None]:
# A computed column written entirely as a SQL expression string.
mpg.select(expr('(hwy + cty) / 2 AS average_mileage')).show(5)

In [None]:
# An aggregate expression inside select(), with no grouping specified.
mpg.select(expr('AVG(hwy)')).show()

In [None]:
# Register the DataFrame under the name `mpg` so spark.sql() queries can use it.
mpg.createOrReplaceTempView('mpg')

In [None]:
# Keep the query text in its own variable so it can be inspected or reused.
compact_mileage_query = '''
SELECT
    cty AS city,
    hwy AS highway,
    (cty + hwy) / 2 AS avg_mileage
FROM mpg
WHERE class = "compact"
'''
# Run the query against the `mpg` temp view and keep the resulting DataFrame.
mpg2 = spark.sql(compact_mileage_query)

In [None]:
# Display the query result object as the cell output.
mpg2

In [None]:
# The SQL result supports the same DataFrame methods; the aliases from the
# query (city, highway) are its column names.
mpg2.select('city', 'highway')

In [None]:
# Column names paired with their types.
mpg.dtypes

In [None]:
# Print the schema of mpg.
mpg.printSchema()

In [None]:
# Preview the first five rows before the casting examples.
mpg.show(5)

In [None]:
# Keep every column and add cyl converted to a string, named cyl_string.
mpg.select('*', mpg.cyl.cast('string').alias('cyl_string')).show(4)

In [None]:
# The same cast written as a SQL query against the `mpg` view.
spark.sql('SELECT *, CAST(cyl AS STRING) AS cyl_string FROM mpg').show(4)

In [None]:
# Cast a text column to double.
# NOTE(review): presumably yields null for non-numeric strings under default
# (non-ANSI) settings — confirm on your Spark version.
mpg.select(mpg.manufacturer.cast('double')).show()

In [None]:
# Python's built-in min on a plain list, before any Spark function named
# `min` has been imported.
min([1, 2, 3])

In [None]:
from pyspark.sql.functions import min, max

In [None]:
# Same call as before the import, but `min` now refers to
# pyspark.sql.functions.min rather than the built-in.
# NOTE(review): presumably fails on a plain list — confirm.
min([1, 2, 3])

In [None]:
# Spark's min/max accept either a Column object or a column-name string.
mpg.select(min(mpg.cyl).alias('min_cyl'), max('hwy')).show()

In [None]:
# The same two aggregates written as a SQL query.
spark.sql('SELECT min(cyl), max(hwy) FROM mpg').show()

In [None]:
# Sample street addresses used for the regular-expression examples.
sample_addresses = [
    "600 Navarro St ste 600, San Antonio, TX 78205",
    "3130 Broadway St, San Antonio, TX 78209",
    "303 Pearl Pkwy, San Antonio, TX 78215",
    "1255 SW Loop 410, San Antonio, TX 78227",
]
address_frame = pd.DataFrame({"address": sample_addresses})
textdf = spark.createDataFrame(address_frame)

# truncate=False prints each address in full.
textdf.show(truncate=False)

In [None]:
# Goals for the address column:
# - extract the house/unit number
# - extract the street

In [None]:
from pyspark.sql.functions import regexp_extract, regexp_replace

In [None]:
# Show the function's documentation. help() is plain Python, so the cell also
# works outside IPython (the trailing-`?` form is IPython-only syntax).
help(regexp_extract)

In [None]:
# Build the expression on its own: capture group 1 of a leading run of digits
# in `address`.
regexp_extract('address', r'^(\d+)', 1)

In [None]:
# Split each address into its parts with regular expressions:
#   unit_no    - the leading run of digits
#   street     - the text between the leading number and the first comma
#   city_state - what is left after stripping the "number street, " prefix
#                and the trailing ZIP (optionally followed by "-digits")
#   zip        - the trailing digits, optionally followed by "-digits"
# Every derived column is aliased so the output has readable names.
textdf.select(
    'address',
    regexp_extract('address', r'^(\d+)', 1).alias('unit_no'),
    regexp_extract('address', r'^(\d+)\s+(.*?),', 2).alias('street'),
    regexp_replace(
        regexp_replace('address', r'^\d+\s+.*?,\s+', ''),
        r'\s+\d+(-\d+)?$',
        ''
    ).alias('city_state'),
    regexp_extract('address', r'(\d+(-\d+)?)$', 1).alias('zip'),
).show(truncate=True, vertical=True)

In [None]:
import pyspark

# type `pyspark.sql.functions.` and hit tab

In [None]:
from pydataset import data

# Rebuild `mpg` from pydataset so the cells below start from a fresh frame.
mpg = spark.createDataFrame(data('mpg'))
mpg.show(5)

In [None]:
# A comparison on a Column, stored in a variable so it can be reused as a filter.
four_cylinder_filter = mpg.cyl == 4

In [None]:
# Apply the stored condition and print seven of the matching rows.
mpg.filter(four_cylinder_filter).show(7)

In [None]:
from pyspark.sql.functions import col

In [None]:
# where() followed by filter(): rows must satisfy both conditions.
mpg.where(col('class') == 'minivan').filter(four_cylinder_filter).show(5)

In [None]:
# Combine two conditions with `|` (or); the comparison is parenthesised
# because `|` binds tighter than `==` in Python.
mpg.where((col('class') == 'minivan') | four_cylinder_filter).show()

In [None]:
# Re-register the view so SQL queries see the rebuilt `mpg` frame.
mpg.createOrReplaceTempView('mpg')

In [None]:
# List the distinct values of `class` using SQL.
spark.sql('''
SELECT DISTINCT class
FROM mpg
''').show()

In [None]:
from pyspark.sql.functions import when

In [None]:
# Label each row by whether cty is at least 20.
mileage_label = when(col('cty') >= 20, 'good gas mileage').otherwise('bad gas mileage')
mpg.select('cty', mileage_label).show()

In [None]:
from pyspark.sql.functions import expr

In [None]:
# Bucket displ into three sizes. The first condition is written as a SQL
# expression string and the second as a Column comparison.
engine_size = (
    when(expr('displ < 2'), "small")
    .when(mpg.displ < 3, "medium")
    .otherwise("large")
)
mpg.select(mpg.displ, engine_size.alias("engine_size")).show()

In [None]:
from pyspark.sql.functions import desc, asc

In [None]:
# Sorting by hwy: ascending and descending, via Column methods and via functions.
# Alternative spelling of the same sort: mpg.sort('hwy').show()
mpg.orderBy('hwy').show(5)

In [None]:
# Sort direction given explicitly with the asc() function.
mpg.orderBy(asc('hwy')).show(5)

In [None]:
# Build a sort expression from a Column method and display it; nothing is
# sorted in this cell.
mpg.hwy.desc_nulls_first()

In [None]:
# Sort direction given with the Column's desc() method.
mpg.sort(mpg.hwy.desc()).show()

In [None]:
# Sort on three keys, mixing the desc() function, a Column method on
# mpg.cyl, and a method on a col() reference.
mpg.sort(desc("class"), mpg.cyl.asc(), col("hwy").desc()).show()

In [None]:
# Aggregation topics: group by cyl and class; aggregate hwy and cty; rollup; pivot

In [None]:
# Group by cyl and call mean() with no column arguments.
mpg.groupBy('cyl').mean().show()

In [None]:
# Same aggregate, grouped by the combination of cyl and class.
mpg.groupBy('cyl', 'class').mean().show()

In [None]:
from pyspark.sql.functions import mean, max, min, count

In [None]:
# Number of rows per class, using the grouped object's count() method.
mpg.groupBy('class').count().show()

In [None]:
# The same per-class row count written as SQL.
spark.sql('SELECT class, COUNT(*) FROM mpg GROUP BY class').show()

In [None]:
# The same per-class row count via agg() and the count() function.
mpg.groupBy('class').agg(count('*')).show()

In [None]:
# Aggregates to compute for every (cyl, class) group; only the first one is
# given an explicit output name.
group_summaries = [
    mean('hwy').alias('avg_highway_mileage'),
    mean('cty'),
    max('displ'),
    min('displ'),
]
mpg.groupBy('cyl', 'class').agg(*group_summaries).show()

In [None]:
# Preview the first five rows before the pivot examples.
mpg.show(5)

In [None]:
# Pattern: groupBy(<row key>).pivot(<column key>).<aggregate>
# Here: one row per class, one column per cyl value, cells hold row counts.
mpg.groupBy('class').pivot('cyl').count().show()

In [None]:
# Same pivot layout, with the mean of hwy as the aggregate.
mpg.groupBy('class').pivot('cyl').agg(mean('hwy')).show()

In [None]:
# Filter on cty, then keep cyl and hwy, then sort by hwy.
mpg.filter(col('cty') > 20).select('cyl', 'hwy').sort('hwy').show()

In [None]:
# Sort, select and filter chained in a different order. The filter refers to
# cty after a select() that lists only cyl and hwy.
# NOTE(review): confirm this resolves on your Spark version.
mpg.sort('hwy').select('cyl', 'hwy').filter(col('cty') > 20).show()

In [None]:
# Row counts per manufacturer, pivoted on drv and sorted by manufacturer.
mpg.groupby('manufacturer').pivot('drv').count().sort('manufacturer').show()

In [None]:
# Mean hwy over a rollup of cyl and class, sorted by both keys.
mpg.rollup('cyl', 'class').agg(mean('hwy')).sort('cyl', 'class').show()

In [14]:
from pydataset import data

In [22]:
# Rebuild `mpg` from pydataset for the missing-value examples.
mpg = spark.createDataFrame(data('mpg'))
mpg.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [None]:
# Note: .toPandas() converts a Spark DataFrame to a pandas DataFrame

In [34]:
# (row count, column count) of mpg.
mpg.count(), len(mpg.columns)

(234, 11)

In [38]:
# NOTE(review): the star import rebinds any Python built-in whose name is also
# exported by pyspark.sql.functions (min and max are imported from it earlier
# in this notebook); prefer `import pyspark.sql.functions as F` in new code.
from pyspark.sql.functions import *

In [44]:
# Row counts per class, pivoted on cyl; combinations with no rows show as null.
aggregated_data = mpg.groupBy('class').pivot('cyl').count()
aggregated_data.show()

+----------+----+----+----+----+
|     class|   4|   5|   6|   8|
+----------+----+----+----+----+
|subcompact|  21|   2|   7|   5|
|   compact|  32|   2|  13|null|
|   minivan|   1|null|  10|null|
|       suv|   8|null|  16|  38|
|   midsize|  16|null|  23|   2|
|    pickup|   3|null|  10|  20|
|   2seater|null|null|null|   5|
+----------+----+----+----+----+



In [47]:
# Drop every row that has a null in any column.
aggregated_data.na.drop().show()

+----------+---+---+---+---+
|     class|  4|  5|  6|  8|
+----------+---+---+---+---+
|subcompact| 21|  2|  7|  5|
+----------+---+---+---+---+



In [48]:
# Drop rows only when the null is in column '6' or '8'; nulls elsewhere stay.
aggregated_data.na.drop(subset=['6', '8']).show()

+----------+---+----+---+---+
|     class|  4|   5|  6|  8|
+----------+---+----+---+---+
|subcompact| 21|   2|  7|  5|
|       suv|  8|null| 16| 38|
|   midsize| 16|null| 23|  2|
|    pickup|  3|null| 10| 20|
+----------+---+----+---+---+



In [51]:
# The original frame is unchanged by the na.drop() calls above.
aggregated_data.show()

+----------+----+----+----+----+
|     class|   4|   5|   6|   8|
+----------+----+----+----+----+
|subcompact|  21|   2|   7|   5|
|   compact|  32|   2|  13|null|
|   minivan|   1|null|  10|null|
|       suv|   8|null|  16|  38|
|   midsize|  16|null|  23|   2|
|    pickup|   3|null|  10|  20|
|   2seater|null|null|null|   5|
+----------+----+----+----+----+



In [53]:
# Replace nulls with 0 in columns '6' and '8' only; other columns keep their nulls.
aggregated_data.na.fill(0, subset=['6', '8']).show()

+----------+----+----+---+---+
|     class|   4|   5|  6|  8|
+----------+----+----+---+---+
|subcompact|  21|   2|  7|  5|
|   compact|  32|   2| 13|  0|
|   minivan|   1|null| 10|  0|
|       suv|   8|null| 16| 38|
|   midsize|  16|null| 23|  2|
|    pickup|   3|null| 10| 20|
|   2seater|null|null|  0|  5|
+----------+----+----+---+---+

