# Spark 101

All the basics! 

## Create Spark Session

In [None]:
#import spark for python! 

#create the spark session


## Dataframe Basics
 - create dataframe: `spark.createDataFrame`
 - see the results: `.show`
 - look at your data: `.describe`, `.dtypes`, `.printSchema`
 - select & create columns: `.select`, `col`, `expr`

### create dataframe from a pandas dataframe

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Seed NumPy's global RNG so the randomly drawn group labels are reproducible.
np.random.seed(452)

# 20 rows: a running integer `n` and a random label drawn from a/b/c.
pandas_dataframe = pd.DataFrame(
    {
        "n": np.arange(20),
        "group": np.random.choice(list("abc"), size=20),
    }
)

# Last expression in the cell, so the notebook displays the first rows.
pandas_dataframe.head()

In [None]:
#create spark dataframe 


### See the results

<div class='alert alert-box alert-info'>
<b>Note:</b> Spark is lazy and won't display info unless you tell it to
</div>

In [None]:
# Ask Spark to print the rows. `df` must already exist: create it in the
# "create spark dataframe" cell above before running this one.
df.show()

### create dataframe from the pydataset

In [None]:
from pydataset import data

In [None]:
# Load the `mpg` dataset through pydataset's `data` and hand it to Spark.
df = spark.createDataFrame(data('mpg'))
# Bare name as the cell's last line: the notebook echoes `df` itself (no `.show()` call here).
df

### look at your data

In [None]:
# Explicitly ask Spark to print the dataframe's rows.
df.show()

In [None]:
#describe


In [None]:
#describe vertical


In [None]:
#printSchema


In [None]:
#dtypes


In [None]:
#number of rows


In [None]:
#number of columns


### select columns

<div class='alert alert-box alert-info'>
<b>Reminder:</b> Spark is still lazy
</div>

In [None]:
#select two columns


#### save our selected columns

<div class='alert alert-box alert-info'>
<b>Note:</b> You cannot save displayed output: <code>.show()</code> prints the rows and returns <code>None</code>, so assign the dataframe itself
</div>

In [None]:
# df.select('model','year').show()

### create columns
 - use basic math operators
 - change column names: `alias`

In [None]:
# Preview at most 5 rows before creating new columns.
df.show(5)

#### halve the highway mileage

### select & create columns: `col`

In [None]:
from pyspark.sql.functions import col

In [None]:
df.select(
    # TODO: fill in - columns selected or created with `col`
).show(5)

### select & create columns: `expr`

In [None]:
from pyspark.sql.functions import expr

In [None]:
df.select(
    # TODO: fill in - columns selected or created with `expr`
).show(5)

### create column: `withColumn`

## Transforming columns

### built-in functions - math

In [None]:
from pyspark.sql.functions import min, max, sum, count, mean, avg

In [None]:
# use min, max, and calculate average highway mileage
# NOTE: `min`, `max` and `sum` here are the pyspark.sql.functions versions imported
# above, which replace Python's builtins of the same name in this notebook.
df.select(
    # TODO: fill in the aggregate expressions
).show(5)

### built-in functions - strings
- `concat`: to concatenate strings
- `lit`: creates a column holding a literal (constant) value

In [None]:
from pyspark.sql.functions import concat, lit

In [None]:
# combine manufacturer and model together
df.select(
    # TODO: fill in - build the combined string with `concat` (and `lit` for any separator)
).show(5)

In [None]:
df.select(
    # TODO: fill in - presumably another `concat`/`lit` example; the notebook does not state the intent
).show(5)

In [None]:
# combine city and highway together
df.select(
    # TODO: fill in - build the combined value with `concat` (and `lit` for any separator)
).show(5)

### Regex! 

- `regexp_extract`: use regex to extract data
- `regexp_replace`: use regex to replace data

In [None]:
from pyspark.sql.functions import regexp_extract, regexp_replace

In [None]:
# Small single-column dataframe of street addresses for the regex examples.
# The last address deliberately carries stray punctuation ("!!!!", "-", "@").
textdf = spark.createDataFrame(
    pd.DataFrame(
        dict(
            address=[
                "600 Navarro St ste 600, San Antonio, TX 78205",
                "3130 Broadway St, San Antonio, TX 78209",
                "303 Pearl Pkwy, San Antonio, TX 78215",
                "1255 SW Loop 410!!!!, San - Antonio, TX @78227@",
            ],
        )
    )
)

# truncate=False so the full address strings are printed.
textdf.show(truncate=False)

In [None]:
textdf.select(
    # TODO: fill in - apply `regexp_extract` / `regexp_replace` to the `address` column
).show()

### Filter and Where

In [None]:
# Rebuild `df` from the `mpg` dataset for the filter/where examples.
df = spark.createDataFrame(data('mpg'))
# Bare name as the cell's last line: the notebook echoes `df` itself (no `.show()` call here).
df

In [None]:
# Preview at most 5 rows of the freshly rebuilt dataframe.
df.show(5)

### When and Otherwise

In [None]:
from pyspark.sql.functions import when

In [None]:
df.select(
    # TODO: fill in - a conditional column built with `when(...)` and `.otherwise(...)`
).show(5)

### sorting and ordering

### Grouping and Aggregating

In [None]:
#groupby/groupBy


In [None]:
#rollup


### Crosstabs and Pivot Tables

In [None]:
#crosstab


In [None]:
#groupby and pivot


### Handling Missing Data

- `.na.fill`: to replace missing values with a specified value
- `.na.drop`: to drop rows containing missing values

In [None]:
# Two numeric columns with NaN gaps, used to demonstrate `.na.fill` / `.na.drop`.
# The last row is missing in both columns; rows 0 and 2 are missing in one column each.
df = spark.createDataFrame(
    pd.DataFrame(
        dict(
            x=[1, 2, np.nan, 4, 5, np.nan],
            y=[np.nan, 0, 0, 3, 1, np.nan],
        )
    )
)
df.show()

### More Dataframe Manipulation Examples

In [None]:
from vega_datasets import data

# Load the Seattle weather sample and store `date` as plain strings
# before handing the pandas frame to Spark.
weather = data.seattle_weather()
weather = weather.assign(date=weather.date.astype(str))

df = spark.createDataFrame(weather)
df.show(6)

#### shape of df

#### start and end date

#### Find the total rainfall per month

In [None]:
from pyspark.sql.functions import month, year, quarter

### Joins

In [None]:
# Total precipitation per calendar month: rows are grouped by month number only,
# so the same month from different years is summed together.
#
# The Spark functions are referenced through the `F` namespace instead of the bare
# names `sum` / `month`. The bare `sum` only works if an earlier cell has replaced
# Python's builtin `sum` with the PySpark one; builtin `sum("precipitation")` would raise.
from pyspark.sql import functions as F

(
    df.withColumn("month", F.month("date"))  # derive the month number from `date`
    .groupBy("month")
    .agg(F.sum("precipitation").alias("total_rainfall"))
    .sort("month")
    .show()
)

In [None]:
# Example tables for the join demos.
# `users.role_id` refers to `roles.id`; jane and mike have NaN as their role_id,
# and the "commenter" role (id 4) is not assigned to any user.
users = spark.createDataFrame(
    pd.DataFrame(
        dict(
            id=[1, 2, 3, 4, 5, 6],
            name=["bob", "joe", "sally", "adam", "jane", "mike"],
            role_id=[1, 2, 3, 3, np.nan, np.nan],
        )
    )
)
roles = spark.createDataFrame(
    pd.DataFrame(
        dict(
            id=[1, 2, 3, 4],
            name=["admin", "author", "reviewer", "commenter"],
        )
    )
)

# Print both tables, each under a labelled banner.
print("--- users ---")
users.show()

print("--- roles ---")
roles.show()

In [None]:
# Join users to roles where users.role_id equals roles.id. No `how` is passed,
# so Spark's default (inner) join applies: users without a matching role are left out.
users.join(roles, on=users.role_id == roles.id).show()

In [None]:
# Same join condition, but as a left join: every row of `users` is kept,
# with the role columns left empty where no role matches.
users.join(roles, on=users.role_id == roles.id, how="left").show()