In [None]:
%spark.pyspark

#importing SparkSession
from pyspark.sql import SparkSession

# Configure the builder with the application name, then obtain the session
# (getOrCreate hands back an already-running session when there is one)
session_builder = SparkSession.builder.appName("Wine")
spark = session_builder.getOrCreate()

In [None]:
%spark.pyspark

# import SparkFiles, add file url
from pyspark import SparkFiles

# Register the remote CSV with the SparkContext so a copy is fetched for this job
url = "https://s3.amazonaws.com/dataviz-curriculum/day_1/wine.csv"
spark.sparkContext.addFile(url)

# Read the fetched copy; the first row supplies the column names
wine_csv_path = SparkFiles.get("wine.csv")
wine_df = spark.read.csv(wine_csv_path, header=True, sep=",")

# Display the data
wine_df.show()

+-------+--------------------+--------------------+------+-----+--------------+-----------------+-----------------+------------------+--------------------+
|country|         description|         designation|points|price|      province|         region_1|         region_2|           variety|              winery|
+-------+--------------------+--------------------+------+-----+--------------+-----------------+-----------------+------------------+--------------------+
|     US|This tremendous 1...|   Martha's Vineyard|    96|  235|    California|      Napa Valley|             Napa|Cabernet Sauvignon|               Heitz|
|  Spain|Ripe aromas of fi...|Carodorum Selecci...|    96|  110|Northern Spain|             Toro|             null|     Tinta de Toro|Bodega Carmen Rod...|
|     US|Mac Watson honors...|Special Selected ...|    96|   90|    California|   Knights Valley|           Sonoma|   Sauvignon Blanc|            Macauley|
|     US|This spent 20 mon...|             Reserve|    96|   65|        Oregon|Willamette Valley|Willamette Valley|        Pinot Noir|               Ponzi|
+-------+--------------------+--------------------+------+-----+--------------+-----------------+-----------------+------------------+--------------------+

In [None]:
%spark.pyspark

# Inspect the data types by printing the schema.
# The CSV was read without an explicit schema, and the output below shows
# every column typed as string, including points and price.

wine_df.printSchema()
root
 |-- country: string (nullable = true)
 |-- description: string (nullable = true)
 |-- designation: string (nullable = true)
 |-- points: string (nullable = true)
 |-- price: string (nullable = true)
 |-- province: string (nullable = true)
 |-- region_1: string (nullable = true)
 |-- region_2: string (nullable = true)
 |-- variety: string (nullable = true)
 |-- winery: string (nullable = true)

In [None]:
%spark.pyspark

# Change the data type
# Import struct fields first

from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [None]:
%spark.pyspark

# Create the list of struct fields.
# Spark applies a user-supplied CSV schema to the file's columns by position
# (the header row is not used to match names), so every column of wine.csv is
# listed here in file order. Only points and price are switched to integers;
# the remaining columns stay as strings.
schema = [
    StructField("country", StringType(), True),
    StructField("description", StringType(), True),
    StructField("designation", StringType(), True),
    StructField("points", IntegerType(), True),
    StructField("price", IntegerType(), True),
    StructField("province", StringType(), True),
    StructField("region_1", StringType(), True),
    StructField("region_2", StringType(), True),
    StructField("variety", StringType(), True),
    StructField("winery", StringType(), True),
]
schema

In [None]:
%spark.pyspark

# Wrap the list of fields in a StructType so it can be handed to the reader
final = StructType(schema)
final

StructType(List(StructField(points,IntegerType,true),StructField(price,IntegerType,true)))

In [None]:
%spark.pyspark

# Re-read the same CSV, this time supplying the explicit schema instead of
# letting every column default to string
wine_csv_path = SparkFiles.get("wine.csv")
wine_dataframe = spark.read.csv(wine_csv_path, schema=final, header=True, sep=",")
wine_dataframe

DataFrame[points: int, price: int]

In [None]:
%spark.pyspark

# Print the schema of the frame that was read with the explicit StructType,
# to check which column types it ended up with
wine_dataframe.printSchema()


 |-- points: integer (nullable = true)
 |-- price: integer (nullable = true)

In [None]:
%spark.pyspark

# Order a dataframe by ascending values.
# wine_df is the frame loaded from wine.csv above; its points column is a
# string (see the printed schema), so it is cast to int here to sort
# numerically rather than lexically.
wine_df.orderBy(wine_df["points"].cast("int").asc()).head(5)

In [None]:
%spark.pyspark

# Import functions
from pyspark.sql.functions import avg

# Average score across all wines. points is a string column in wine_df, so it
# is cast to int explicitly; the alias keeps the familiar "avg(points)" header.
wine_df.select(avg(wine_df["points"].cast("int")).alias("avg(points)")).show()

+-----------------+
|      avg(points)|
+-----------------+
|87.88834105383143|
+-----------------+

In [None]:
%spark.pyspark

# Using SQL: filter with a SQL expression string.
# price is a string column in wine_df, so the comparison casts it to INT
# explicitly instead of relying on implicit coercion.
wine_df.filter("CAST(price AS INT) > 200").show(4)

+-------+--------------------+-----------------+------+-----+----------------+--------------------+--------+------------------+------------------+
|country|         description|      designation|points|price|        province|            region_1|region_2|           variety|            winery|
+-------+--------------------+-----------------+------+-----+----------------+--------------------+--------+------------------+------------------+
|     US|This tremendous 1...|Martha's Vineyard|    96|  235|      California|         Napa Valley|    Napa|Cabernet Sauvignon|             Heitz|
|     US|This blockbuster,...|  Rainin Vineyard|    95|  325|      California|Diamond Mountain ...|    Napa|Cabernet Sauvignon|              Hall|
| France|Coming from a sev...|    Le Pigeonnier|    95|  290|Southwest France|              Cahors|    null|            Malbec|Ch̢teau Lagr̩zette|
|  Spain|Tarry blackberry ...|       Termanthia|    95|  220|  Northern Spain|                Toro|    null|     Tinta de Toro|         Numanthia|
+-------+--------------------+-----------------+------+-----+----------------+--------------------+--------+------------------+------------------+

In [None]:
%spark.pyspark

# Filter by points on certain columns.
# points is a string column in wine_df, so it is cast before the comparison;
# only the four columns of interest are kept for display.
wine_df.filter("CAST(points AS INT) > 95").select(['points','country', 'winery','price']).show(4)

# or using python code 
# wine_df.filter(wine_df["points"].cast("int") > 95).show()

+------+-------+--------------------+-----+
|points|country|              winery|price|
+------+-------+--------------------+-----+
|    96|     US|               Heitz|  235|
|    96|  Spain|Bodega Carmen Rod...|  110|
|    96|     US|            Macauley|   90|
|    96|     US|               Ponzi|   65|
+------+-------+--------------------+-----+

In [None]:
%spark.pyspark

# Combine two conditions with | (OR): keep wines that cost less than 200
# or score above 80. Each comparison needs its own parentheses because |
# binds tighter than < and >. Both columns are strings in wine_df, hence the casts.
wine_df.filter(
    (wine_df["price"].cast("int") < 200) | (wine_df["points"].cast("int") > 80)
).show()

+-------+--------------------+--------------------+------+-----+--------------+-----------------+-----------------+------------------+--------------------+
|country|         description|         designation|points|price|      province|         region_1|         region_2|           variety|              winery|
+-------+--------------------+--------------------+------+-----+--------------+-----------------+-----------------+------------------+--------------------+
|     US|This tremendous 1...|   Martha's Vineyard|    96|  235|    California|      Napa Valley|             Napa|Cabernet Sauvignon|               Heitz|
|  Spain|Ripe aromas of fi...|Carodorum Selecci...|    96|  110|Northern Spain|             Toro|             null|     Tinta de Toro|Bodega Carmen Rod...|
|     US|Mac Watson honors...|Special Selected ...|    96|   90|    California|   Knights Valley|           Sonoma|   Sauvignon Blanc|            Macauley|
|     US|This spent 20 mon...|             Reserve|    96|   65|        Oregon|Willamette Valley|Willamette Valley|        Pinot Noir|               Ponzi|
+-------+--------------------+--------------------+------+-----+--------------+-----------------+-----------------+------------------+--------------------+

In [None]:
%spark.pyspark

# Keep only the wines whose country column equals "US" (plain string
# comparison, so no cast is needed) and display the first rows.
wine_df.filter(wine_df["country"] == "US").show()

+-------+--------------------+--------------------+------+-----+----------+------------------+-----------------+------------------+---------+
|country|         description|         designation|points|price|  province|          region_1|         region_2|           variety|   winery|
+-------+--------------------+--------------------+------+-----+----------+------------------+-----------------+------------------+---------+
|     US|This tremendous 1...|   Martha's Vineyard|    96|  235|California|       Napa Valley|             Napa|Cabernet Sauvignon|    Heitz|
|     US|Mac Watson honors...|Special Selected ...|    96|   90|California|    Knights Valley|           Sonoma|   Sauvignon Blanc| Macauley|
|     US|This spent 20 mon...|             Reserve|    96|   65|    Oregon| Willamette Valley|Willamette Valley|        Pinot Noir|    Ponzi|
|     US|This re-named vin...|              Silice|    95|   65|    Oregon|Chehalem Mountains|Willamette Valley|        Pinot Noir|Bergstr̦m|
+-------+--------------------+--------------------+------+-----+----------+------------------+-----------------+------------------+---------+